From 9904cfc7c55a81f648d30c9aadf5c3e1c8669ccb Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sun, 23 Jan 2022 21:59:06 -0600 Subject: [PATCH 01/63] FEAT-#4035: Upgrade pandas support to 1.4 Signed-off-by: Devin Petersohn --- modin/core/io/io.py | 4 +- modin/core/io/text/fwf_dispatcher.py | 3 +- modin/core/io/text/text_file_dispatcher.py | 5 +- modin/pandas/__init__.py | 129 +++++++++++---------- modin/pandas/base.py | 120 ++++++++++++++----- modin/pandas/dataframe.py | 41 ++++--- modin/pandas/general.py | 9 +- modin/pandas/groupby.py | 2 +- modin/pandas/io.py | 88 +++++++------- modin/pandas/series.py | 66 +++++++++-- modin/pandas/test/test_groupby.py | 7 +- setup.cfg | 2 +- 12 files changed, 292 insertions(+), 184 deletions(-) diff --git a/modin/core/io/io.py b/modin/core/io/io.py index 10b61d88032..813a094a002 100644 --- a/modin/core/io/io.py +++ b/modin/core/io/io.py @@ -23,7 +23,7 @@ import pandas import pandas._libs.lib as lib -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from pandas.util._decorators import doc from modin.db_conn import ModinDatabaseConnection @@ -826,7 +826,7 @@ def to_sql( def to_pickle( cls, obj: Any, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, diff --git a/modin/core/io/text/fwf_dispatcher.py b/modin/core/io/text/fwf_dispatcher.py index 63440776cfa..388ae096e8e 100644 --- a/modin/core/io/text/fwf_dispatcher.py +++ b/modin/core/io/text/fwf_dispatcher.py @@ -14,7 +14,6 @@ """Module houses `FWFDispatcher` class, that is used for reading of tables with fixed-width formatted lines.""" import pandas -from pandas._typing import FilePathOrBuffer from modin.core.io.text.text_file_dispatcher import TextFileDispatcher @@ -27,7 +26,7 @@ class FWFDispatcher(TextFileDispatcher): @classmethod def check_parameters_support( cls, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, read_kwargs: dict, ): """ diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index e8620e07abf..97db296fced 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -27,7 +27,6 @@ import numpy as np import pandas import pandas._libs.lib as lib -from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.common import is_list_like from modin.core.io.file_dispatcher import FileDispatcher, OpenFile @@ -614,7 +613,7 @@ def _launch_tasks(cls, splits: list, **partition_kwargs) -> Tuple[list, list, li @classmethod def check_parameters_support( cls, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, read_kwargs: dict, ) -> bool: """ @@ -912,7 +911,7 @@ def _get_new_qc( return new_query_compiler @classmethod - def _read(cls, filepath_or_buffer: FilePathOrBuffer, **kwargs): + def _read(cls, filepath_or_buffer, **kwargs): """ Read data from `filepath_or_buffer` according to `kwargs` parameters. diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index d6be7f69cb7..d45498c7369 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -12,12 +12,11 @@ # governing permissions and limitations under the License. import pandas +import warnings __pandas_version__ = "1.3.5" if pandas.__version__ != __pandas_version__: - import warnings - warnings.warn( "The pandas version installed {} does not match the supported pandas version in" " Modin {}. This may cause undesired side effects!".format( @@ -25,68 +24,70 @@ ) ) -from pandas import ( - eval, - cut, - factorize, - test, - qcut, - date_range, - period_range, - Index, - MultiIndex, - CategoricalIndex, - bdate_range, - DatetimeIndex, - Timedelta, - Timestamp, - to_timedelta, - set_eng_float_format, - options, - Flags, - set_option, - NaT, - PeriodIndex, - Categorical, - Interval, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, - SparseDtype, - Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - Float32Dtype, - Float64Dtype, - StringDtype, - BooleanDtype, - CategoricalDtype, - DatetimeTZDtype, - IntervalDtype, - PeriodDtype, - RangeIndex, - Int64Index, - UInt64Index, - Float64Index, - TimedeltaIndex, - IntervalIndex, - IndexSlice, - Grouper, - array, - Period, - show_versions, - DateOffset, - timedelta_range, - infer_freq, - interval_range, - ExcelWriter, - datetime, - NamedAgg, - NA, - api, -) +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from pandas import ( + eval, + cut, + factorize, + test, + qcut, + date_range, + period_range, + Index, + MultiIndex, + CategoricalIndex, + bdate_range, + DatetimeIndex, + Timedelta, + Timestamp, + to_timedelta, + set_eng_float_format, + options, + Flags, + set_option, + NaT, + PeriodIndex, + Categorical, + Interval, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + SparseDtype, + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + Float32Dtype, + Float64Dtype, + StringDtype, + BooleanDtype, + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, + RangeIndex, + Int64Index, + UInt64Index, + Float64Index, + TimedeltaIndex, + IntervalIndex, + IndexSlice, + Grouper, + array, + Period, + show_versions, + DateOffset, + timedelta_range, + infer_freq, + interval_range, + ExcelWriter, + datetime, + NamedAgg, + NA, + api, + ) import os import multiprocessing diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 5736a28310b..67d83abfa8a 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -35,7 +35,6 @@ from pandas._typing import ( CompressionOptions, IndexKeyFunc, - FilePathOrBuffer, StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -932,7 +931,13 @@ def at_time(self, time, asof=False, axis=None): return self.loc[indexer] if axis == 0 else self.loc[:, indexer] def between_time( - self, start_time, end_time, include_start=True, include_end=True, axis=None + self: "BasePandasDataset", + start_time, + end_time, + include_start: "bool_t | lib.NoDefault" = no_default, + include_end: "bool_t | lib.NoDefault" = no_default, + inclusive: "str | None" = None, + axis=None, ): axis = self._get_axis_number(axis) idx = self.index if axis == 0 else self.columns @@ -943,6 +948,7 @@ def between_time( end_time, include_start=include_start, include_end=include_end, + inclusive=inclusive, ) .index ) @@ -1280,16 +1286,17 @@ def explode(self, column, ignore_index: bool = False): def ewm( self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - times=None, - ): + com: "float | None" = None, + span: "float | None" = None, + halflife: "float | TimedeltaConvertibleTypes | None" = None, + alpha: "float | None" = None, + min_periods: "int | None" = 0, + adjust: "bool_t" = True, + ignore_na: "bool_t" = False, + axis: "Axis" = 0, + times: "str | np.ndarray | BasePandasDataset | None" = None, + method: "str" = "single", + ) -> "ExponentialMovingWindow": return self._default_to_pandas( "ewm", com=com, @@ -1301,6 +1308,7 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, + method=method, ) def expanding(self, min_periods=1, center=None, axis=0, method="single"): @@ -1528,7 +1536,14 @@ def iloc(self): return _iLocIndexer(self) - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def kurt( + self, + axis: "Axis | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): axis = self._get_axis_number(axis) if skipna is None: skipna = True @@ -1582,7 +1597,7 @@ def loc(self): return _LocIndexer(self) - def mad(self, axis=None, skipna=None, level=None): + def mad(self, axis=None, skipna=True, level=None): axis = self._get_axis_number(axis) if skipna is None: skipna = True @@ -1621,7 +1636,14 @@ def mask( try_cast=try_cast, ) - def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def max( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): if skipna is None: skipna = True if level is not None: @@ -1721,10 +1743,24 @@ def _stat_operation( ) return self._reduce_dimension(result_qc) - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def mean( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation("mean", axis, skipna, level, numeric_only, **kwargs) - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def median( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation( "median", axis, skipna, level, numeric_only, **kwargs ) @@ -1734,7 +1770,14 @@ def memory_usage(self, index=True, deep=False): self._query_compiler.memory_usage(index=index, deep=deep) ) - def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def min( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): if skipna is None: skipna = True if level is not None: @@ -1873,13 +1916,13 @@ def check_dtype(t): return result def rank( - self, + self: "BasePandasDataset", axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, + method: "str" = "average", + numeric_only: "bool_t | None | lib.NoDefault" = no_default, + na_option: "str" = "keep", + ascending: "bool_t" = True, + pct: "bool_t" = False, ): axis = self._get_axis_number(axis) return self.__constructor__( @@ -2258,7 +2301,13 @@ def sample( return self.__constructor__(query_compiler=query_compiler) def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + **kwargs, ): return self._stat_operation( "sem", axis, skipna, level, numeric_only, ddof=ddof, **kwargs @@ -2377,7 +2426,14 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=no_default): else: return self.tshift(periods, freq) - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def skew( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation("skew", axis, skipna, level, numeric_only, **kwargs) def sort_index( @@ -2446,7 +2502,13 @@ def sort_values( return self._create_or_update_from_compiler(result, inplace) def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + **kwargs, ): return self._stat_operation( "std", axis, skipna, level, numeric_only, ddof=ddof, **kwargs @@ -2703,7 +2765,7 @@ def to_period(self, freq=None, axis=0, copy=True): # pragma: no cover def to_pickle( self, - path: FilePathOrBuffer, + path, compression: CompressionOptions = "infer", protocol: int = pkl.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, @@ -2909,7 +2971,7 @@ def value_counts( return counted_values def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs ): return self._stat_operation( "var", axis, skipna, level, numeric_only, ddof=ddof, **kwargs diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 039d55bf73b..a5591afb0e3 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -31,7 +31,7 @@ import functools import numpy as np import sys -from typing import IO, Optional, Union, Mapping, Iterator +from typing import IO, Optional, Union, Iterator import warnings from modin.error_message import ErrorMessage @@ -1614,7 +1614,7 @@ def pow( def prod( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1770,11 +1770,11 @@ def rename( def replace( self, to_replace=None, - value=None, - inplace=False, + value=no_default, + inplace: "bool" = False, limit=None, - regex=False, - method="pad", + regex: "bool" = False, + method: "str | lib.NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -2009,7 +2009,7 @@ def sub( def sum( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -2210,17 +2210,19 @@ def to_records( def to_stata( self, - path, - convert_dates=None, - write_index=True, - byteorder=None, - time_stamp=None, - data_label=None, - variable_labels=None, - version=114, - convert_strl=None, - compression: Union[str, Mapping[str, str], None] = "infer", - storage_options: StorageOptions = None, + path: "FilePath | WriteBuffer[bytes]", + convert_dates: "dict[Hashable, str] | None" = None, + write_index: "bool" = True, + byteorder: "str | None" = None, + time_stamp: "datetime.datetime | None" = None, + data_label: "str | None" = None, + variable_labels: "dict[Hashable, str] | None" = None, + version: "int | None" = 114, + convert_strl: "Sequence[Hashable] | None" = None, + compression: "CompressionOptions" = "infer", + storage_options: "StorageOptions" = None, + *, + value_labels: "dict[Hashable, dict[float | int, str]] | None" = None, ): # pragma: no cover # noqa: PR01, RT01, D200 """ Export ``DataFrame`` object to Stata data format. @@ -2238,6 +2240,7 @@ def to_stata( convert_strl=convert_strl, compression=compression, storage_options=storage_options, + value_labels=value_labels, ) def to_timestamp( @@ -2328,7 +2331,7 @@ def update( def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 892b21b364d..dc8a8996f31 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -16,8 +16,7 @@ import pandas import numpy as np -from typing import Hashable, Iterable, Mapping, Optional, Union -from pandas._typing import FrameOrSeriesUnion +from typing import Hashable, Iterable, Mapping, Union from pandas.core.dtypes.common import is_list_like from modin.error_message import ErrorMessage @@ -359,9 +358,7 @@ def value_counts( @_inherit_docstrings(pandas.concat) def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", axis=0, join="outer", ignore_index: bool = False, @@ -371,7 +368,7 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: bool = True, -) -> FrameOrSeriesUnion: +) -> "DataFrame | Series": if isinstance(objs, (pandas.Series, Series, DataFrame, str, pandas.DataFrame)): raise TypeError( "first argument must be an iterable of pandas " diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 0e60fe4c514..391d5179019 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -17,7 +17,7 @@ import pandas import pandas.core.groupby from pandas.core.dtypes.common import is_list_like, is_numeric_dtype -from pandas.core.aggregation import reconstruct_func +from pandas.core.apply import reconstruct_func from pandas._libs.lib import no_default import pandas.core.common as com from types import BuiltinFunctionType diff --git a/modin/pandas/io.py b/modin/pandas/io.py index c5941d99064..7e197a98e69 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -26,7 +26,7 @@ import pathlib import re from collections import OrderedDict -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from typing import Union, IO, AnyStr, Sequence, Dict, List, Optional, Any from modin.error_message import ErrorMessage @@ -73,18 +73,18 @@ def _read(**kwargs): @_inherit_docstrings(pandas.read_csv) def read_csv( - filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]], + filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]", sep=lib.no_default, delimiter=None, header="infer", names=lib.no_default, index_col=None, usecols=None, - squeeze=False, + squeeze=None, prefix=lib.no_default, mangle_dupe_cols=True, - dtype=None, - engine=None, + dtype: "DtypeArg | None" = None, + engine: "CSVEngine | None" = None, converters=None, true_values=None, false_values=None, @@ -96,7 +96,7 @@ def read_csv( na_filter=True, verbose=False, skip_blank_lines=True, - parse_dates=False, + parse_dates=None, infer_datetime_format=False, keep_date_col=False, date_parser=None, @@ -104,16 +104,16 @@ def read_csv( cache_dates=True, iterator=False, chunksize=None, - compression="infer", + compression: "CompressionOptions" = "infer", thousands=None, - decimal: str = ".", + decimal: "str" = ".", lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, - encoding_errors="strict", + encoding_errors: "str | None" = "strict", dialect=None, error_bad_lines=None, warn_bad_lines=None, @@ -124,7 +124,7 @@ def read_csv( low_memory=True, memory_map=False, float_precision=None, - storage_options: StorageOptions = None, + storage_options: "StorageOptions" = None, ): # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { @@ -137,23 +137,24 @@ def read_csv( @_inherit_docstrings(pandas.read_table) def read_table( - filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]], + filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]", sep=lib.no_default, delimiter=None, header="infer", names=lib.no_default, index_col=None, usecols=None, - squeeze=False, + squeeze=None, prefix=lib.no_default, mangle_dupe_cols=True, - dtype=None, - engine=None, + dtype: "DtypeArg | None" = None, + engine: "CSVEngine | None" = None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, + skipfooter=0, nrows=None, na_values=None, keep_default_na=True, @@ -168,26 +169,26 @@ def read_table( cache_dates=True, iterator=False, chunksize=None, - compression="infer", + compression: "CompressionOptions" = "infer", thousands=None, - decimal: str = ".", + decimal: "str" = ".", lineterminator=None, quotechar='"', quoting=0, + doublequote=True, escapechar=None, comment=None, encoding=None, - encoding_errors="strict", + encoding_errors: "str | None" = "strict", dialect=None, error_bad_lines=None, warn_bad_lines=None, on_bad_lines=None, - skipfooter=0, - doublequote=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, + storage_options: "StorageOptions" = None, ): # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { @@ -317,32 +318,33 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover @_inherit_docstrings(pandas.read_excel) def read_excel( io, - sheet_name=0, - header=0, + sheet_name: "str | int | list[IntStrT] | None" = 0, + header: "int | Sequence[int] | None" = 0, names=None, - index_col=None, + index_col: "int | Sequence[int] | None" = None, usecols=None, - squeeze=False, - dtype=None, - engine=None, + squeeze: "bool | None" = None, + dtype: "DtypeArg | None" = None, + engine: "Literal[('xlrd', 'openpyxl', 'odf', 'pyxlsb')] | None" = None, converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, + true_values: "Iterable[Hashable] | None" = None, + false_values: "Iterable[Hashable] | None" = None, + skiprows: "Sequence[int] | int | Callable[[int], object] | None" = None, + nrows: "int | None" = None, na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, + keep_default_na: "bool" = True, + na_filter: "bool" = True, + verbose: "bool" = False, parse_dates=False, date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=None, - mangle_dupe_cols=True, - storage_options: StorageOptions = None, -): + thousands: "str | None" = None, + decimal: "str" = ".", + comment: "str | None" = None, + skipfooter: "int" = 0, + convert_float: "bool | None" = None, + mangle_dupe_cols: "bool" = True, + storage_options: "StorageOptions" = None, +) -> "DataFrame | dict[IntStrT, DataFrame]": _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -438,7 +440,7 @@ def read_sas( @_inherit_docstrings(pandas.read_pickle) def read_pickle( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: Optional[str] = "infer", storage_options: StorageOptions = None, ): @@ -557,7 +559,7 @@ def read_spss( @_inherit_docstrings(pandas.to_pickle) def to_pickle( obj: Any, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, @@ -597,9 +599,7 @@ def json_normalize( @_inherit_docstrings(pandas.read_orc) -def read_orc( - path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs -) -> DataFrame: +def read_orc(path, columns: Optional[List[str]] = None, **kwargs) -> DataFrame: ErrorMessage.default_to_pandas("read_orc") Engine.subscribe(_update_engine) return DataFrame(pandas.read_orc(path, columns, **kwargs)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index f6a868d8281..49450900946 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1161,6 +1161,23 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D20 skipna = True return super(Series, self).idxmin(axis=axis, skipna=skipna, *args, **kwargs) + def info( + self, + verbose: "bool | None" = None, + buf: "IO[str] | None" = None, + max_cols: "int | None" = None, + memory_usage: "bool | str | None" = None, + show_counts: "bool" = True, + ): + return self._default_to_pandas( + pandas.Series.info, + verbose=verbose, + buf=buf, + max_cols=max_cols, + memory_usage=memory_usage, + show_counts=show_counts, + ) + def interpolate( self, method="linear", @@ -1218,7 +1235,12 @@ def keys(self): # noqa: RT01, D200 return self.index def kurt( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, + axis: "Axis | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): # noqa: PR01, RT01, D200 """ Return unbiased kurtosis over requested axis. @@ -1262,6 +1284,27 @@ def arg(s): ) ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors=no_default, + try_cast=no_default, + ): + return self._default_to_pandas( + pandas.Series.mask, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + ) + def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200 """ Return the memory usage of the Series. @@ -1409,7 +1452,7 @@ def pow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, def prod( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1476,10 +1519,11 @@ def ravel(self, order="C"): # noqa: PR01, RT01, D200 return data - def reindex(self, index=None, **kwargs): # noqa: PR01, RT01, D200 + def reindex(self, *args, **kwargs): # noqa: PR01, RT01, D200 """ Conform Series to new index with optional filling logic. """ + index = kwargs.pop("index", None) method = kwargs.pop("method", None) level = kwargs.pop("level", None) copy = kwargs.pop("copy", True) @@ -1543,7 +1587,7 @@ def repeat(self, repeats, axis=None): # noqa: PR01, RT01, D200 return self.__constructor__(query_compiler=self._query_compiler.repeat(repeats)) def reset_index( - self, level=None, drop=False, name=None, inplace=False + self, level=None, drop=False, name=no_default, inplace=False ): # noqa: PR01, RT01, D200 """ Generate a new Series with the index reset. @@ -1653,11 +1697,11 @@ def reorder_levels(self, order): # noqa: PR01, RT01, D200 def replace( self, to_replace=None, - value=None, + value=no_default, inplace=False, limit=None, regex=False, - method="pad", + method: "str | lib.NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -1770,7 +1814,7 @@ def sub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, def sum( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1843,7 +1887,9 @@ def to_dict(self, into=dict): # pragma: no cover # noqa: PR01, RT01, D200 """ return self._default_to_pandas("to_dict", into=into) - def to_frame(self, name=None): # noqa: PR01, RT01, D200 + def to_frame( + self, name: "Hashable" = no_default + ) -> "DataFrame": # noqa: PR01, RT01, D200 """ Convert Series to {label -> value} dict or dict-like object. """ @@ -2011,11 +2057,11 @@ def view(self, dtype=None): # noqa: PR01, RT01, D200 def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, - errors="raise", + errors=no_default, try_cast=no_default, ): # noqa: PR01, RT01, D200 """ diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 7f44cf83336..770dfeeea1c 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -1397,9 +1397,10 @@ def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna): modin_df = modin_df.T pandas_df = pandas_df.T - md_grp, pd_grp = modin_df.groupby( - **groupby_kwargs, dropna=dropna - ), pandas_df.groupby(**groupby_kwargs, dropna=dropna) + md_grp, pd_grp = ( + modin_df.groupby(**groupby_kwargs, dropna=dropna), + pandas_df.groupby(**groupby_kwargs, dropna=dropna), + ) modin_groupby_equals_pandas(md_grp, pd_grp) by_kwarg = groupby_kwargs.get("by", []) diff --git a/setup.cfg b/setup.cfg index 4b68525c140..cef307bd269 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,7 @@ filterwarnings = [flake8] max-line-length = 88 -ignore = E203, E266, E501, W503 +ignore = E203, E266, E501, W503, F821 select = B,C,E,F,W,T4,B9 per-file-ignores = modin/pandas/__init__.py:E402,F401 From c9270b76b2b49528fcba9484e9655db065737b96 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sun, 23 Jan 2022 21:59:06 -0600 Subject: [PATCH 02/63] FEAT-#4035: Upgrade pandas support to 1.4 Signed-off-by: Devin Petersohn --- modin/core/io/io.py | 4 +- modin/core/io/text/fwf_dispatcher.py | 3 +- modin/core/io/text/text_file_dispatcher.py | 5 +- modin/pandas/__init__.py | 129 +++++++++++---------- modin/pandas/base.py | 120 ++++++++++++++----- modin/pandas/dataframe.py | 41 ++++--- modin/pandas/general.py | 9 +- modin/pandas/groupby.py | 2 +- modin/pandas/io.py | 88 +++++++------- modin/pandas/series.py | 66 +++++++++-- modin/pandas/test/test_groupby.py | 7 +- setup.cfg | 2 +- 12 files changed, 292 insertions(+), 184 deletions(-) diff --git a/modin/core/io/io.py b/modin/core/io/io.py index 10b61d88032..813a094a002 100644 --- a/modin/core/io/io.py +++ b/modin/core/io/io.py @@ -23,7 +23,7 @@ import pandas import pandas._libs.lib as lib -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from pandas.util._decorators import doc from modin.db_conn import ModinDatabaseConnection @@ -826,7 +826,7 @@ def to_sql( def to_pickle( cls, obj: Any, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, diff --git a/modin/core/io/text/fwf_dispatcher.py b/modin/core/io/text/fwf_dispatcher.py index 63440776cfa..388ae096e8e 100644 --- a/modin/core/io/text/fwf_dispatcher.py +++ b/modin/core/io/text/fwf_dispatcher.py @@ -14,7 +14,6 @@ """Module houses `FWFDispatcher` class, that is used for reading of tables with fixed-width formatted lines.""" import pandas -from pandas._typing import FilePathOrBuffer from modin.core.io.text.text_file_dispatcher import TextFileDispatcher @@ -27,7 +26,7 @@ class FWFDispatcher(TextFileDispatcher): @classmethod def check_parameters_support( cls, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, read_kwargs: dict, ): """ diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index e8620e07abf..97db296fced 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -27,7 +27,6 @@ import numpy as np import pandas import pandas._libs.lib as lib -from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.common import is_list_like from modin.core.io.file_dispatcher import FileDispatcher, OpenFile @@ -614,7 +613,7 @@ def _launch_tasks(cls, splits: list, **partition_kwargs) -> Tuple[list, list, li @classmethod def check_parameters_support( cls, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, read_kwargs: dict, ) -> bool: """ @@ -912,7 +911,7 @@ def _get_new_qc( return new_query_compiler @classmethod - def _read(cls, filepath_or_buffer: FilePathOrBuffer, **kwargs): + def _read(cls, filepath_or_buffer, **kwargs): """ Read data from `filepath_or_buffer` according to `kwargs` parameters. diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index d6be7f69cb7..d45498c7369 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -12,12 +12,11 @@ # governing permissions and limitations under the License. import pandas +import warnings __pandas_version__ = "1.3.5" if pandas.__version__ != __pandas_version__: - import warnings - warnings.warn( "The pandas version installed {} does not match the supported pandas version in" " Modin {}. This may cause undesired side effects!".format( @@ -25,68 +24,70 @@ ) ) -from pandas import ( - eval, - cut, - factorize, - test, - qcut, - date_range, - period_range, - Index, - MultiIndex, - CategoricalIndex, - bdate_range, - DatetimeIndex, - Timedelta, - Timestamp, - to_timedelta, - set_eng_float_format, - options, - Flags, - set_option, - NaT, - PeriodIndex, - Categorical, - Interval, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, - SparseDtype, - Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - Float32Dtype, - Float64Dtype, - StringDtype, - BooleanDtype, - CategoricalDtype, - DatetimeTZDtype, - IntervalDtype, - PeriodDtype, - RangeIndex, - Int64Index, - UInt64Index, - Float64Index, - TimedeltaIndex, - IntervalIndex, - IndexSlice, - Grouper, - array, - Period, - show_versions, - DateOffset, - timedelta_range, - infer_freq, - interval_range, - ExcelWriter, - datetime, - NamedAgg, - NA, - api, -) +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from pandas import ( + eval, + cut, + factorize, + test, + qcut, + date_range, + period_range, + Index, + MultiIndex, + CategoricalIndex, + bdate_range, + DatetimeIndex, + Timedelta, + Timestamp, + to_timedelta, + set_eng_float_format, + options, + Flags, + set_option, + NaT, + PeriodIndex, + Categorical, + Interval, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + SparseDtype, + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + Float32Dtype, + Float64Dtype, + StringDtype, + BooleanDtype, + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, + RangeIndex, + Int64Index, + UInt64Index, + Float64Index, + TimedeltaIndex, + IntervalIndex, + IndexSlice, + Grouper, + array, + Period, + show_versions, + DateOffset, + timedelta_range, + infer_freq, + interval_range, + ExcelWriter, + datetime, + NamedAgg, + NA, + api, + ) import os import multiprocessing diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 5736a28310b..67d83abfa8a 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -35,7 +35,6 @@ from pandas._typing import ( CompressionOptions, IndexKeyFunc, - FilePathOrBuffer, StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -932,7 +931,13 @@ def at_time(self, time, asof=False, axis=None): return self.loc[indexer] if axis == 0 else self.loc[:, indexer] def between_time( - self, start_time, end_time, include_start=True, include_end=True, axis=None + self: "BasePandasDataset", + start_time, + end_time, + include_start: "bool_t | lib.NoDefault" = no_default, + include_end: "bool_t | lib.NoDefault" = no_default, + inclusive: "str | None" = None, + axis=None, ): axis = self._get_axis_number(axis) idx = self.index if axis == 0 else self.columns @@ -943,6 +948,7 @@ def between_time( end_time, include_start=include_start, include_end=include_end, + inclusive=inclusive, ) .index ) @@ -1280,16 +1286,17 @@ def explode(self, column, ignore_index: bool = False): def ewm( self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - times=None, - ): + com: "float | None" = None, + span: "float | None" = None, + halflife: "float | TimedeltaConvertibleTypes | None" = None, + alpha: "float | None" = None, + min_periods: "int | None" = 0, + adjust: "bool_t" = True, + ignore_na: "bool_t" = False, + axis: "Axis" = 0, + times: "str | np.ndarray | BasePandasDataset | None" = None, + method: "str" = "single", + ) -> "ExponentialMovingWindow": return self._default_to_pandas( "ewm", com=com, @@ -1301,6 +1308,7 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, + method=method, ) def expanding(self, min_periods=1, center=None, axis=0, method="single"): @@ -1528,7 +1536,14 @@ def iloc(self): return _iLocIndexer(self) - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def kurt( + self, + axis: "Axis | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): axis = self._get_axis_number(axis) if skipna is None: skipna = True @@ -1582,7 +1597,7 @@ def loc(self): return _LocIndexer(self) - def mad(self, axis=None, skipna=None, level=None): + def mad(self, axis=None, skipna=True, level=None): axis = self._get_axis_number(axis) if skipna is None: skipna = True @@ -1621,7 +1636,14 @@ def mask( try_cast=try_cast, ) - def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def max( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): if skipna is None: skipna = True if level is not None: @@ -1721,10 +1743,24 @@ def _stat_operation( ) return self._reduce_dimension(result_qc) - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def mean( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation("mean", axis, skipna, level, numeric_only, **kwargs) - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def median( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation( "median", axis, skipna, level, numeric_only, **kwargs ) @@ -1734,7 +1770,14 @@ def memory_usage(self, index=True, deep=False): self._query_compiler.memory_usage(index=index, deep=deep) ) - def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def min( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): if skipna is None: skipna = True if level is not None: @@ -1873,13 +1916,13 @@ def check_dtype(t): return result def rank( - self, + self: "BasePandasDataset", axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, + method: "str" = "average", + numeric_only: "bool_t | None | lib.NoDefault" = no_default, + na_option: "str" = "keep", + ascending: "bool_t" = True, + pct: "bool_t" = False, ): axis = self._get_axis_number(axis) return self.__constructor__( @@ -2258,7 +2301,13 @@ def sample( return self.__constructor__(query_compiler=query_compiler) def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + **kwargs, ): return self._stat_operation( "sem", axis, skipna, level, numeric_only, ddof=ddof, **kwargs @@ -2377,7 +2426,14 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=no_default): else: return self.tshift(periods, freq) - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def skew( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation("skew", axis, skipna, level, numeric_only, **kwargs) def sort_index( @@ -2446,7 +2502,13 @@ def sort_values( return self._create_or_update_from_compiler(result, inplace) def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + **kwargs, ): return self._stat_operation( "std", axis, skipna, level, numeric_only, ddof=ddof, **kwargs @@ -2703,7 +2765,7 @@ def to_period(self, freq=None, axis=0, copy=True): # pragma: no cover def to_pickle( self, - path: FilePathOrBuffer, + path, compression: CompressionOptions = "infer", protocol: int = pkl.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, @@ -2909,7 +2971,7 @@ def value_counts( return counted_values def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs ): return self._stat_operation( "var", axis, skipna, level, numeric_only, ddof=ddof, **kwargs diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index b81cf54cfe5..df4f3d4c3d9 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -31,7 +31,7 @@ import functools import numpy as np import sys -from typing import IO, Optional, Union, Mapping, Iterator +from typing import IO, Optional, Union, Iterator import warnings from modin.pandas import Categorical @@ -1615,7 +1615,7 @@ def pow( def prod( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1771,11 +1771,11 @@ def rename( def replace( self, to_replace=None, - value=None, - inplace=False, + value=no_default, + inplace: "bool" = False, limit=None, - regex=False, - method="pad", + regex: "bool" = False, + method: "str | lib.NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -2010,7 +2010,7 @@ def sub( def sum( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -2211,17 +2211,19 @@ def to_records( def to_stata( self, - path, - convert_dates=None, - write_index=True, - byteorder=None, - time_stamp=None, - data_label=None, - variable_labels=None, - version=114, - convert_strl=None, - compression: Union[str, Mapping[str, str], None] = "infer", - storage_options: StorageOptions = None, + path: "FilePath | WriteBuffer[bytes]", + convert_dates: "dict[Hashable, str] | None" = None, + write_index: "bool" = True, + byteorder: "str | None" = None, + time_stamp: "datetime.datetime | None" = None, + data_label: "str | None" = None, + variable_labels: "dict[Hashable, str] | None" = None, + version: "int | None" = 114, + convert_strl: "Sequence[Hashable] | None" = None, + compression: "CompressionOptions" = "infer", + storage_options: "StorageOptions" = None, + *, + value_labels: "dict[Hashable, dict[float | int, str]] | None" = None, ): # pragma: no cover # noqa: PR01, RT01, D200 """ Export ``DataFrame`` object to Stata data format. @@ -2239,6 +2241,7 @@ def to_stata( convert_strl=convert_strl, compression=compression, storage_options=storage_options, + value_labels=value_labels, ) def to_timestamp( @@ -2329,7 +2332,7 @@ def update( def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 892b21b364d..dc8a8996f31 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -16,8 +16,7 @@ import pandas import numpy as np -from typing import Hashable, Iterable, Mapping, Optional, Union -from pandas._typing import FrameOrSeriesUnion +from typing import Hashable, Iterable, Mapping, Union from pandas.core.dtypes.common import is_list_like from modin.error_message import ErrorMessage @@ -359,9 +358,7 @@ def value_counts( @_inherit_docstrings(pandas.concat) def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", axis=0, join="outer", ignore_index: bool = False, @@ -371,7 +368,7 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: bool = True, -) -> FrameOrSeriesUnion: +) -> "DataFrame | Series": if isinstance(objs, (pandas.Series, Series, DataFrame, str, pandas.DataFrame)): raise TypeError( "first argument must be an iterable of pandas " diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 0e60fe4c514..391d5179019 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -17,7 +17,7 @@ import pandas import pandas.core.groupby from pandas.core.dtypes.common import is_list_like, is_numeric_dtype -from pandas.core.aggregation import reconstruct_func +from pandas.core.apply import reconstruct_func from pandas._libs.lib import no_default import pandas.core.common as com from types import BuiltinFunctionType diff --git a/modin/pandas/io.py b/modin/pandas/io.py index c5941d99064..7e197a98e69 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -26,7 +26,7 @@ import pathlib import re from collections import OrderedDict -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from typing import Union, IO, AnyStr, Sequence, Dict, List, Optional, Any from modin.error_message import ErrorMessage @@ -73,18 +73,18 @@ def _read(**kwargs): @_inherit_docstrings(pandas.read_csv) def read_csv( - filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]], + filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]", sep=lib.no_default, delimiter=None, header="infer", names=lib.no_default, index_col=None, usecols=None, - squeeze=False, + squeeze=None, prefix=lib.no_default, mangle_dupe_cols=True, - dtype=None, - engine=None, + dtype: "DtypeArg | None" = None, + engine: "CSVEngine | None" = None, converters=None, true_values=None, false_values=None, @@ -96,7 +96,7 @@ def read_csv( na_filter=True, verbose=False, skip_blank_lines=True, - parse_dates=False, + parse_dates=None, infer_datetime_format=False, keep_date_col=False, date_parser=None, @@ -104,16 +104,16 @@ def read_csv( cache_dates=True, iterator=False, chunksize=None, - compression="infer", + compression: "CompressionOptions" = "infer", thousands=None, - decimal: str = ".", + decimal: "str" = ".", lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, - encoding_errors="strict", + encoding_errors: "str | None" = "strict", dialect=None, error_bad_lines=None, warn_bad_lines=None, @@ -124,7 +124,7 @@ def read_csv( low_memory=True, memory_map=False, float_precision=None, - storage_options: StorageOptions = None, + storage_options: "StorageOptions" = None, ): # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { @@ -137,23 +137,24 @@ def read_csv( @_inherit_docstrings(pandas.read_table) def read_table( - filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]], + filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]", sep=lib.no_default, delimiter=None, header="infer", names=lib.no_default, index_col=None, usecols=None, - squeeze=False, + squeeze=None, prefix=lib.no_default, mangle_dupe_cols=True, - dtype=None, - engine=None, + dtype: "DtypeArg | None" = None, + engine: "CSVEngine | None" = None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, + skipfooter=0, nrows=None, na_values=None, keep_default_na=True, @@ -168,26 +169,26 @@ def read_table( cache_dates=True, iterator=False, chunksize=None, - compression="infer", + compression: "CompressionOptions" = "infer", thousands=None, - decimal: str = ".", + decimal: "str" = ".", lineterminator=None, quotechar='"', quoting=0, + doublequote=True, escapechar=None, comment=None, encoding=None, - encoding_errors="strict", + encoding_errors: "str | None" = "strict", dialect=None, error_bad_lines=None, warn_bad_lines=None, on_bad_lines=None, - skipfooter=0, - doublequote=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, + storage_options: "StorageOptions" = None, ): # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { @@ -317,32 +318,33 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover @_inherit_docstrings(pandas.read_excel) def read_excel( io, - sheet_name=0, - header=0, + sheet_name: "str | int | list[IntStrT] | None" = 0, + header: "int | Sequence[int] | None" = 0, names=None, - index_col=None, + index_col: "int | Sequence[int] | None" = None, usecols=None, - squeeze=False, - dtype=None, - engine=None, + squeeze: "bool | None" = None, + dtype: "DtypeArg | None" = None, + engine: "Literal[('xlrd', 'openpyxl', 'odf', 'pyxlsb')] | None" = None, converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, + true_values: "Iterable[Hashable] | None" = None, + false_values: "Iterable[Hashable] | None" = None, + skiprows: "Sequence[int] | int | Callable[[int], object] | None" = None, + nrows: "int | None" = None, na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, + keep_default_na: "bool" = True, + na_filter: "bool" = True, + verbose: "bool" = False, parse_dates=False, date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=None, - mangle_dupe_cols=True, - storage_options: StorageOptions = None, -): + thousands: "str | None" = None, + decimal: "str" = ".", + comment: "str | None" = None, + skipfooter: "int" = 0, + convert_float: "bool | None" = None, + mangle_dupe_cols: "bool" = True, + storage_options: "StorageOptions" = None, +) -> "DataFrame | dict[IntStrT, DataFrame]": _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -438,7 +440,7 @@ def read_sas( @_inherit_docstrings(pandas.read_pickle) def read_pickle( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: Optional[str] = "infer", storage_options: StorageOptions = None, ): @@ -557,7 +559,7 @@ def read_spss( @_inherit_docstrings(pandas.to_pickle) def to_pickle( obj: Any, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, @@ -597,9 +599,7 @@ def json_normalize( @_inherit_docstrings(pandas.read_orc) -def read_orc( - path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs -) -> DataFrame: +def read_orc(path, columns: Optional[List[str]] = None, **kwargs) -> DataFrame: ErrorMessage.default_to_pandas("read_orc") Engine.subscribe(_update_engine) return DataFrame(pandas.read_orc(path, columns, **kwargs)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index f6a868d8281..49450900946 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1161,6 +1161,23 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D20 skipna = True return super(Series, self).idxmin(axis=axis, skipna=skipna, *args, **kwargs) + def info( + self, + verbose: "bool | None" = None, + buf: "IO[str] | None" = None, + max_cols: "int | None" = None, + memory_usage: "bool | str | None" = None, + show_counts: "bool" = True, + ): + return self._default_to_pandas( + pandas.Series.info, + verbose=verbose, + buf=buf, + max_cols=max_cols, + memory_usage=memory_usage, + show_counts=show_counts, + ) + def interpolate( self, method="linear", @@ -1218,7 +1235,12 @@ def keys(self): # noqa: RT01, D200 return self.index def kurt( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, + axis: "Axis | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): # noqa: PR01, RT01, D200 """ Return unbiased kurtosis over requested axis. @@ -1262,6 +1284,27 @@ def arg(s): ) ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors=no_default, + try_cast=no_default, + ): + return self._default_to_pandas( + pandas.Series.mask, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + ) + def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200 """ Return the memory usage of the Series. @@ -1409,7 +1452,7 @@ def pow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, def prod( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1476,10 +1519,11 @@ def ravel(self, order="C"): # noqa: PR01, RT01, D200 return data - def reindex(self, index=None, **kwargs): # noqa: PR01, RT01, D200 + def reindex(self, *args, **kwargs): # noqa: PR01, RT01, D200 """ Conform Series to new index with optional filling logic. """ + index = kwargs.pop("index", None) method = kwargs.pop("method", None) level = kwargs.pop("level", None) copy = kwargs.pop("copy", True) @@ -1543,7 +1587,7 @@ def repeat(self, repeats, axis=None): # noqa: PR01, RT01, D200 return self.__constructor__(query_compiler=self._query_compiler.repeat(repeats)) def reset_index( - self, level=None, drop=False, name=None, inplace=False + self, level=None, drop=False, name=no_default, inplace=False ): # noqa: PR01, RT01, D200 """ Generate a new Series with the index reset. @@ -1653,11 +1697,11 @@ def reorder_levels(self, order): # noqa: PR01, RT01, D200 def replace( self, to_replace=None, - value=None, + value=no_default, inplace=False, limit=None, regex=False, - method="pad", + method: "str | lib.NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -1770,7 +1814,7 @@ def sub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, def sum( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1843,7 +1887,9 @@ def to_dict(self, into=dict): # pragma: no cover # noqa: PR01, RT01, D200 """ return self._default_to_pandas("to_dict", into=into) - def to_frame(self, name=None): # noqa: PR01, RT01, D200 + def to_frame( + self, name: "Hashable" = no_default + ) -> "DataFrame": # noqa: PR01, RT01, D200 """ Convert Series to {label -> value} dict or dict-like object. """ @@ -2011,11 +2057,11 @@ def view(self, dtype=None): # noqa: PR01, RT01, D200 def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, - errors="raise", + errors=no_default, try_cast=no_default, ): # noqa: PR01, RT01, D200 """ diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 7f44cf83336..770dfeeea1c 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -1397,9 +1397,10 @@ def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna): modin_df = modin_df.T pandas_df = pandas_df.T - md_grp, pd_grp = modin_df.groupby( - **groupby_kwargs, dropna=dropna - ), pandas_df.groupby(**groupby_kwargs, dropna=dropna) + md_grp, pd_grp = ( + modin_df.groupby(**groupby_kwargs, dropna=dropna), + pandas_df.groupby(**groupby_kwargs, dropna=dropna), + ) modin_groupby_equals_pandas(md_grp, pd_grp) by_kwarg = groupby_kwargs.get("by", []) diff --git a/setup.cfg b/setup.cfg index 4b68525c140..cef307bd269 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,7 @@ filterwarnings = [flake8] max-line-length = 88 -ignore = E203, E266, E501, W503 +ignore = E203, E266, E501, W503, F821 select = B,C,E,F,W,T4,B9 per-file-ignores = modin/pandas/__init__.py:E402,F401 From cfdfa5193ec79a4c02a8c8c28dcf1f35828c737d Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:04:47 +0300 Subject: [PATCH 03/63] Upgrade pandas to 1.4.0 in env files Signed-off-by: Igoshev, Yaroslav --- environment-dev.yml | 2 +- modin/pandas/__init__.py | 2 +- requirements-dev.txt | 2 +- requirements/env_omnisci.yml | 2 +- requirements/requirements-no-engine.yml | 2 +- setup.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 05ffcb23609..4fd460f80f1 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -2,7 +2,7 @@ name: modin channels: - conda-forge dependencies: - - pandas==1.3.5 + - pandas==1.4.0 - numpy>=1.16.5 - pyarrow>=4.0.1 - dask[complete]>=2.22.0 diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index d45498c7369..bca22767a39 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -14,7 +14,7 @@ import pandas import warnings -__pandas_version__ = "1.3.5" +__pandas_version__ = "1.4.0" if pandas.__version__ != __pandas_version__: warnings.warn( diff --git a/requirements-dev.txt b/requirements-dev.txt index d321683183a..50073f6cecf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -pandas==1.3.5 +pandas==1.4.0 numpy>=1.16.5 pyarrow>=4.0.1 dask[complete]>=2.22.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index 428775426b9..366a4cf6d75 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -2,7 +2,7 @@ name: modin_on_omnisci channels: - conda-forge dependencies: - - pandas==1.3.5 + - pandas==1.4.0 - pyarrow==5.0.0 - numpy>=1.16.5 - fsspec diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml index 79a65081e4e..1e3eb5f908d 100644 --- a/requirements/requirements-no-engine.yml +++ b/requirements/requirements-no-engine.yml @@ -1,7 +1,7 @@ channels: - conda-forge dependencies: - - pandas==1.3.5 + - pandas==1.4.0 - numpy>=1.16.5 - pyarrow>=4.0.1 - fsspec diff --git a/setup.py b/setup.py index e42555d77d7..f5c0d03880a 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ url="https://github.com/modin-project/modin", long_description=long_description, long_description_content_type="text/markdown", - install_requires=["pandas==1.3.5", "packaging", "numpy>=1.16.5", "fsspec"], + install_requires=["pandas==1.4.0", "packaging", "numpy>=1.16.5", "fsspec"], extras_require={ # can be installed by pip install modin[dask] "dask": dask_deps, From de9d0950585c7137d0e00067aa0502824f8623f4 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:25:02 +0300 Subject: [PATCH 04/63] Upgrade min python version in setup.py and ci.yml Signed-off-by: Igoshev, Yaroslav --- .github/workflows/ci.yml | 38 +++++++++++++++++++------------------- setup.py | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 73496b6b3f2..995f7cef5d9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install black - run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py @@ -43,7 +43,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install -r docs/requirements-doc.txt - run: cd docs && sphinx-build -T -E -b html . build @@ -57,7 +57,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install pytest pytest-cov pydocstyle numpydoc==1.1.0 xgboost - run: pytest scripts/test @@ -132,7 +132,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install flake8 flake8-print - run: flake8 --enable=T modin/ asv_bench/benchmarks scripts/doc_checker.py @@ -152,7 +152,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an HTTP error. Retry @@ -185,7 +185,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -214,7 +214,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - name: Clean install and run run: | @@ -235,7 +235,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - name: Clean install and run run: | @@ -258,7 +258,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -294,7 +294,7 @@ jobs: env: MODIN_MEMORY: 1000000000 MODIN_TEST_DATASET_SIZE: "small" - name: Test ${{ matrix.execution }} execution, Python 3.7 + name: Test ${{ matrix.execution }} execution, Python 3.8 steps: - uses: actions/checkout@v2 with: @@ -303,7 +303,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -357,7 +357,7 @@ jobs: MODIN_EXPERIMENTAL: "True" MODIN_ENGINE: "native" MODIN_STORAGE_FORMAT: "omnisci" - name: Test OmniSci storage format, Python 3.7 + name: Test OmniSci storage format, Python 3.8 steps: - uses: actions/checkout@v2 with: @@ -367,7 +367,7 @@ jobs: with: activate-environment: modin_on_omnisci environment-file: requirements/env_omnisci.yml - python-version: 3.7 + python-version: 3.8 use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry # it once if it fails. todo(https://github.com/conda-incubator/setup-miniconda/issues/129): @@ -471,7 +471,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] engine: ["python", "ray", "dask"] env: MODIN_ENGINE: ${{matrix.engine}} @@ -558,7 +558,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -604,7 +604,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -642,7 +642,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] engine: ["ray", "dask"] test-task: - modin/pandas/test/dataframe/test_binary.py @@ -704,7 +704,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] env: MODIN_STORAGE_FORMAT: pyarrow MODIN_EXPERIMENTAL: "True" @@ -741,7 +741,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: [ "3.7", "3.8" ] + python-version: ["3.8" ] engine: ["ray", "dask"] env: MODIN_EXPERIMENTAL: "True" diff --git a/setup.py b/setup.py index f5c0d03880a..de681def55c 100644 --- a/setup.py +++ b/setup.py @@ -32,5 +32,5 @@ "sql": sql_deps, "all": all_deps, }, - python_requires=">=3.7.1", + python_requires=">=3.8", ) From 81955c699b4f5e4eb129141f432612a9a0e7a72e Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:29:37 +0300 Subject: [PATCH 05/63] Upgrade min numpy version Signed-off-by: Igoshev, Yaroslav --- environment-dev.yml | 2 +- requirements-dev.txt | 2 +- requirements/env_omnisci.yml | 2 +- requirements/requirements-no-engine.yml | 2 +- setup.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 4fd460f80f1..283393d2890 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - pandas==1.4.0 - - numpy>=1.16.5 + - numpy>=1.18.5 - pyarrow>=4.0.1 - dask[complete]>=2.22.0 - distributed>=2.22.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index 50073f6cecf..1e8e356f7d6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ pandas==1.4.0 -numpy>=1.16.5 +numpy>=1.18.5 pyarrow>=4.0.1 dask[complete]>=2.22.0 distributed>=2.22.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index 366a4cf6d75..ec44b999dfb 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -4,7 +4,7 @@ channels: dependencies: - pandas==1.4.0 - pyarrow==5.0.0 - - numpy>=1.16.5 + - numpy>=1.18.5 - fsspec - pip - pytest>=6.0.1 diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml index 1e3eb5f908d..814ce0fd0cd 100644 --- a/requirements/requirements-no-engine.yml +++ b/requirements/requirements-no-engine.yml @@ -2,7 +2,7 @@ channels: - conda-forge dependencies: - pandas==1.4.0 - - numpy>=1.16.5 + - numpy>=1.18.5 - pyarrow>=4.0.1 - fsspec - xarray diff --git a/setup.py b/setup.py index de681def55c..a1935df70a3 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ url="https://github.com/modin-project/modin", long_description=long_description, long_description_content_type="text/markdown", - install_requires=["pandas==1.4.0", "packaging", "numpy>=1.16.5", "fsspec"], + install_requires=["pandas==1.4.0", "packaging", "numpy>=1.18.5", "fsspec"], extras_require={ # can be installed by pip install modin[dask] "dask": dask_deps, From ec62f84af3521d206b981c8765ff38b6d20d2b70 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:49:22 +0300 Subject: [PATCH 06/63] Remove FilePathOrBuffer import Signed-off-by: Igoshev, Yaroslav --- .../native/implementations/omnisci_on_native/io/io.py | 3 +-- modin/experimental/pandas/io.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py index b5d03b32b50..d3eb167a6fb 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py @@ -37,7 +37,6 @@ import pandas import pandas._libs.lib as lib -from pandas._typing import FilePathOrBuffer from pandas.io.common import is_url ReadCsvKwargsType = Dict[ @@ -51,7 +50,7 @@ Sequence, Callable, Dialect, - FilePathOrBuffer, + None, None, ], ] diff --git a/modin/experimental/pandas/io.py b/modin/experimental/pandas/io.py index 74614b08624..68333577a06 100644 --- a/modin/experimental/pandas/io.py +++ b/modin/experimental/pandas/io.py @@ -20,7 +20,7 @@ import pandas import pandas._libs.lib as lib -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from . import DataFrame from modin.config import IsExperimental, Engine @@ -245,7 +245,7 @@ def _read(**kwargs) -> DataFrame: def read_pickle_distributed( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: Optional[str] = "infer", storage_options: StorageOptions = None, ): @@ -289,7 +289,7 @@ def read_pickle_distributed( def to_pickle_distributed( self, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, From 10b970edb917c7597144bd4fc37aa8bb7a1ced1c Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 15:14:06 +0300 Subject: [PATCH 07/63] Handle axis more carefully Signed-off-by: Igoshev, Yaroslav --- modin/pandas/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 67d83abfa8a..a845c73c8eb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -528,6 +528,9 @@ def _get_axis_number(cls, axis): int 0 or 1 - axis index in the array of axes stored in the dataframe. """ + if axis is no_default: + axis = None + return cls._pandas_class._get_axis_number(axis) if axis is not None else 0 def __constructor__(self, *args, **kwargs): From c250e6b66f94af05e1cd5606947dfa2729ee64c2 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Mon, 24 Jan 2022 16:37:31 +0300 Subject: [PATCH 08/63] Fix `test_resample_getitem`. Details are in https://github.com/pandas-dev/pandas/pull/44944 --- modin/pandas/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index a845c73c8eb..6c8fb86d2fe 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3221,7 +3221,7 @@ def _get_new_resampler(key): if isinstance( key, (list, tuple, Series, pandas.Series, pandas.Index, np.ndarray) ): - if len(self._dataframe.columns.intersection(key)) != len(key): + if len(self._dataframe.columns.intersection(key)) != len(set(key)): missed_keys = list(set(key).difference(self._dataframe.columns)) raise KeyError(f"Columns not found: {str(sorted(missed_keys))[1:-1]}") return _get_new_resampler(list(key)) From c6ad1aa08c7ed650c8a5ab5583a23ada785693ea Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 24 Jan 2022 12:17:59 -0600 Subject: [PATCH 09/63] Fix kurtosis exception type Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 6c8fb86d2fe..2d93b0270ae 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1548,8 +1548,7 @@ def kurt( **kwargs, ): axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: func_kwargs = { "skipna": skipna, From 2f01bf44c7035ae372667138e2db2a78620273a1 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 10:24:43 -0800 Subject: [PATCH 10/63] Fix test_append by removing stale sort workaround Signed-off-by: Rehan Durrani --- modin/pandas/test/dataframe/test_map_metadata.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py index 692173c1c4f..53358805d3e 100644 --- a/modin/pandas/test/dataframe/test_map_metadata.py +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -410,15 +410,6 @@ def test_append(data): modin_df.append(list(modin_df.iloc[-1])) else: modin_result = modin_df.append(list(modin_df.iloc[-1])) - # Pandas has bug where sort=False is ignored - # (https://github.com/pandas-dev/pandas/issues/35092), but Modin - # now does the right thing, so for now manually sort to workaround - # this. Once the Pandas bug is fixed and Modin upgrades to that - # Pandas release, this sort will cause the test to fail, and the - # next three lines should be deleted. - if get_current_execution() != "BaseOnPython": - assert list(modin_result.columns) == list(modin_df.columns) + [0] - modin_result = modin_result[[0] + sorted(modin_df.columns)] df_equals(modin_result, pandas_result) verify_integrity_values = [True, False] From b4639dee96cf1bf5201912eac261f3ab80c23f4f Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 24 Jan 2022 12:28:51 -0600 Subject: [PATCH 11/63] Fix more tests Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 3 +-- modin/pandas/dataframe.py | 3 +-- modin/pandas/series.py | 3 +-- modin/pandas/test/test_series.py | 4 ++-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 2d93b0270ae..727db9c57bc 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1707,8 +1707,7 @@ def _stat_operation( `DataFrame` - self is DataFrame and level is specified. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: return self._default_to_pandas( op_name, diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index df4f3d4c3d9..add1bccf67b 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -2020,8 +2020,7 @@ def sum( Return the sum of the values over the requested axis. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) axis_to_apply = self.columns if axis else self.index if ( skipna is not False diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 49450900946..d4a05ff2be5 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1824,8 +1824,7 @@ def sum( Return the sum of the values. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if numeric_only is True: raise NotImplementedError("Series.sum does not implement numeric_only") if level is not None: diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 65b5f09b888..319efa5552a 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -3512,8 +3512,8 @@ def test_var(data, skipna, ddof): try: pandas_result = pandas_series.var(skipna=skipna, ddof=ddof) - except Exception: - with pytest.raises(TypeError): + except Exception as e: + with pytest.raises(type(e)): modin_series.var(skipna=skipna, ddof=ddof) else: modin_result = modin_series.var(skipna=skipna, ddof=ddof) From 9b1bfdde49f484bae93a098d4d7bf105a8abbf62 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 24 Jan 2022 12:33:38 -0600 Subject: [PATCH 12/63] Fix more skipna changes Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 9 +++------ modin/pandas/dataframe.py | 3 +-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 727db9c57bc..fe05efd17ba 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1601,8 +1601,7 @@ def loc(self): def mad(self, axis=None, skipna=True, level=None): axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: if ( not self._query_compiler.has_multiindex(axis=axis) @@ -1646,8 +1645,7 @@ def max( numeric_only=None, **kwargs, ): - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: return self._default_to_pandas( "max", @@ -1779,8 +1777,7 @@ def min( numeric_only=None, **kwargs, ): - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: return self._default_to_pandas( "min", diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index add1bccf67b..bd4dcafefd5 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1625,8 +1625,7 @@ def prod( Return the product of the values over the requested axis. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: if ( not self._query_compiler.has_multiindex(axis=axis) From 13379eb789cf95581514b3ef17e643130f787d11 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 10:49:23 -0800 Subject: [PATCH 13/63] Update series.py __repr__ to use display.max_{rows|cols} instead of max_{rows|cols} in call to pandas.get_option Signed-off-by: Rehan Durrani --- modin/pandas/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 49450900946..b0431d7d4d1 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -453,8 +453,8 @@ def __repr__(self): ------- str """ - num_rows = pandas.get_option("max_rows") or 60 - num_cols = pandas.get_option("max_columns") or 20 + num_rows = pandas.get_option("display.max_rows") or 60 + num_cols = pandas.get_option("display.max_columns") or 20 temp_df = self._build_repr_df(num_rows, num_cols) if isinstance(temp_df, pandas.DataFrame) and not temp_df.empty: temp_df = temp_df.iloc[:, 0] From 8fa36a00f87514c79a84e9bf441678398466529d Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 11:13:16 -0800 Subject: [PATCH 14/63] Update simple_row_groupby to specify categorical data is ordered Signed-off-by: Rehan Durrani --- modin/pandas/test/test_groupby.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 770dfeeea1c..b9de5cef840 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -316,6 +316,9 @@ def test_simple_row_groupby(by, as_index, col1_category): if col1_category: pandas_df = pandas_df.astype({"col1": "category"}) + # As of pandas 1.4.0 operators like min cause TypeErrors to be raised on unordered + # categorical columns. We need to specify the categorical column as ordered to bypass this. + pandas_df["col1"] = pandas_df["col1"].cat.as_ordered() modin_df = from_pandas(pandas_df) n = 1 From 6bf97b6971506edb281236570cb960cd8f113821 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:28:24 -0800 Subject: [PATCH 15/63] Add codepath to check that Modin raises ValueError when passing None to skipna Signed-off-by: Rehan Durrani --- modin/pandas/test/test_series.py | 55 ++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 319efa5552a..ed3e63214eb 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -770,7 +770,11 @@ def test_align(data): ) def test_all(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.all(skipna=skipna) + else: + df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -779,7 +783,11 @@ def test_all(data, skipna): ) def test_any(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.any(skipna=skipna) + else: + df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2136,10 +2144,14 @@ def test_lt(data): @pytest.mark.parametrize("level", [0, -1, None]) def test_mad(level, data, axis, skipna): modin_series, pandas_series = create_test_series(data) - df_equals( - modin_series.mad(axis=axis, skipna=skipna, level=level), - pandas_series.mad(axis=axis, skipna=skipna, level=level), - ) + if skipna is None: + with pytest.raises(ValueError): + modin_series.mad(skipna=skipna) + else: + df_equals( + modin_series.mad(axis=axis, skipna=skipna, level=level), + pandas_series.mad(axis=axis, skipna=skipna, level=level), + ) @pytest.mark.parametrize("na_values", ["ignore", None], ids=["na_ignore", "na_none"]) @@ -2183,7 +2195,11 @@ def test_mask(): ) def test_max(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.max(skipna=skipna) + else: + df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2201,7 +2217,11 @@ def test_mean(data, skipna): ) def test_median(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.median(skipna=skipna) + else: + df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) @pytest.mark.parametrize( @@ -2407,10 +2427,15 @@ def test_product_alias(): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_prod(axis, skipna): - eval_general( - *create_test_series(test_data["float_nan_data"]), - lambda s: s.prod(axis=axis, skipna=skipna), - ) + if skipna is None: + with pytest.raises(ValueError): + modin_series, _ = create_test_series(test_data["float_nan_data"]) + modin_series.prod(skipna=skipna) + else: + eval_general( + *create_test_series(test_data["float_nan_data"]), + lambda s: s.prod(axis=axis, skipna=skipna), + ) @pytest.mark.parametrize( @@ -2949,7 +2974,11 @@ def test_size(data): ) def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.skew(skipna=skipna) + else: + df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From b98bfc4c6e6445baba6c9c2942741631f6619308 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:30:03 -0800 Subject: [PATCH 16/63] Add comment to explain new codepath Signed-off-by: Rehan Durrani --- modin/pandas/test/test_series.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index ed3e63214eb..43d1f9b4672 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -770,6 +770,9 @@ def test_align(data): ) def test_all(data, skipna): modin_series, pandas_series = create_test_series(data) + # We currently do not accept None for skipna, while pandas warns users that it will + # exception eventually, but still accepts it currently. We need this codepath to catch + # the exception Modin raises until pandas officially deprecates skipna=None. if skipna is None: with pytest.raises(ValueError): modin_series.all(skipna=skipna) From 892bc8ea94a429c2afb1096d7b640e891cd9df1a Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:35:34 -0800 Subject: [PATCH 17/63] Add codepath to check that Modin raises ValueError when passing None to skipna for dataframe tests Signed-off-by: Rehan Durrani --- modin/pandas/test/dataframe/test_default.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index c80c8c002b2..a2adf4b7b6f 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +from multiprocessing.sharedctypes import Value import pytest import numpy as np import pandas @@ -438,10 +439,17 @@ def test_last(): @pytest.mark.parametrize("skipna", [None, True, False]) def test_mad(data, axis, skipna): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - df_equals( - modin_df.mad(axis=axis, skipna=skipna, level=None), - pandas_df.mad(axis=axis, skipna=skipna, level=None), - ) + # We currently do not accept None for skipna, while pandas warns users that it will + # exception eventually, but still accepts it currently. We need this codepath to catch + # the exception Modin raises until pandas officially deprecates skipna=None. + if skipna is None: + with pytest.raises(ValueError): + modin_df.mad(axis=axis, skipna=skipna, level=None) + else: + df_equals( + modin_df.mad(axis=axis, skipna=skipna, level=None), + pandas_df.mad(axis=axis, skipna=skipna, level=None), + ) @pytest.mark.parametrize("level", [-1, 0, 1]) From f17181ceedfcec205b3ff2e4548cbec29d94b56b Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:37:34 -0800 Subject: [PATCH 18/63] Add codepath to check that Modin raises ValueError when passing None to skipna for series tests Signed-off-by: Rehan Durrani --- modin/pandas/test/test_series.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 43d1f9b4672..da7d7e80dfd 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2211,7 +2211,11 @@ def test_max(data, skipna): ) def test_mean(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.mean(skipna=skipna) + else: + df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2257,7 +2261,11 @@ def test_memory_usage(data, index): ) def test_min(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.min(skipna=skipna) + else: + df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From f4c4f54f0377d54fa47b9b3db772ed64d71f5ed7 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sun, 23 Jan 2022 21:59:06 -0600 Subject: [PATCH 19/63] FEAT-#4035: Upgrade pandas support to 1.4 Signed-off-by: Devin Petersohn --- modin/core/io/io.py | 4 +- modin/core/io/text/fwf_dispatcher.py | 3 +- modin/core/io/text/text_file_dispatcher.py | 5 +- modin/pandas/__init__.py | 129 +++++++++++---------- modin/pandas/base.py | 120 ++++++++++++++----- modin/pandas/dataframe.py | 41 ++++--- modin/pandas/general.py | 9 +- modin/pandas/groupby.py | 2 +- modin/pandas/io.py | 88 +++++++------- modin/pandas/series.py | 66 +++++++++-- modin/pandas/test/test_groupby.py | 7 +- setup.cfg | 2 +- 12 files changed, 292 insertions(+), 184 deletions(-) diff --git a/modin/core/io/io.py b/modin/core/io/io.py index 10b61d88032..813a094a002 100644 --- a/modin/core/io/io.py +++ b/modin/core/io/io.py @@ -23,7 +23,7 @@ import pandas import pandas._libs.lib as lib -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from pandas.util._decorators import doc from modin.db_conn import ModinDatabaseConnection @@ -826,7 +826,7 @@ def to_sql( def to_pickle( cls, obj: Any, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, diff --git a/modin/core/io/text/fwf_dispatcher.py b/modin/core/io/text/fwf_dispatcher.py index 63440776cfa..388ae096e8e 100644 --- a/modin/core/io/text/fwf_dispatcher.py +++ b/modin/core/io/text/fwf_dispatcher.py @@ -14,7 +14,6 @@ """Module houses `FWFDispatcher` class, that is used for reading of tables with fixed-width formatted lines.""" import pandas -from pandas._typing import FilePathOrBuffer from modin.core.io.text.text_file_dispatcher import TextFileDispatcher @@ -27,7 +26,7 @@ class FWFDispatcher(TextFileDispatcher): @classmethod def check_parameters_support( cls, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, read_kwargs: dict, ): """ diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index e8620e07abf..97db296fced 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -27,7 +27,6 @@ import numpy as np import pandas import pandas._libs.lib as lib -from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.common import is_list_like from modin.core.io.file_dispatcher import FileDispatcher, OpenFile @@ -614,7 +613,7 @@ def _launch_tasks(cls, splits: list, **partition_kwargs) -> Tuple[list, list, li @classmethod def check_parameters_support( cls, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, read_kwargs: dict, ) -> bool: """ @@ -912,7 +911,7 @@ def _get_new_qc( return new_query_compiler @classmethod - def _read(cls, filepath_or_buffer: FilePathOrBuffer, **kwargs): + def _read(cls, filepath_or_buffer, **kwargs): """ Read data from `filepath_or_buffer` according to `kwargs` parameters. diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index d6be7f69cb7..d45498c7369 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -12,12 +12,11 @@ # governing permissions and limitations under the License. import pandas +import warnings __pandas_version__ = "1.3.5" if pandas.__version__ != __pandas_version__: - import warnings - warnings.warn( "The pandas version installed {} does not match the supported pandas version in" " Modin {}. This may cause undesired side effects!".format( @@ -25,68 +24,70 @@ ) ) -from pandas import ( - eval, - cut, - factorize, - test, - qcut, - date_range, - period_range, - Index, - MultiIndex, - CategoricalIndex, - bdate_range, - DatetimeIndex, - Timedelta, - Timestamp, - to_timedelta, - set_eng_float_format, - options, - Flags, - set_option, - NaT, - PeriodIndex, - Categorical, - Interval, - UInt8Dtype, - UInt16Dtype, - UInt32Dtype, - UInt64Dtype, - SparseDtype, - Int8Dtype, - Int16Dtype, - Int32Dtype, - Int64Dtype, - Float32Dtype, - Float64Dtype, - StringDtype, - BooleanDtype, - CategoricalDtype, - DatetimeTZDtype, - IntervalDtype, - PeriodDtype, - RangeIndex, - Int64Index, - UInt64Index, - Float64Index, - TimedeltaIndex, - IntervalIndex, - IndexSlice, - Grouper, - array, - Period, - show_versions, - DateOffset, - timedelta_range, - infer_freq, - interval_range, - ExcelWriter, - datetime, - NamedAgg, - NA, - api, -) +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + from pandas import ( + eval, + cut, + factorize, + test, + qcut, + date_range, + period_range, + Index, + MultiIndex, + CategoricalIndex, + bdate_range, + DatetimeIndex, + Timedelta, + Timestamp, + to_timedelta, + set_eng_float_format, + options, + Flags, + set_option, + NaT, + PeriodIndex, + Categorical, + Interval, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + SparseDtype, + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + Float32Dtype, + Float64Dtype, + StringDtype, + BooleanDtype, + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, + RangeIndex, + Int64Index, + UInt64Index, + Float64Index, + TimedeltaIndex, + IntervalIndex, + IndexSlice, + Grouper, + array, + Period, + show_versions, + DateOffset, + timedelta_range, + infer_freq, + interval_range, + ExcelWriter, + datetime, + NamedAgg, + NA, + api, + ) import os import multiprocessing diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 5736a28310b..67d83abfa8a 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -35,7 +35,6 @@ from pandas._typing import ( CompressionOptions, IndexKeyFunc, - FilePathOrBuffer, StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -932,7 +931,13 @@ def at_time(self, time, asof=False, axis=None): return self.loc[indexer] if axis == 0 else self.loc[:, indexer] def between_time( - self, start_time, end_time, include_start=True, include_end=True, axis=None + self: "BasePandasDataset", + start_time, + end_time, + include_start: "bool_t | lib.NoDefault" = no_default, + include_end: "bool_t | lib.NoDefault" = no_default, + inclusive: "str | None" = None, + axis=None, ): axis = self._get_axis_number(axis) idx = self.index if axis == 0 else self.columns @@ -943,6 +948,7 @@ def between_time( end_time, include_start=include_start, include_end=include_end, + inclusive=inclusive, ) .index ) @@ -1280,16 +1286,17 @@ def explode(self, column, ignore_index: bool = False): def ewm( self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - times=None, - ): + com: "float | None" = None, + span: "float | None" = None, + halflife: "float | TimedeltaConvertibleTypes | None" = None, + alpha: "float | None" = None, + min_periods: "int | None" = 0, + adjust: "bool_t" = True, + ignore_na: "bool_t" = False, + axis: "Axis" = 0, + times: "str | np.ndarray | BasePandasDataset | None" = None, + method: "str" = "single", + ) -> "ExponentialMovingWindow": return self._default_to_pandas( "ewm", com=com, @@ -1301,6 +1308,7 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, + method=method, ) def expanding(self, min_periods=1, center=None, axis=0, method="single"): @@ -1528,7 +1536,14 @@ def iloc(self): return _iLocIndexer(self) - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def kurt( + self, + axis: "Axis | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): axis = self._get_axis_number(axis) if skipna is None: skipna = True @@ -1582,7 +1597,7 @@ def loc(self): return _LocIndexer(self) - def mad(self, axis=None, skipna=None, level=None): + def mad(self, axis=None, skipna=True, level=None): axis = self._get_axis_number(axis) if skipna is None: skipna = True @@ -1621,7 +1636,14 @@ def mask( try_cast=try_cast, ) - def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def max( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): if skipna is None: skipna = True if level is not None: @@ -1721,10 +1743,24 @@ def _stat_operation( ) return self._reduce_dimension(result_qc) - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def mean( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation("mean", axis, skipna, level, numeric_only, **kwargs) - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def median( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation( "median", axis, skipna, level, numeric_only, **kwargs ) @@ -1734,7 +1770,14 @@ def memory_usage(self, index=True, deep=False): self._query_compiler.memory_usage(index=index, deep=deep) ) - def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def min( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): if skipna is None: skipna = True if level is not None: @@ -1873,13 +1916,13 @@ def check_dtype(t): return result def rank( - self, + self: "BasePandasDataset", axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, + method: "str" = "average", + numeric_only: "bool_t | None | lib.NoDefault" = no_default, + na_option: "str" = "keep", + ascending: "bool_t" = True, + pct: "bool_t" = False, ): axis = self._get_axis_number(axis) return self.__constructor__( @@ -2258,7 +2301,13 @@ def sample( return self.__constructor__(query_compiler=query_compiler) def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + **kwargs, ): return self._stat_operation( "sem", axis, skipna, level, numeric_only, ddof=ddof, **kwargs @@ -2377,7 +2426,14 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=no_default): else: return self.tshift(periods, freq) - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + def skew( + self, + axis: "int | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, + ): return self._stat_operation("skew", axis, skipna, level, numeric_only, **kwargs) def sort_index( @@ -2446,7 +2502,13 @@ def sort_values( return self._create_or_update_from_compiler(result, inplace) def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, + axis=None, + skipna=True, + level=None, + ddof=1, + numeric_only=None, + **kwargs, ): return self._stat_operation( "std", axis, skipna, level, numeric_only, ddof=ddof, **kwargs @@ -2703,7 +2765,7 @@ def to_period(self, freq=None, axis=0, copy=True): # pragma: no cover def to_pickle( self, - path: FilePathOrBuffer, + path, compression: CompressionOptions = "infer", protocol: int = pkl.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, @@ -2909,7 +2971,7 @@ def value_counts( return counted_values def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs ): return self._stat_operation( "var", axis, skipna, level, numeric_only, ddof=ddof, **kwargs diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index b81cf54cfe5..df4f3d4c3d9 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -31,7 +31,7 @@ import functools import numpy as np import sys -from typing import IO, Optional, Union, Mapping, Iterator +from typing import IO, Optional, Union, Iterator import warnings from modin.pandas import Categorical @@ -1615,7 +1615,7 @@ def pow( def prod( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1771,11 +1771,11 @@ def rename( def replace( self, to_replace=None, - value=None, - inplace=False, + value=no_default, + inplace: "bool" = False, limit=None, - regex=False, - method="pad", + regex: "bool" = False, + method: "str | lib.NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -2010,7 +2010,7 @@ def sub( def sum( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -2211,17 +2211,19 @@ def to_records( def to_stata( self, - path, - convert_dates=None, - write_index=True, - byteorder=None, - time_stamp=None, - data_label=None, - variable_labels=None, - version=114, - convert_strl=None, - compression: Union[str, Mapping[str, str], None] = "infer", - storage_options: StorageOptions = None, + path: "FilePath | WriteBuffer[bytes]", + convert_dates: "dict[Hashable, str] | None" = None, + write_index: "bool" = True, + byteorder: "str | None" = None, + time_stamp: "datetime.datetime | None" = None, + data_label: "str | None" = None, + variable_labels: "dict[Hashable, str] | None" = None, + version: "int | None" = 114, + convert_strl: "Sequence[Hashable] | None" = None, + compression: "CompressionOptions" = "infer", + storage_options: "StorageOptions" = None, + *, + value_labels: "dict[Hashable, dict[float | int, str]] | None" = None, ): # pragma: no cover # noqa: PR01, RT01, D200 """ Export ``DataFrame`` object to Stata data format. @@ -2239,6 +2241,7 @@ def to_stata( convert_strl=convert_strl, compression=compression, storage_options=storage_options, + value_labels=value_labels, ) def to_timestamp( @@ -2329,7 +2332,7 @@ def update( def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 892b21b364d..dc8a8996f31 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -16,8 +16,7 @@ import pandas import numpy as np -from typing import Hashable, Iterable, Mapping, Optional, Union -from pandas._typing import FrameOrSeriesUnion +from typing import Hashable, Iterable, Mapping, Union from pandas.core.dtypes.common import is_list_like from modin.error_message import ErrorMessage @@ -359,9 +358,7 @@ def value_counts( @_inherit_docstrings(pandas.concat) def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", axis=0, join="outer", ignore_index: bool = False, @@ -371,7 +368,7 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: bool = True, -) -> FrameOrSeriesUnion: +) -> "DataFrame | Series": if isinstance(objs, (pandas.Series, Series, DataFrame, str, pandas.DataFrame)): raise TypeError( "first argument must be an iterable of pandas " diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 0e60fe4c514..391d5179019 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -17,7 +17,7 @@ import pandas import pandas.core.groupby from pandas.core.dtypes.common import is_list_like, is_numeric_dtype -from pandas.core.aggregation import reconstruct_func +from pandas.core.apply import reconstruct_func from pandas._libs.lib import no_default import pandas.core.common as com from types import BuiltinFunctionType diff --git a/modin/pandas/io.py b/modin/pandas/io.py index c5941d99064..7e197a98e69 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -26,7 +26,7 @@ import pathlib import re from collections import OrderedDict -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from typing import Union, IO, AnyStr, Sequence, Dict, List, Optional, Any from modin.error_message import ErrorMessage @@ -73,18 +73,18 @@ def _read(**kwargs): @_inherit_docstrings(pandas.read_csv) def read_csv( - filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]], + filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]", sep=lib.no_default, delimiter=None, header="infer", names=lib.no_default, index_col=None, usecols=None, - squeeze=False, + squeeze=None, prefix=lib.no_default, mangle_dupe_cols=True, - dtype=None, - engine=None, + dtype: "DtypeArg | None" = None, + engine: "CSVEngine | None" = None, converters=None, true_values=None, false_values=None, @@ -96,7 +96,7 @@ def read_csv( na_filter=True, verbose=False, skip_blank_lines=True, - parse_dates=False, + parse_dates=None, infer_datetime_format=False, keep_date_col=False, date_parser=None, @@ -104,16 +104,16 @@ def read_csv( cache_dates=True, iterator=False, chunksize=None, - compression="infer", + compression: "CompressionOptions" = "infer", thousands=None, - decimal: str = ".", + decimal: "str" = ".", lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, - encoding_errors="strict", + encoding_errors: "str | None" = "strict", dialect=None, error_bad_lines=None, warn_bad_lines=None, @@ -124,7 +124,7 @@ def read_csv( low_memory=True, memory_map=False, float_precision=None, - storage_options: StorageOptions = None, + storage_options: "StorageOptions" = None, ): # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { @@ -137,23 +137,24 @@ def read_csv( @_inherit_docstrings(pandas.read_table) def read_table( - filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]], + filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]", sep=lib.no_default, delimiter=None, header="infer", names=lib.no_default, index_col=None, usecols=None, - squeeze=False, + squeeze=None, prefix=lib.no_default, mangle_dupe_cols=True, - dtype=None, - engine=None, + dtype: "DtypeArg | None" = None, + engine: "CSVEngine | None" = None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, + skipfooter=0, nrows=None, na_values=None, keep_default_na=True, @@ -168,26 +169,26 @@ def read_table( cache_dates=True, iterator=False, chunksize=None, - compression="infer", + compression: "CompressionOptions" = "infer", thousands=None, - decimal: str = ".", + decimal: "str" = ".", lineterminator=None, quotechar='"', quoting=0, + doublequote=True, escapechar=None, comment=None, encoding=None, - encoding_errors="strict", + encoding_errors: "str | None" = "strict", dialect=None, error_bad_lines=None, warn_bad_lines=None, on_bad_lines=None, - skipfooter=0, - doublequote=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None, + storage_options: "StorageOptions" = None, ): # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { @@ -317,32 +318,33 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover @_inherit_docstrings(pandas.read_excel) def read_excel( io, - sheet_name=0, - header=0, + sheet_name: "str | int | list[IntStrT] | None" = 0, + header: "int | Sequence[int] | None" = 0, names=None, - index_col=None, + index_col: "int | Sequence[int] | None" = None, usecols=None, - squeeze=False, - dtype=None, - engine=None, + squeeze: "bool | None" = None, + dtype: "DtypeArg | None" = None, + engine: "Literal[('xlrd', 'openpyxl', 'odf', 'pyxlsb')] | None" = None, converters=None, - true_values=None, - false_values=None, - skiprows=None, - nrows=None, + true_values: "Iterable[Hashable] | None" = None, + false_values: "Iterable[Hashable] | None" = None, + skiprows: "Sequence[int] | int | Callable[[int], object] | None" = None, + nrows: "int | None" = None, na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, + keep_default_na: "bool" = True, + na_filter: "bool" = True, + verbose: "bool" = False, parse_dates=False, date_parser=None, - thousands=None, - comment=None, - skipfooter=0, - convert_float=None, - mangle_dupe_cols=True, - storage_options: StorageOptions = None, -): + thousands: "str | None" = None, + decimal: "str" = ".", + comment: "str | None" = None, + skipfooter: "int" = 0, + convert_float: "bool | None" = None, + mangle_dupe_cols: "bool" = True, + storage_options: "StorageOptions" = None, +) -> "DataFrame | dict[IntStrT, DataFrame]": _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -438,7 +440,7 @@ def read_sas( @_inherit_docstrings(pandas.read_pickle) def read_pickle( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: Optional[str] = "infer", storage_options: StorageOptions = None, ): @@ -557,7 +559,7 @@ def read_spss( @_inherit_docstrings(pandas.to_pickle) def to_pickle( obj: Any, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, @@ -597,9 +599,7 @@ def json_normalize( @_inherit_docstrings(pandas.read_orc) -def read_orc( - path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs -) -> DataFrame: +def read_orc(path, columns: Optional[List[str]] = None, **kwargs) -> DataFrame: ErrorMessage.default_to_pandas("read_orc") Engine.subscribe(_update_engine) return DataFrame(pandas.read_orc(path, columns, **kwargs)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index f6a868d8281..49450900946 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1161,6 +1161,23 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs): # noqa: PR01, RT01, D20 skipna = True return super(Series, self).idxmin(axis=axis, skipna=skipna, *args, **kwargs) + def info( + self, + verbose: "bool | None" = None, + buf: "IO[str] | None" = None, + max_cols: "int | None" = None, + memory_usage: "bool | str | None" = None, + show_counts: "bool" = True, + ): + return self._default_to_pandas( + pandas.Series.info, + verbose=verbose, + buf=buf, + max_cols=max_cols, + memory_usage=memory_usage, + show_counts=show_counts, + ) + def interpolate( self, method="linear", @@ -1218,7 +1235,12 @@ def keys(self): # noqa: RT01, D200 return self.index def kurt( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + self, + axis: "Axis | None | lib.NoDefault" = no_default, + skipna=True, + level=None, + numeric_only=None, + **kwargs, ): # noqa: PR01, RT01, D200 """ Return unbiased kurtosis over requested axis. @@ -1262,6 +1284,27 @@ def arg(s): ) ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors=no_default, + try_cast=no_default, + ): + return self._default_to_pandas( + pandas.Series.mask, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + ) + def memory_usage(self, index=True, deep=False): # noqa: PR01, RT01, D200 """ Return the memory usage of the Series. @@ -1409,7 +1452,7 @@ def pow(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, def prod( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1476,10 +1519,11 @@ def ravel(self, order="C"): # noqa: PR01, RT01, D200 return data - def reindex(self, index=None, **kwargs): # noqa: PR01, RT01, D200 + def reindex(self, *args, **kwargs): # noqa: PR01, RT01, D200 """ Conform Series to new index with optional filling logic. """ + index = kwargs.pop("index", None) method = kwargs.pop("method", None) level = kwargs.pop("level", None) copy = kwargs.pop("copy", True) @@ -1543,7 +1587,7 @@ def repeat(self, repeats, axis=None): # noqa: PR01, RT01, D200 return self.__constructor__(query_compiler=self._query_compiler.repeat(repeats)) def reset_index( - self, level=None, drop=False, name=None, inplace=False + self, level=None, drop=False, name=no_default, inplace=False ): # noqa: PR01, RT01, D200 """ Generate a new Series with the index reset. @@ -1653,11 +1697,11 @@ def reorder_levels(self, order): # noqa: PR01, RT01, D200 def replace( self, to_replace=None, - value=None, + value=no_default, inplace=False, limit=None, regex=False, - method="pad", + method: "str | lib.NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. @@ -1770,7 +1814,7 @@ def sub(self, other, level=None, fill_value=None, axis=0): # noqa: PR01, RT01, def sum( self, axis=None, - skipna=None, + skipna=True, level=None, numeric_only=None, min_count=0, @@ -1843,7 +1887,9 @@ def to_dict(self, into=dict): # pragma: no cover # noqa: PR01, RT01, D200 """ return self._default_to_pandas("to_dict", into=into) - def to_frame(self, name=None): # noqa: PR01, RT01, D200 + def to_frame( + self, name: "Hashable" = no_default + ) -> "DataFrame": # noqa: PR01, RT01, D200 """ Convert Series to {label -> value} dict or dict-like object. """ @@ -2011,11 +2057,11 @@ def view(self, dtype=None): # noqa: PR01, RT01, D200 def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, - errors="raise", + errors=no_default, try_cast=no_default, ): # noqa: PR01, RT01, D200 """ diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 7f44cf83336..770dfeeea1c 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -1397,9 +1397,10 @@ def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna): modin_df = modin_df.T pandas_df = pandas_df.T - md_grp, pd_grp = modin_df.groupby( - **groupby_kwargs, dropna=dropna - ), pandas_df.groupby(**groupby_kwargs, dropna=dropna) + md_grp, pd_grp = ( + modin_df.groupby(**groupby_kwargs, dropna=dropna), + pandas_df.groupby(**groupby_kwargs, dropna=dropna), + ) modin_groupby_equals_pandas(md_grp, pd_grp) by_kwarg = groupby_kwargs.get("by", []) diff --git a/setup.cfg b/setup.cfg index 4b68525c140..cef307bd269 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,7 @@ filterwarnings = [flake8] max-line-length = 88 -ignore = E203, E266, E501, W503 +ignore = E203, E266, E501, W503, F821 select = B,C,E,F,W,T4,B9 per-file-ignores = modin/pandas/__init__.py:E402,F401 From 8e44e7c800da52db4867c869c86634a9bd5e3229 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:04:47 +0300 Subject: [PATCH 20/63] Upgrade pandas to 1.4.0 in env files Signed-off-by: Igoshev, Yaroslav --- environment-dev.yml | 2 +- modin/pandas/__init__.py | 2 +- requirements-dev.txt | 2 +- requirements/env_omnisci.yml | 2 +- requirements/requirements-no-engine.yml | 2 +- setup.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 05ffcb23609..4fd460f80f1 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -2,7 +2,7 @@ name: modin channels: - conda-forge dependencies: - - pandas==1.3.5 + - pandas==1.4.0 - numpy>=1.16.5 - pyarrow>=4.0.1 - dask[complete]>=2.22.0 diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index d45498c7369..bca22767a39 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -14,7 +14,7 @@ import pandas import warnings -__pandas_version__ = "1.3.5" +__pandas_version__ = "1.4.0" if pandas.__version__ != __pandas_version__: warnings.warn( diff --git a/requirements-dev.txt b/requirements-dev.txt index d321683183a..50073f6cecf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -pandas==1.3.5 +pandas==1.4.0 numpy>=1.16.5 pyarrow>=4.0.1 dask[complete]>=2.22.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index 428775426b9..366a4cf6d75 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -2,7 +2,7 @@ name: modin_on_omnisci channels: - conda-forge dependencies: - - pandas==1.3.5 + - pandas==1.4.0 - pyarrow==5.0.0 - numpy>=1.16.5 - fsspec diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml index 79a65081e4e..1e3eb5f908d 100644 --- a/requirements/requirements-no-engine.yml +++ b/requirements/requirements-no-engine.yml @@ -1,7 +1,7 @@ channels: - conda-forge dependencies: - - pandas==1.3.5 + - pandas==1.4.0 - numpy>=1.16.5 - pyarrow>=4.0.1 - fsspec diff --git a/setup.py b/setup.py index e42555d77d7..f5c0d03880a 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ url="https://github.com/modin-project/modin", long_description=long_description, long_description_content_type="text/markdown", - install_requires=["pandas==1.3.5", "packaging", "numpy>=1.16.5", "fsspec"], + install_requires=["pandas==1.4.0", "packaging", "numpy>=1.16.5", "fsspec"], extras_require={ # can be installed by pip install modin[dask] "dask": dask_deps, From a25bf13b76cfdba1c291abb4fa297c93e21eeda3 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:25:02 +0300 Subject: [PATCH 21/63] Upgrade min python version in setup.py and ci.yml Signed-off-by: Igoshev, Yaroslav --- .github/workflows/ci.yml | 38 +++++++++++++++++++------------------- setup.py | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 73496b6b3f2..995f7cef5d9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install black - run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py @@ -43,7 +43,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install -r docs/requirements-doc.txt - run: cd docs && sphinx-build -T -E -b html . build @@ -57,7 +57,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install pytest pytest-cov pydocstyle numpydoc==1.1.0 xgboost - run: pytest scripts/test @@ -132,7 +132,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - run: pip install flake8 flake8-print - run: flake8 --enable=T modin/ asv_bench/benchmarks scripts/doc_checker.py @@ -152,7 +152,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an HTTP error. Retry @@ -185,7 +185,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -214,7 +214,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - name: Clean install and run run: | @@ -235,7 +235,7 @@ jobs: fetch-depth: 1 - uses: actions/setup-python@v2 with: - python-version: "3.7.x" + python-version: "3.8.x" architecture: "x64" - name: Clean install and run run: | @@ -258,7 +258,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -294,7 +294,7 @@ jobs: env: MODIN_MEMORY: 1000000000 MODIN_TEST_DATASET_SIZE: "small" - name: Test ${{ matrix.execution }} execution, Python 3.7 + name: Test ${{ matrix.execution }} execution, Python 3.8 steps: - uses: actions/checkout@v2 with: @@ -303,7 +303,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -357,7 +357,7 @@ jobs: MODIN_EXPERIMENTAL: "True" MODIN_ENGINE: "native" MODIN_STORAGE_FORMAT: "omnisci" - name: Test OmniSci storage format, Python 3.7 + name: Test OmniSci storage format, Python 3.8 steps: - uses: actions/checkout@v2 with: @@ -367,7 +367,7 @@ jobs: with: activate-environment: modin_on_omnisci environment-file: requirements/env_omnisci.yml - python-version: 3.7 + python-version: 3.8 use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry # it once if it fails. todo(https://github.com/conda-incubator/setup-miniconda/issues/129): @@ -471,7 +471,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] engine: ["python", "ray", "dask"] env: MODIN_ENGINE: ${{matrix.engine}} @@ -558,7 +558,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -604,7 +604,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! # Miniconda setup sometimes fails because of an http error. retry @@ -642,7 +642,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] engine: ["ray", "dask"] test-task: - modin/pandas/test/dataframe/test_binary.py @@ -704,7 +704,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] env: MODIN_STORAGE_FORMAT: pyarrow MODIN_EXPERIMENTAL: "True" @@ -741,7 +741,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: [ "3.7", "3.8" ] + python-version: ["3.8" ] engine: ["ray", "dask"] env: MODIN_EXPERIMENTAL: "True" diff --git a/setup.py b/setup.py index f5c0d03880a..de681def55c 100644 --- a/setup.py +++ b/setup.py @@ -32,5 +32,5 @@ "sql": sql_deps, "all": all_deps, }, - python_requires=">=3.7.1", + python_requires=">=3.8", ) From 11d7a97f8a731489430167e52f29361b677434b3 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:29:37 +0300 Subject: [PATCH 22/63] Upgrade min numpy version Signed-off-by: Igoshev, Yaroslav --- environment-dev.yml | 2 +- requirements-dev.txt | 2 +- requirements/env_omnisci.yml | 2 +- requirements/requirements-no-engine.yml | 2 +- setup.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/environment-dev.yml b/environment-dev.yml index 4fd460f80f1..283393d2890 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: - pandas==1.4.0 - - numpy>=1.16.5 + - numpy>=1.18.5 - pyarrow>=4.0.1 - dask[complete]>=2.22.0 - distributed>=2.22.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index 50073f6cecf..1e8e356f7d6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ pandas==1.4.0 -numpy>=1.16.5 +numpy>=1.18.5 pyarrow>=4.0.1 dask[complete]>=2.22.0 distributed>=2.22.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index 366a4cf6d75..ec44b999dfb 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -4,7 +4,7 @@ channels: dependencies: - pandas==1.4.0 - pyarrow==5.0.0 - - numpy>=1.16.5 + - numpy>=1.18.5 - fsspec - pip - pytest>=6.0.1 diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml index 1e3eb5f908d..814ce0fd0cd 100644 --- a/requirements/requirements-no-engine.yml +++ b/requirements/requirements-no-engine.yml @@ -2,7 +2,7 @@ channels: - conda-forge dependencies: - pandas==1.4.0 - - numpy>=1.16.5 + - numpy>=1.18.5 - pyarrow>=4.0.1 - fsspec - xarray diff --git a/setup.py b/setup.py index de681def55c..a1935df70a3 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ url="https://github.com/modin-project/modin", long_description=long_description, long_description_content_type="text/markdown", - install_requires=["pandas==1.4.0", "packaging", "numpy>=1.16.5", "fsspec"], + install_requires=["pandas==1.4.0", "packaging", "numpy>=1.18.5", "fsspec"], extras_require={ # can be installed by pip install modin[dask] "dask": dask_deps, From 2f3bdf339b3920cf855f13b7b8a34e0da39e5e16 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 11:49:22 +0300 Subject: [PATCH 23/63] Remove FilePathOrBuffer import Signed-off-by: Igoshev, Yaroslav --- .../native/implementations/omnisci_on_native/io/io.py | 3 +-- modin/experimental/pandas/io.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py index b5d03b32b50..d3eb167a6fb 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py @@ -37,7 +37,6 @@ import pandas import pandas._libs.lib as lib -from pandas._typing import FilePathOrBuffer from pandas.io.common import is_url ReadCsvKwargsType = Dict[ @@ -51,7 +50,7 @@ Sequence, Callable, Dialect, - FilePathOrBuffer, + None, None, ], ] diff --git a/modin/experimental/pandas/io.py b/modin/experimental/pandas/io.py index 74614b08624..68333577a06 100644 --- a/modin/experimental/pandas/io.py +++ b/modin/experimental/pandas/io.py @@ -20,7 +20,7 @@ import pandas import pandas._libs.lib as lib -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from . import DataFrame from modin.config import IsExperimental, Engine @@ -245,7 +245,7 @@ def _read(**kwargs) -> DataFrame: def read_pickle_distributed( - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: Optional[str] = "infer", storage_options: StorageOptions = None, ): @@ -289,7 +289,7 @@ def read_pickle_distributed( def to_pickle_distributed( self, - filepath_or_buffer: FilePathOrBuffer, + filepath_or_buffer, compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, From daf2d9c2acb790064772d67653dd4396b664bfc4 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 24 Jan 2022 15:14:06 +0300 Subject: [PATCH 24/63] Handle axis more carefully Signed-off-by: Igoshev, Yaroslav --- modin/pandas/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 67d83abfa8a..a845c73c8eb 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -528,6 +528,9 @@ def _get_axis_number(cls, axis): int 0 or 1 - axis index in the array of axes stored in the dataframe. """ + if axis is no_default: + axis = None + return cls._pandas_class._get_axis_number(axis) if axis is not None else 0 def __constructor__(self, *args, **kwargs): From 0b9d6173a1e53c24a4d5d82a3243499d259c9d28 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Mon, 24 Jan 2022 16:37:31 +0300 Subject: [PATCH 25/63] Fix `test_resample_getitem`. Details are in https://github.com/pandas-dev/pandas/pull/44944 --- modin/pandas/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index a845c73c8eb..6c8fb86d2fe 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3221,7 +3221,7 @@ def _get_new_resampler(key): if isinstance( key, (list, tuple, Series, pandas.Series, pandas.Index, np.ndarray) ): - if len(self._dataframe.columns.intersection(key)) != len(key): + if len(self._dataframe.columns.intersection(key)) != len(set(key)): missed_keys = list(set(key).difference(self._dataframe.columns)) raise KeyError(f"Columns not found: {str(sorted(missed_keys))[1:-1]}") return _get_new_resampler(list(key)) From 63ff1b81a9c609483627d8aaede37d372cabb62c Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 24 Jan 2022 12:17:59 -0600 Subject: [PATCH 26/63] Fix kurtosis exception type Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 6c8fb86d2fe..2d93b0270ae 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1548,8 +1548,7 @@ def kurt( **kwargs, ): axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: func_kwargs = { "skipna": skipna, From 35d45110f2579ff92348ac928fb265b71603b530 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 10:24:43 -0800 Subject: [PATCH 27/63] Fix test_append by removing stale sort workaround Signed-off-by: Rehan Durrani --- modin/pandas/test/dataframe/test_map_metadata.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py index 692173c1c4f..53358805d3e 100644 --- a/modin/pandas/test/dataframe/test_map_metadata.py +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -410,15 +410,6 @@ def test_append(data): modin_df.append(list(modin_df.iloc[-1])) else: modin_result = modin_df.append(list(modin_df.iloc[-1])) - # Pandas has bug where sort=False is ignored - # (https://github.com/pandas-dev/pandas/issues/35092), but Modin - # now does the right thing, so for now manually sort to workaround - # this. Once the Pandas bug is fixed and Modin upgrades to that - # Pandas release, this sort will cause the test to fail, and the - # next three lines should be deleted. - if get_current_execution() != "BaseOnPython": - assert list(modin_result.columns) == list(modin_df.columns) + [0] - modin_result = modin_result[[0] + sorted(modin_df.columns)] df_equals(modin_result, pandas_result) verify_integrity_values = [True, False] From 40be69fa031e35132836bbf058d4d8e317617b48 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 10:49:23 -0800 Subject: [PATCH 28/63] Update series.py __repr__ to use display.max_{rows|cols} instead of max_{rows|cols} in call to pandas.get_option Signed-off-by: Rehan Durrani --- modin/pandas/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 49450900946..b0431d7d4d1 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -453,8 +453,8 @@ def __repr__(self): ------- str """ - num_rows = pandas.get_option("max_rows") or 60 - num_cols = pandas.get_option("max_columns") or 20 + num_rows = pandas.get_option("display.max_rows") or 60 + num_cols = pandas.get_option("display.max_columns") or 20 temp_df = self._build_repr_df(num_rows, num_cols) if isinstance(temp_df, pandas.DataFrame) and not temp_df.empty: temp_df = temp_df.iloc[:, 0] From 19a71e67e8f91f7745b88388f02a1b5e84ac1899 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 24 Jan 2022 12:28:51 -0600 Subject: [PATCH 29/63] Fix more tests Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 3 +-- modin/pandas/dataframe.py | 3 +-- modin/pandas/series.py | 3 +-- modin/pandas/test/test_series.py | 4 ++-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 2d93b0270ae..727db9c57bc 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1707,8 +1707,7 @@ def _stat_operation( `DataFrame` - self is DataFrame and level is specified. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: return self._default_to_pandas( op_name, diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index df4f3d4c3d9..add1bccf67b 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -2020,8 +2020,7 @@ def sum( Return the sum of the values over the requested axis. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) axis_to_apply = self.columns if axis else self.index if ( skipna is not False diff --git a/modin/pandas/series.py b/modin/pandas/series.py index b0431d7d4d1..d8cc9dc14dd 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1824,8 +1824,7 @@ def sum( Return the sum of the values. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if numeric_only is True: raise NotImplementedError("Series.sum does not implement numeric_only") if level is not None: diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 65b5f09b888..319efa5552a 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -3512,8 +3512,8 @@ def test_var(data, skipna, ddof): try: pandas_result = pandas_series.var(skipna=skipna, ddof=ddof) - except Exception: - with pytest.raises(TypeError): + except Exception as e: + with pytest.raises(type(e)): modin_series.var(skipna=skipna, ddof=ddof) else: modin_result = modin_series.var(skipna=skipna, ddof=ddof) From 09eda827bdb6485bd752ff70055ceb60a967903a Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 24 Jan 2022 12:33:38 -0600 Subject: [PATCH 30/63] Fix more skipna changes Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 9 +++------ modin/pandas/dataframe.py | 3 +-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 727db9c57bc..fe05efd17ba 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1601,8 +1601,7 @@ def loc(self): def mad(self, axis=None, skipna=True, level=None): axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: if ( not self._query_compiler.has_multiindex(axis=axis) @@ -1646,8 +1645,7 @@ def max( numeric_only=None, **kwargs, ): - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: return self._default_to_pandas( "max", @@ -1779,8 +1777,7 @@ def min( numeric_only=None, **kwargs, ): - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: return self._default_to_pandas( "min", diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index add1bccf67b..bd4dcafefd5 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1625,8 +1625,7 @@ def prod( Return the product of the values over the requested axis. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: if ( not self._query_compiler.has_multiindex(axis=axis) From c892e9b331ba4cb4907efcd3419e057bbd93a082 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 11:13:16 -0800 Subject: [PATCH 31/63] Update simple_row_groupby to specify categorical data is ordered Signed-off-by: Rehan Durrani --- modin/pandas/test/test_groupby.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 770dfeeea1c..b9de5cef840 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -316,6 +316,9 @@ def test_simple_row_groupby(by, as_index, col1_category): if col1_category: pandas_df = pandas_df.astype({"col1": "category"}) + # As of pandas 1.4.0 operators like min cause TypeErrors to be raised on unordered + # categorical columns. We need to specify the categorical column as ordered to bypass this. + pandas_df["col1"] = pandas_df["col1"].cat.as_ordered() modin_df = from_pandas(pandas_df) n = 1 From 2dbc29510608e65ca07edf99ddd81fbf201040d0 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:28:24 -0800 Subject: [PATCH 32/63] Add codepath to check that Modin raises ValueError when passing None to skipna Signed-off-by: Rehan Durrani --- modin/pandas/test/test_series.py | 55 ++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 319efa5552a..ed3e63214eb 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -770,7 +770,11 @@ def test_align(data): ) def test_all(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.all(skipna=skipna) + else: + df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -779,7 +783,11 @@ def test_all(data, skipna): ) def test_any(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.any(skipna=skipna) + else: + df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2136,10 +2144,14 @@ def test_lt(data): @pytest.mark.parametrize("level", [0, -1, None]) def test_mad(level, data, axis, skipna): modin_series, pandas_series = create_test_series(data) - df_equals( - modin_series.mad(axis=axis, skipna=skipna, level=level), - pandas_series.mad(axis=axis, skipna=skipna, level=level), - ) + if skipna is None: + with pytest.raises(ValueError): + modin_series.mad(skipna=skipna) + else: + df_equals( + modin_series.mad(axis=axis, skipna=skipna, level=level), + pandas_series.mad(axis=axis, skipna=skipna, level=level), + ) @pytest.mark.parametrize("na_values", ["ignore", None], ids=["na_ignore", "na_none"]) @@ -2183,7 +2195,11 @@ def test_mask(): ) def test_max(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.max(skipna=skipna) + else: + df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2201,7 +2217,11 @@ def test_mean(data, skipna): ) def test_median(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.median(skipna=skipna) + else: + df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) @pytest.mark.parametrize( @@ -2407,10 +2427,15 @@ def test_product_alias(): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_prod(axis, skipna): - eval_general( - *create_test_series(test_data["float_nan_data"]), - lambda s: s.prod(axis=axis, skipna=skipna), - ) + if skipna is None: + with pytest.raises(ValueError): + modin_series, _ = create_test_series(test_data["float_nan_data"]) + modin_series.prod(skipna=skipna) + else: + eval_general( + *create_test_series(test_data["float_nan_data"]), + lambda s: s.prod(axis=axis, skipna=skipna), + ) @pytest.mark.parametrize( @@ -2949,7 +2974,11 @@ def test_size(data): ) def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.skew(skipna=skipna) + else: + df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From eb282a6b3394013e94ccbbf6aad4820a0a7184c1 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:30:03 -0800 Subject: [PATCH 33/63] Add comment to explain new codepath Signed-off-by: Rehan Durrani --- modin/pandas/test/test_series.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index ed3e63214eb..43d1f9b4672 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -770,6 +770,9 @@ def test_align(data): ) def test_all(data, skipna): modin_series, pandas_series = create_test_series(data) + # We currently do not accept None for skipna, while pandas warns users that it will + # exception eventually, but still accepts it currently. We need this codepath to catch + # the exception Modin raises until pandas officially deprecates skipna=None. if skipna is None: with pytest.raises(ValueError): modin_series.all(skipna=skipna) From 6de34e7d6cf556e6fbabab65bc975b492f13151f Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:35:34 -0800 Subject: [PATCH 34/63] Add codepath to check that Modin raises ValueError when passing None to skipna for dataframe tests Signed-off-by: Rehan Durrani --- modin/pandas/test/dataframe/test_default.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index c80c8c002b2..a2adf4b7b6f 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -11,6 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +from multiprocessing.sharedctypes import Value import pytest import numpy as np import pandas @@ -438,10 +439,17 @@ def test_last(): @pytest.mark.parametrize("skipna", [None, True, False]) def test_mad(data, axis, skipna): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - df_equals( - modin_df.mad(axis=axis, skipna=skipna, level=None), - pandas_df.mad(axis=axis, skipna=skipna, level=None), - ) + # We currently do not accept None for skipna, while pandas warns users that it will + # exception eventually, but still accepts it currently. We need this codepath to catch + # the exception Modin raises until pandas officially deprecates skipna=None. + if skipna is None: + with pytest.raises(ValueError): + modin_df.mad(axis=axis, skipna=skipna, level=None) + else: + df_equals( + modin_df.mad(axis=axis, skipna=skipna, level=None), + pandas_df.mad(axis=axis, skipna=skipna, level=None), + ) @pytest.mark.parametrize("level", [-1, 0, 1]) From f98f0e7c23bda947955f65d02b3d67b15714e3b8 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Mon, 24 Jan 2022 16:37:34 -0800 Subject: [PATCH 35/63] Add codepath to check that Modin raises ValueError when passing None to skipna for series tests Signed-off-by: Rehan Durrani --- modin/pandas/test/test_series.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 43d1f9b4672..da7d7e80dfd 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2211,7 +2211,11 @@ def test_max(data, skipna): ) def test_mean(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.mean(skipna=skipna) + else: + df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2257,7 +2261,11 @@ def test_memory_usage(data, index): ) def test_min(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) + if skipna is None: + with pytest.raises(ValueError): + modin_series.min(skipna=skipna) + else: + df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From fe11d810ba2ec95c6194ccb85d4e1914cea8807b Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Tue, 25 Jan 2022 10:13:56 +0300 Subject: [PATCH 36/63] Fix linting Signed-off-by: Igoshev, Yaroslav --- modin/pandas/__init__.py | 1 - modin/pandas/test/dataframe/test_default.py | 1 - modin/pandas/test/test_series.py | 4 +++- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index e778dcd7334..8396462f96f 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -77,7 +77,6 @@ Grouper, array, Period, - show_versions, DateOffset, timedelta_range, infer_freq, diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index a2adf4b7b6f..e22c2c0e295 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -11,7 +11,6 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -from multiprocessing.sharedctypes import Value import pytest import numpy as np import pandas diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index da7d7e80dfd..5b470e2c4ee 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2228,7 +2228,9 @@ def test_median(data, skipna): with pytest.raises(ValueError): modin_series.median(skipna=skipna) else: - df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) + df_equals( + modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna) + ) @pytest.mark.parametrize( From 2f4a1a1503d73a29497b6ab971f383ec3dfe9900 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Tue, 25 Jan 2022 14:48:57 +0300 Subject: [PATCH 37/63] Fix tests for merge Signed-off-by: Igoshev, Yaroslav --- modin/pandas/series.py | 6 ++++- modin/pandas/test/dataframe/test_join_sort.py | 22 ++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index d8cc9dc14dd..6660a83a953 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1894,9 +1894,13 @@ def to_frame( """ from .dataframe import DataFrame + if name is None: + name = no_default + self_cp = self.copy() - if name is not None: + if name is not no_default: self_cp.name = name + return DataFrame(self_cp) def to_list(self): # noqa: RT01, D200 diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index 8d949e26429..75196ceafdf 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -305,13 +305,23 @@ def test_merge(test_data, test_data2): ) df_equals(modin_result, pandas_result) - # Named Series promoted to DF - s = pd.Series(frame_data2.get("col1")) - with pytest.raises(ValueError): - modin_df.merge(s) + # Cannot merge a Series without a name + ps = pandas.Series(frame_data2.get("col1")) + ms = pd.Series(frame_data2.get("col1")) + eval_general( + modin_df, + pandas_df, + lambda df: df.merge(ms) if isinstance(df, pd.DataFrame) else df.merge(ps), + ) - s = pd.Series(frame_data2.get("col1"), name="col1") - df_equals(modin_df.merge(s), modin_df.merge(modin_df2[["col1"]])) + # merge a Series with a name + ps = pandas.Series(frame_data2.get("col1"), name="col1") + ms = pd.Series(frame_data2.get("col1"), name="col1") + eval_general( + modin_df, + pandas_df, + lambda df: df.merge(ms) if isinstance(df, pd.DataFrame) else df.merge(ps), + ) with pytest.raises(TypeError): modin_df.merge("Non-valid type") From 70ba389c66e514f6ad88557424932ba344579e0b Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Tue, 25 Jan 2022 15:24:17 +0300 Subject: [PATCH 38/63] Fix reset_index Signed-off-by: Igoshev, Yaroslav --- modin/pandas/series.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 6660a83a953..935d57f07b1 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1592,11 +1592,19 @@ def reset_index( """ Generate a new Series with the index reset. """ + if name is no_default: + # For backwards compatibility, keep columns as [0] instead of + # [None] when self.name is None + if self.name is None: + name = 0 + else: + name = self.name + if drop and level is None: new_idx = pandas.RangeIndex(len(self.index)) if inplace: self.index = new_idx - self.name = name or self.name + self.name = name else: result = self.copy() result.index = new_idx @@ -1607,8 +1615,7 @@ def reset_index( ) else: obj = self.copy() - if name is not None: - obj.name = name + obj.name = name from .dataframe import DataFrame return DataFrame(obj).reset_index(level=level, drop=drop, inplace=inplace) From 57fdeef5655c924f5a91acc89266c0402b6f8f5a Mon Sep 17 00:00:00 2001 From: ienkovich Date: Tue, 25 Jan 2022 06:22:23 -0600 Subject: [PATCH 39/63] Adjust number of warnings for OmniSci tests. Signed-off-by: ienkovich --- .../implementations/omnisci_on_native/test/test_dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py index f58ffc7f23b..6e3c003b6f7 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py @@ -1187,11 +1187,12 @@ def applier(df, **kwargs): # At the end of reduce function it does inevitable `transpose`, which # is defaulting to pandas. The following logic check that `transpose` is the only # function that falling back to pandas in the reduce operation flow. + # Another warning comes from deprecated pandas.Int64Index usage. with pytest.warns(UserWarning) as warns: res = getattr(df, method)() assert ( - len(warns) == 1 - ), f"More than one warning were arisen: len(warns) != 1 ({len(warns)} != 1)" + len(warns) == 2 + ), f"More than two warnings were arisen: len(warns) != 2 ({len(warns)} != 2)" message = warns[0].message.args[0] assert ( re.match(r".*transpose.*defaulting to pandas", message) is not None From 4df06063088728e4afa0487cbf6beac7f5325dbb Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Tue, 25 Jan 2022 16:19:26 +0300 Subject: [PATCH 40/63] Fix reset_index Signed-off-by: Igoshev, Yaroslav --- modin/pandas/series.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 935d57f07b1..91152f7837e 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1604,7 +1604,6 @@ def reset_index( new_idx = pandas.RangeIndex(len(self.index)) if inplace: self.index = new_idx - self.name = name else: result = self.copy() result.index = new_idx From 47000f798c19b64c8c46d95ff6b78c701179a52d Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Tue, 25 Jan 2022 16:46:24 +0300 Subject: [PATCH 41/63] Fix series.asof, series.reindex Signed-off-by: Alexey Prutskov --- modin/pandas/series.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 91152f7837e..579fd80e7d0 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1523,6 +1523,14 @@ def reindex(self, *args, **kwargs): # noqa: PR01, RT01, D200 """ Conform Series to new index with optional filling logic. """ + if args: + if len(args) > 1: + raise TypeError("Only one positional argument ('index') is allowed") + if "index" in kwargs: + raise TypeError( + "'index' passed as both positional and keyword argument" + ) + kwargs.update({"index": args[0]}) index = kwargs.pop("index", None) method = kwargs.pop("method", None) level = kwargs.pop("level", None) From 4e49e47d77af4313ffb9f9ca32377c6684969de1 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 10:26:06 -0800 Subject: [PATCH 42/63] Revert "Add codepath to check that Modin raises ValueError when passing None to skipna for series tests" This reverts commit f98f0e7c23bda947955f65d02b3d67b15714e3b8. --- modin/pandas/test/test_series.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 5b470e2c4ee..f9f1cda1cee 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2211,11 +2211,7 @@ def test_max(data, skipna): ) def test_mean(data, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.mean(skipna=skipna) - else: - df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) + df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2263,11 +2259,7 @@ def test_memory_usage(data, index): ) def test_min(data, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.min(skipna=skipna) - else: - df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) + df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From 4dc5b49d428459b15fd89d4af47dbf99aa3e6acf Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 10:27:20 -0800 Subject: [PATCH 43/63] Revert "Add codepath to check that Modin raises ValueError when passing None to skipna for dataframe tests" This reverts commit 6de34e7d6cf556e6fbabab65bc975b492f13151f. --- modin/pandas/test/dataframe/test_default.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index e22c2c0e295..c80c8c002b2 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -438,17 +438,10 @@ def test_last(): @pytest.mark.parametrize("skipna", [None, True, False]) def test_mad(data, axis, skipna): modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) - # We currently do not accept None for skipna, while pandas warns users that it will - # exception eventually, but still accepts it currently. We need this codepath to catch - # the exception Modin raises until pandas officially deprecates skipna=None. - if skipna is None: - with pytest.raises(ValueError): - modin_df.mad(axis=axis, skipna=skipna, level=None) - else: - df_equals( - modin_df.mad(axis=axis, skipna=skipna, level=None), - pandas_df.mad(axis=axis, skipna=skipna, level=None), - ) + df_equals( + modin_df.mad(axis=axis, skipna=skipna, level=None), + pandas_df.mad(axis=axis, skipna=skipna, level=None), + ) @pytest.mark.parametrize("level", [-1, 0, 1]) From ed265feb676c127b5b16cc211a366592d23d9b9f Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 10:27:50 -0800 Subject: [PATCH 44/63] Revert "Add comment to explain new codepath" This reverts commit eb282a6b3394013e94ccbbf6aad4820a0a7184c1. --- modin/pandas/test/test_series.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index f9f1cda1cee..94800c51de3 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -770,9 +770,6 @@ def test_align(data): ) def test_all(data, skipna): modin_series, pandas_series = create_test_series(data) - # We currently do not accept None for skipna, while pandas warns users that it will - # exception eventually, but still accepts it currently. We need this codepath to catch - # the exception Modin raises until pandas officially deprecates skipna=None. if skipna is None: with pytest.raises(ValueError): modin_series.all(skipna=skipna) From 1a908c71cbdf2d11956d673fdcdad37c7d668da1 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 10:28:47 -0800 Subject: [PATCH 45/63] Revert "Add codepath to check that Modin raises ValueError when passing None to skipna" This reverts commit 2dbc29510608e65ca07edf99ddd81fbf201040d0. Signed-off-by: Rehan Durrani --- modin/pandas/test/test_series.py | 57 ++++++++------------------------ 1 file changed, 13 insertions(+), 44 deletions(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 94800c51de3..319efa5552a 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -770,11 +770,7 @@ def test_align(data): ) def test_all(data, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.all(skipna=skipna) - else: - df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) + df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -783,11 +779,7 @@ def test_all(data, skipna): ) def test_any(data, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.any(skipna=skipna) - else: - df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) + df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2144,14 +2136,10 @@ def test_lt(data): @pytest.mark.parametrize("level", [0, -1, None]) def test_mad(level, data, axis, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.mad(skipna=skipna) - else: - df_equals( - modin_series.mad(axis=axis, skipna=skipna, level=level), - pandas_series.mad(axis=axis, skipna=skipna, level=level), - ) + df_equals( + modin_series.mad(axis=axis, skipna=skipna, level=level), + pandas_series.mad(axis=axis, skipna=skipna, level=level), + ) @pytest.mark.parametrize("na_values", ["ignore", None], ids=["na_ignore", "na_none"]) @@ -2195,11 +2183,7 @@ def test_mask(): ) def test_max(data, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.max(skipna=skipna) - else: - df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) + df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2217,13 +2201,7 @@ def test_mean(data, skipna): ) def test_median(data, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.median(skipna=skipna) - else: - df_equals( - modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna) - ) + df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) @pytest.mark.parametrize( @@ -2429,15 +2407,10 @@ def test_product_alias(): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_prod(axis, skipna): - if skipna is None: - with pytest.raises(ValueError): - modin_series, _ = create_test_series(test_data["float_nan_data"]) - modin_series.prod(skipna=skipna) - else: - eval_general( - *create_test_series(test_data["float_nan_data"]), - lambda s: s.prod(axis=axis, skipna=skipna), - ) + eval_general( + *create_test_series(test_data["float_nan_data"]), + lambda s: s.prod(axis=axis, skipna=skipna), + ) @pytest.mark.parametrize( @@ -2976,11 +2949,7 @@ def test_size(data): ) def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) - if skipna is None: - with pytest.raises(ValueError): - modin_series.skew(skipna=skipna) - else: - df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From ccad5c15fcb2bfe00050f6cf7ccb4e1898e95731 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 25 Jan 2022 13:38:43 -0600 Subject: [PATCH 46/63] Fix test cases where pandas throws errors Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 4 +- modin/pandas/series.py | 3 +- modin/pandas/test/test_series.py | 75 +++++++++++++++++++++++++++----- 3 files changed, 68 insertions(+), 14 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index fe05efd17ba..549b4addf02 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -708,6 +708,7 @@ def align( ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if axis is not None: axis = self._get_axis_number(axis) if bool_only and axis == 0: @@ -765,6 +766,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): return result def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if axis is not None: axis = self._get_axis_number(axis) if bool_only and axis == 0: @@ -1601,7 +1603,7 @@ def loc(self): def mad(self, axis=None, skipna=True, level=None): axis = self._get_axis_number(axis) - validate_bool_kwarg(skipna, "skipna", none_allowed=False) + validate_bool_kwarg(skipna, "skipna", none_allowed=True) if level is not None: if ( not self._query_compiler.has_multiindex(axis=axis) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 579fd80e7d0..2d4a9c087df 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1462,8 +1462,7 @@ def prod( Return the product of the values over the requested `axis`. """ axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if level is not None: if ( not self._query_compiler.has_multiindex(axis=axis) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 319efa5552a..455c6d7e93e 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -770,7 +770,14 @@ def test_align(data): ) def test_all(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) + try: + pandas_result = pandas_series.all(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.all(skipna=skipna) + else: + modin_result = modin_series.all(skipna=skipna) + df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -779,7 +786,14 @@ def test_all(data, skipna): ) def test_any(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) + try: + pandas_result = pandas_series.any(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.any(skipna=skipna) + else: + modin_result = modin_series.any(skipna=skipna) + df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2136,10 +2150,14 @@ def test_lt(data): @pytest.mark.parametrize("level", [0, -1, None]) def test_mad(level, data, axis, skipna): modin_series, pandas_series = create_test_series(data) - df_equals( - modin_series.mad(axis=axis, skipna=skipna, level=level), - pandas_series.mad(axis=axis, skipna=skipna, level=level), - ) + try: + pandas_result = pandas_series.mad(axis=axis, skipna=skipna, level=level) + except Exception as e: + with pytest.raises(type(e)): + modin_series.mad(axis=axis, skipna=skipna, level=level) + else: + modin_result = modin_series.mad(axis=axis, skipna=skipna, level=level) + df_equals(modin_result, pandas_result) @pytest.mark.parametrize("na_values", ["ignore", None], ids=["na_ignore", "na_none"]) @@ -2183,7 +2201,14 @@ def test_mask(): ) def test_max(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) + try: + pandas_result = pandas_series.max(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.max(skipna=skipna) + else: + modin_result = modin_series.max(skipna=skipna) + df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2192,7 +2217,14 @@ def test_max(data, skipna): ) def test_mean(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) + try: + pandas_result = pandas_series.mean(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.mean(skipna=skipna) + else: + modin_result = modin_series.mean(skipna=skipna) + df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2201,7 +2233,14 @@ def test_mean(data, skipna): ) def test_median(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) + try: + pandas_result = pandas_series.median(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.median(skipna=skipna) + else: + modin_result = modin_series.median(skipna=skipna) + df_equals(modin_result, pandas_result) @pytest.mark.parametrize( @@ -2234,7 +2273,14 @@ def test_memory_usage(data, index): ) def test_min(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) + try: + pandas_result = pandas_series.min(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.min(skipna=skipna) + else: + modin_result = modin_series.min(skipna=skipna) + df_equals(modin_result, pandas_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2949,7 +2995,14 @@ def test_size(data): ) def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + try: + pandas_result = pandas_series.skew(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.skew(skipna=skipna) + else: + modin_result = modin_series.skew(skipna=skipna) + df_equals(pandas_result, modin_result) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From e578af83538ccac11d7a305822161f8ddea95d3f Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 25 Jan 2022 13:42:45 -0600 Subject: [PATCH 47/63] Lint Signed-off-by: Devin Petersohn --- modin/pandas/test/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 455c6d7e93e..113a6fda203 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -2996,7 +2996,7 @@ def test_size(data): def test_skew(data, skipna): modin_series, pandas_series = create_test_series(data) try: - pandas_result = pandas_series.skew(skipna=skipna) + pandas_result = pandas_series.skew(skipna=skipna) except Exception as e: with pytest.raises(type(e)): modin_series.skew(skipna=skipna) From d9776879042aed9432640d6801085db7ae3bca8f Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 25 Jan 2022 13:59:10 -0600 Subject: [PATCH 48/63] Fix warnings Signed-off-by: Devin Petersohn --- modin/core/io/text/text_file_dispatcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index 97db296fced..cf695792cdf 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -35,7 +35,7 @@ from modin.core.io.text.utils import CustomNewlineIterator from modin.config import NPartitions -ColumnNamesTypes = Tuple[Union[pandas.Index, pandas.MultiIndex, pandas.Int64Index]] +ColumnNamesTypes = Tuple[Union[pandas.Index, pandas.MultiIndex]] IndexColType = Union[int, str, bool, Sequence[int], Sequence[str], None] From 8565d7419629d5e979963a276073b2876cfb0f02 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 25 Jan 2022 14:26:53 -0600 Subject: [PATCH 49/63] Fix import and validation Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 549b4addf02..e5f1fa1a772 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -30,7 +30,7 @@ import pandas.core.resample import pandas.core.generic from pandas.core.indexing import convert_to_index_sliceable -from pandas.util._validators import validate_bool_kwarg, validate_percentile +from pandas.util._validators import validate_bool_kwarg, validate_percentile, validate_ascending from pandas._libs.lib import no_default from pandas._typing import ( CompressionOptions, @@ -2481,6 +2481,7 @@ def sort_values( ): axis = self._get_axis_number(axis) inplace = validate_bool_kwarg(inplace, "inplace") + ascending = validate_ascending(ascending) if axis == 0: result = self._query_compiler.sort_rows_by_column_values( by, From eb0cd2abf789323365363502bd7cb414e0b57dad Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 13:09:11 -0800 Subject: [PATCH 50/63] Fix test_join_sort::test_sort_values by skipping ascending = None Signed-off-by: Rehan Durrani --- modin/pandas/test/dataframe/test_join_sort.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index 75196ceafdf..f60dd1825fb 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -459,6 +459,8 @@ def test_sort_multiindex(sort_remaining): def test_sort_values( data, by, axis, ascending, inplace, kind, na_position, ignore_index, key ): + if ascending is None: + pytest.skip("None is not a valid value for ascending.") if (axis == 1 or axis == "columns") and ignore_index: pytest.skip("Pandas bug #39426 which is fixed in Pandas 1.3") From af1565a8285ac42a271d0bdcb43bf23be3cb2ecb Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 13:11:11 -0800 Subject: [PATCH 51/63] lint Signed-off-by: Rehan Durrani --- modin/pandas/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index e5f1fa1a772..dbcdd7b01e8 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -30,7 +30,11 @@ import pandas.core.resample import pandas.core.generic from pandas.core.indexing import convert_to_index_sliceable -from pandas.util._validators import validate_bool_kwarg, validate_percentile, validate_ascending +from pandas.util._validators import ( + validate_bool_kwarg, + validate_percentile, + validate_ascending, +) from pandas._libs.lib import no_default from pandas._typing import ( CompressionOptions, From d0a6d9dab41c433d5ea429635c1e3757371c6e7e Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 25 Jan 2022 16:36:55 -0600 Subject: [PATCH 52/63] Fix read_fwf issue Signed-off-by: Devin Petersohn --- modin/pandas/test/test_io.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index e2190c353fc..a94ff9e1cfe 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -1944,19 +1944,7 @@ def test_fwf_file_chunksize(self, make_fwf_file): df_equals(modin_df, pd_df) - @pytest.mark.parametrize( - "nrows", - [ - pytest.param( - 13, - marks=pytest.mark.xfail( - Engine.get() == "Ray", - reason="read_fwf bug on pandas side: pandas-dev/pandas#44021", - ), - ), - None, - ], - ) + @pytest.mark.parametrize("nrows", [13, None]) def test_fwf_file_skiprows(self, make_fwf_file, nrows): unique_filename = make_fwf_file() From 51f06165b5252cefffa617034610538f1cee76b7 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 16:03:50 -0800 Subject: [PATCH 53/63] Update insert to throw IndexError if negative index is out of bounds, otherwise throw ValueError (same as before) Signed-off-by: Rehan Durrani --- modin/pandas/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index bd4dcafefd5..730275c47b7 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1163,6 +1163,8 @@ def insert(self, loc, column, value, allow_duplicates=False): # noqa: PR01, D20 ) ) if loc < 0: + if loc < -len(self.columns): + raise IndexError("index {0} is out of bounds for axis 0 with size {1}".format(loc, len(self.index))) raise ValueError("unbounded slice") if isinstance(value, Series): value = value._query_compiler From 72cfadf805a9e68f9e1423a1705786b8996c96c8 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 16:06:34 -0800 Subject: [PATCH 54/63] Lint Signed-off-by: Rehan Durrani --- modin/pandas/dataframe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 730275c47b7..4056a5cc4d9 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1164,7 +1164,11 @@ def insert(self, loc, column, value, allow_duplicates=False): # noqa: PR01, D20 ) if loc < 0: if loc < -len(self.columns): - raise IndexError("index {0} is out of bounds for axis 0 with size {1}".format(loc, len(self.index))) + raise IndexError( + "index {0} is out of bounds for axis 0 with size {1}".format( + loc, len(self.index) + ) + ) raise ValueError("unbounded slice") if isinstance(value, Series): value = value._query_compiler From c2b5dfe87102379df37e48fd769e49ae003f082c Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 16:42:01 -0800 Subject: [PATCH 55/63] Convert error strings to f-strings Signed-off-by: Rehan Durrani --- modin/pandas/dataframe.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 4056a5cc4d9..9f7e37d7ca6 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1155,19 +1155,15 @@ def insert(self, loc, column, value, allow_duplicates=False): # noqa: PR01, D20 ): raise ValueError("Length of values does not match length of index") if not allow_duplicates and column in self.columns: - raise ValueError("cannot insert {0}, already exists".format(column)) + raise ValueError(f"cannot insert {column}, already exists") if loc > len(self.columns): raise IndexError( - "index {0} is out of bounds for axis 0 with size {1}".format( - loc, len(self.columns) - ) + f"index {loc} is out of bounds for axis 0 with size {len(self.columns)}" ) if loc < 0: if loc < -len(self.columns): raise IndexError( - "index {0} is out of bounds for axis 0 with size {1}".format( - loc, len(self.index) - ) + f"index {loc} is out of bounds for axis 0 with size {len(self.columns)}" ) raise ValueError("unbounded slice") if isinstance(value, Series): From 061b3a7a4e40de765095c517920639d58ffe7d44 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 19:16:39 -0800 Subject: [PATCH 56/63] Resolve fileno error by setting memory_map to False when using BytesIO with read_csv Signed-off-by: Rehan Durrani --- modin/core/storage_formats/pandas/parsers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index 3ecf048c175..7793cc53f24 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -40,7 +40,7 @@ """ from collections import OrderedDict -from io import BytesIO, TextIOWrapper +from io import BytesIO, StringIO, TextIOWrapper import numpy as np import pandas from pandas.core.dtypes.cast import find_common_type @@ -54,6 +54,8 @@ from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.error_message import ErrorMessage +from tempfile import TemporaryFile + _doc_pandas_parser_class = """ Class for handling {data_type} on the workers using pandas storage format. @@ -189,7 +191,9 @@ def generic_parse(fname, **kwargs): bio.seek(start) to_read = header + bio.read(end - start) + memory_map = kwargs.pop("memory_map") pandas_df = callback(BytesIO(to_read), **kwargs) + kwargs["memory_map"] = memory_map index = ( pandas_df.index if not isinstance(pandas_df.index, pandas.RangeIndex) From f3a6f530f00aeb20bc1a33a6071e9faa8058b9e3 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 19:20:18 -0800 Subject: [PATCH 57/63] Remove unused imports Signed-off-by: Rehan Durrani --- modin/core/storage_formats/pandas/parsers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index 7793cc53f24..e033d61aec9 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -40,7 +40,7 @@ """ from collections import OrderedDict -from io import BytesIO, StringIO, TextIOWrapper +from io import BytesIO, TextIOWrapper import numpy as np import pandas from pandas.core.dtypes.cast import find_common_type @@ -54,8 +54,6 @@ from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas from modin.error_message import ErrorMessage -from tempfile import TemporaryFile - _doc_pandas_parser_class = """ Class for handling {data_type} on the workers using pandas storage format. From b7b9d96abe77e5a5b1bb62e84499d6b32fbee77a Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 20:07:37 -0800 Subject: [PATCH 58/63] Remove keyerror for memory_map Signed-off-by: Rehan Durrani --- modin/core/storage_formats/pandas/parsers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index e033d61aec9..3bef0347309 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -189,9 +189,10 @@ def generic_parse(fname, **kwargs): bio.seek(start) to_read = header + bio.read(end - start) - memory_map = kwargs.pop("memory_map") + if 'memory_map' in kwargs: + kwargs = kwargs.copy() + del kwargs['memory_map'] pandas_df = callback(BytesIO(to_read), **kwargs) - kwargs["memory_map"] = memory_map index = ( pandas_df.index if not isinstance(pandas_df.index, pandas.RangeIndex) From 5ff2baaedc8d9a8193471cc605341e11a738ae92 Mon Sep 17 00:00:00 2001 From: Rehan Durrani Date: Tue, 25 Jan 2022 20:10:37 -0800 Subject: [PATCH 59/63] lint Signed-off-by: Rehan Durrani --- modin/core/storage_formats/pandas/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index 3bef0347309..d828c74fd0c 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -189,9 +189,9 @@ def generic_parse(fname, **kwargs): bio.seek(start) to_read = header + bio.read(end - start) - if 'memory_map' in kwargs: + if "memory_map" in kwargs: kwargs = kwargs.copy() - del kwargs['memory_map'] + del kwargs["memory_map"] pandas_df = callback(BytesIO(to_read), **kwargs) index = ( pandas_df.index From 8bd0766e827615e63338bf2688ceb4d9256a9bb3 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Wed, 26 Jan 2022 13:57:07 +0300 Subject: [PATCH 60/63] Address comments Signed-off-by: Igoshev, Yaroslav --- .github/workflows/push-to-master.yml | 10 ++-- .github/workflows/push.yml | 18 +++--- modin/pandas/base.py | 20 +++---- modin/pandas/dataframe.py | 2 +- modin/pandas/series.py | 4 +- modin/pandas/test/test_series.py | 83 ++++------------------------ 6 files changed, 37 insertions(+), 100 deletions(-) diff --git a/.github/workflows/push-to-master.yml b/.github/workflows/push-to-master.yml index cfc28ab42c4..db6310d06b9 100644 --- a/.github/workflows/push-to-master.yml +++ b/.github/workflows/push-to-master.yml @@ -18,11 +18,11 @@ jobs: with: activate-environment: modin environment-file: requirements/requirements-no-engine.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - name: install Ray nightly build - run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl + run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl - name: Conda environment run: | conda info @@ -63,7 +63,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - name: Conda environment @@ -81,7 +81,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] test-task: - modin/pandas/test/dataframe/test_binary.py - modin/pandas/test/dataframe/test_default.py @@ -118,8 +118,6 @@ jobs: - run: pip install -r requirements-dev.txt --use-deprecated=legacy-resolver # Use a ray master commit that includes the fix here: https://github.com/ray-project/ray/pull/16278 # Can be changed after a Ray version > 1.4 is released. - - run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/master/c8e3ed9eec30119092ef966ee7b8982c8954c333/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl - if: matrix.python-version == '3.7' - run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/master/c8e3ed9eec30119092ef966ee7b8982c8954c333/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl if: matrix.python-version == '3.8' - name: Install HDF5 diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 3e89ae3cc4b..c205aa05b9d 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -15,7 +15,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - name: Conda environment @@ -41,7 +41,7 @@ jobs: env: MODIN_MEMORY: 1000000000 MODIN_TEST_DATASET_SIZE: "small" - name: Test ${{ matrix.execution }} execution, Python 3.7 + name: Test ${{ matrix.execution }} execution, Python 3.8 steps: - uses: actions/checkout@v2 with: @@ -50,7 +50,7 @@ jobs: with: activate-environment: modin environment-file: environment-dev.yml - python-version: 3.7 + python-version: 3.8 channel-priority: strict use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - name: Conda environment @@ -96,7 +96,7 @@ jobs: MODIN_EXPERIMENTAL: "True" MODIN_ENGINE: "native" MODIN_STORAGE_FORMAT: "omnisci" - name: Test OmniSci storage format, Python 3.7 + name: Test OmniSci storage format, Python 3.8 steps: - uses: actions/checkout@v2 with: @@ -106,7 +106,7 @@ jobs: with: activate-environment: modin_on_omnisci environment-file: requirements/env_omnisci.yml - python-version: 3.7 + python-version: 3.8 use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - name: Conda environment run: | @@ -135,7 +135,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] engine: ["python", "ray", "dask"] env: MODIN_ENGINE: ${{matrix.engine}} @@ -202,7 +202,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] engine: ["ray", "dask"] test-task: - modin/pandas/test/dataframe/test_binary.py @@ -257,7 +257,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.8"] env: MODIN_STORAGE_FORMAT: pyarrow MODIN_EXPERIMENTAL: "True" @@ -287,7 +287,7 @@ jobs: shell: bash -l {0} strategy: matrix: - python-version: [ "3.7", "3.8" ] + python-version: ["3.8"] engine: ["ray", "dask"] env: MODIN_EXPERIMENTAL: "True" diff --git a/modin/pandas/base.py b/modin/pandas/base.py index dbcdd7b01e8..8558838fa3b 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -524,7 +524,7 @@ def _get_axis_number(cls, axis): Parameters ---------- - axis : int, str + axis : int, str or pandas._libs.lib.NoDefault Axis name ('index' or 'columns') or number to be converted to axis index. Returns @@ -943,8 +943,8 @@ def between_time( self: "BasePandasDataset", start_time, end_time, - include_start: "bool_t | lib.NoDefault" = no_default, - include_end: "bool_t | lib.NoDefault" = no_default, + include_start: "bool_t | NoDefault" = no_default, + include_end: "bool_t | NoDefault" = no_default, inclusive: "str | None" = None, axis=None, ): @@ -1547,7 +1547,7 @@ def iloc(self): def kurt( self, - axis: "Axis | None | lib.NoDefault" = no_default, + axis: "Axis | None | NoDefault" = no_default, skipna=True, level=None, numeric_only=None, @@ -1645,7 +1645,7 @@ def mask( def max( self, - axis: "int | None | lib.NoDefault" = no_default, + axis: "int | None | NoDefault" = no_default, skipna=True, level=None, numeric_only=None, @@ -1750,7 +1750,7 @@ def _stat_operation( def mean( self, - axis: "int | None | lib.NoDefault" = no_default, + axis: "int | None | NoDefault" = no_default, skipna=True, level=None, numeric_only=None, @@ -1760,7 +1760,7 @@ def mean( def median( self, - axis: "int | None | lib.NoDefault" = no_default, + axis: "int | None | NoDefault" = no_default, skipna=True, level=None, numeric_only=None, @@ -1777,7 +1777,7 @@ def memory_usage(self, index=True, deep=False): def min( self, - axis: "int | None | lib.NoDefault" = no_default, + axis: "int | None | NoDefault" = no_default, skipna=True, level=None, numeric_only=None, @@ -1923,7 +1923,7 @@ def rank( self: "BasePandasDataset", axis=0, method: "str" = "average", - numeric_only: "bool_t | None | lib.NoDefault" = no_default, + numeric_only: "bool_t | None | NoDefault" = no_default, na_option: "str" = "keep", ascending: "bool_t" = True, pct: "bool_t" = False, @@ -2432,7 +2432,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=no_default): def skew( self, - axis: "int | None | lib.NoDefault" = no_default, + axis: "int | None | NoDefault" = no_default, skipna=True, level=None, numeric_only=None, diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 9f7e37d7ca6..a71a930e9b7 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1776,7 +1776,7 @@ def replace( inplace: "bool" = False, limit=None, regex: "bool" = False, - method: "str | lib.NoDefault" = no_default, + method: "str | NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 2d4a9c087df..d38db8d1707 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1236,7 +1236,7 @@ def keys(self): # noqa: RT01, D200 def kurt( self, - axis: "Axis | None | lib.NoDefault" = no_default, + axis: "Axis | None | NoDefault" = no_default, skipna=True, level=None, numeric_only=None, @@ -1714,7 +1714,7 @@ def replace( inplace=False, limit=None, regex=False, - method: "str | lib.NoDefault" = no_default, + method: "str | NoDefault" = no_default, ): # noqa: PR01, RT01, D200 """ Replace values given in `to_replace` with `value`. diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 113a6fda203..0e6bfb976e1 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -769,15 +769,7 @@ def test_align(data): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_all(data, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.all(skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_series.all(skipna=skipna) - else: - modin_result = modin_series.all(skipna=skipna) - df_equals(modin_result, pandas_result) + eval_general(*create_test_series(data), lambda df: df.all(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -785,15 +777,7 @@ def test_all(data, skipna): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_any(data, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.any(skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_series.any(skipna=skipna) - else: - modin_result = modin_series.any(skipna=skipna) - df_equals(modin_result, pandas_result) + eval_general(*create_test_series(data), lambda df: df.any(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2149,15 +2133,10 @@ def test_lt(data): @pytest.mark.parametrize("skipna", [None, True, False]) @pytest.mark.parametrize("level", [0, -1, None]) def test_mad(level, data, axis, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.mad(axis=axis, skipna=skipna, level=level) - except Exception as e: - with pytest.raises(type(e)): - modin_series.mad(axis=axis, skipna=skipna, level=level) - else: - modin_result = modin_series.mad(axis=axis, skipna=skipna, level=level) - df_equals(modin_result, pandas_result) + eval_general( + *create_test_series(data), + lambda df: df.mad(axis=axis, skipna=skipna, level=level), + ) @pytest.mark.parametrize("na_values", ["ignore", None], ids=["na_ignore", "na_none"]) @@ -2200,15 +2179,7 @@ def test_mask(): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_max(data, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.max(skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_series.max(skipna=skipna) - else: - modin_result = modin_series.max(skipna=skipna) - df_equals(modin_result, pandas_result) + eval_general(*create_test_series(data), lambda df: df.max(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2216,15 +2187,7 @@ def test_max(data, skipna): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_mean(data, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.mean(skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_series.mean(skipna=skipna) - else: - modin_result = modin_series.mean(skipna=skipna) - df_equals(modin_result, pandas_result) + eval_general(*create_test_series(data), lambda df: df.mean(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2232,15 +2195,7 @@ def test_mean(data, skipna): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_median(data, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.median(skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_series.median(skipna=skipna) - else: - modin_result = modin_series.median(skipna=skipna) - df_equals(modin_result, pandas_result) + eval_general(*create_test_series(data), lambda df: df.median(skipna=skipna)) @pytest.mark.parametrize( @@ -2272,15 +2227,7 @@ def test_memory_usage(data, index): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_min(data, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.min(skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_series.min(skipna=skipna) - else: - modin_result = modin_series.min(skipna=skipna) - df_equals(modin_result, pandas_result) + eval_general(*create_test_series(data), lambda df: df.min(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2994,15 +2941,7 @@ def test_size(data): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_skew(data, skipna): - modin_series, pandas_series = create_test_series(data) - try: - pandas_result = pandas_series.skew(skipna=skipna) - except Exception as e: - with pytest.raises(type(e)): - modin_series.skew(skipna=skipna) - else: - modin_result = modin_series.skew(skipna=skipna) - df_equals(pandas_result, modin_result) + eval_general(*create_test_series(data), lambda df: df.skew(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From d9e983d46172a175589fd8a21e9150d787690ce0 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 26 Jan 2022 10:21:11 -0600 Subject: [PATCH 61/63] Apply suggestions from code review Co-authored-by: Vasily Litvinov --- modin/pandas/series.py | 5 +---- modin/pandas/test/dataframe/test_join_sort.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index d38db8d1707..ab955ae8cb7 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1602,10 +1602,7 @@ def reset_index( if name is no_default: # For backwards compatibility, keep columns as [0] instead of # [None] when self.name is None - if self.name is None: - name = 0 - else: - name = self.name + name = 0 if self.name is None else self.name if drop and level is None: new_idx = pandas.RangeIndex(len(self.index)) diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index f60dd1825fb..f3b70d0f937 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -311,7 +311,7 @@ def test_merge(test_data, test_data2): eval_general( modin_df, pandas_df, - lambda df: df.merge(ms) if isinstance(df, pd.DataFrame) else df.merge(ps), + lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps), ) # merge a Series with a name From 3149280f83c84b8a42b182e3ec9ac3ba21185c0f Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Wed, 26 Jan 2022 10:36:44 -0600 Subject: [PATCH 62/63] Update modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py --- .../execution/native/implementations/omnisci_on_native/io/io.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py index d3eb167a6fb..616f707173d 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py @@ -51,7 +51,6 @@ Callable, Dialect, None, - None, ], ] From 913ee694b5b8dfbe8a2b2cf37891526786e2a260 Mon Sep 17 00:00:00 2001 From: Yaroslav Igoshev Date: Wed, 26 Jan 2022 20:28:08 +0300 Subject: [PATCH 63/63] Update modin/pandas/test/dataframe/test_join_sort.py --- modin/pandas/test/dataframe/test_join_sort.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index f3b70d0f937..1d277d45173 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -320,7 +320,7 @@ def test_merge(test_data, test_data2): eval_general( modin_df, pandas_df, - lambda df: df.merge(ms) if isinstance(df, pd.DataFrame) else df.merge(ps), + lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps), ) with pytest.raises(TypeError):