From 37526c1ddbe0952dffe16ac0fe158a00aa7b397f Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Mon, 2 Dec 2019 18:37:11 +0800 Subject: [PATCH 1/6] API/DEPR: Change default skipna behaviour + deprecate numeric_only in Categorical.min and max (#27929) --- doc/source/whatsnew/v1.0.0.rst | 22 +++++++++ pandas/core/arrays/categorical.py | 38 +++++++-------- pandas/core/series.py | 4 +- .../arrays/categorical/test_analytics.py | 46 ++++++++++++------- pandas/tests/reductions/test_reductions.py | 32 ++++++------- 5 files changed, 85 insertions(+), 57 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9847324147618..b45bec37e84eb 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -304,6 +304,26 @@ The following methods now also correctly output values for unobserved categories df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() +By default :meth:`Categorical.min` now returns the minimum instead of np.nan +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When :class:`Categorical` contains ``np.nan``, +:meth:`Categorical.min` no longer returns ``np.nan`` by default (``skipna=True``) (:issue:`25303`) + +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: pd.Categorical([1, 2, np.nan], ordered=True).min() + Out[1]: nan + + +*pandas 1.0.0* + +.. ipython:: python + + pd.Categorical([1, 2, np.nan], ordered=True).min() + .. _whatsnew_1000.api_breaking.deps: Increased minimum versions for dependencies @@ -410,6 +430,8 @@ Deprecations - :func:`is_extension_type` is deprecated, :func:`is_extension_array_dtype` should be used instead (:issue:`29457`) - :func:`eval` keyword argument "truediv" is deprecated and will be removed in a future version (:issue:`29812`) - :meth:`Categorical.take_nd` is deprecated, use :meth:`Categorical.take` instead (:issue:`27745`) +- The parameter ``numeric_only`` of :meth:`Categorical.min` and :meth:`Categorical.max` is deprecated and replaced with ``skipna`` (:issue:`25303`) +- .. _whatsnew_1000.prior_deprecations: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 83f6051b8423f..f4a20b808292a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2123,7 +2123,8 @@ def _reduce(self, name, axis=0, **kwargs): raise TypeError(f"Categorical cannot perform the operation {name}") return func(**kwargs) - def min(self, numeric_only=None, **kwargs): + @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") + def min(self, skipna=True): """ The minimum value of the object. @@ -2139,17 +2140,18 @@ def min(self, numeric_only=None, **kwargs): min : the minimum of this `Categorical` """ self.check_for_ordered("min") - if numeric_only: - good = self._codes != -1 - pointer = self._codes[good].min(**kwargs) - else: - pointer = self._codes.min(**kwargs) - if pointer == -1: - return np.nan + good = self._codes != -1 + if not good.all(): + if skipna: + pointer = self._codes[good].min() + else: + return np.nan else: - return self.categories[pointer] + pointer = self._codes.min() + return self.categories[pointer] - def max(self, numeric_only=None, **kwargs): + @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") + def max(self, skipna=True): """ The maximum value of the object.
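As a quick illustration of the behaviour the ``min``/``max`` hunks implement (a minimal sketch, assuming a pandas build that includes this patch; it mirrors the whatsnew example above):

.. code-block:: python

    import numpy as np
    import pandas as pd

    cat = pd.Categorical([1, 2, np.nan], ordered=True)

    cat.min()              # 1 -- missing values are now skipped by default
    cat.min(skipna=False)  # nan -- the old default behaviour, now opt-in
    cat.max()              # 2
    # the old keyword still works but warns:
    cat.max(numeric_only=True)  # FutureWarning, mapped to skipna=True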
@@ -2165,15 +2167,15 @@ def max(self, numeric_only=None, **kwargs): max : the maximum of this `Categorical` """ self.check_for_ordered("max") - if numeric_only: - good = self._codes != -1 - pointer = self._codes[good].max(**kwargs) - else: - pointer = self._codes.max(**kwargs) - if pointer == -1: - return np.nan + good = self._codes != -1 + if not good.all(): + if skipna: + pointer = self._codes[good].max() + else: + return np.nan else: - return self.categories[pointer] + pointer = self._codes.max() + return self.categories[pointer] def mode(self, dropna=True): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index a8232f137f3ef..11e87a4eed27f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3820,9 +3820,7 @@ def _reduce( self._get_axis_number(axis) if isinstance(delegate, Categorical): - # TODO deprecate numeric_only argument for Categorical and use - # skipna as well, see GH25303 - return delegate._reduce(name, numeric_only=numeric_only, **kwds) + return delegate._reduce(name, skipna=skipna, **kwds) elif isinstance(delegate, ExtensionArray): # dispatch to ExtensionArray interface return delegate._reduce(name, skipna=skipna, **kwds) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 279f1492d7dad..637a47eba0597 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -35,31 +35,43 @@ def test_min_max(self): assert _min == "d" assert _max == "a" + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_with_nan(self, skipna): + # GH 25303 cat = Categorical( [np.nan, "b", "c", np.nan], categories=["d", "c", "b", "a"], ordered=True ) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == "b" + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) - _min = cat.min(numeric_only=True) - assert _min == "c" - _max = cat.max(numeric_only=True) - assert _max == "b" + if skipna is False: + assert np.isnan(_min) + assert np.isnan(_max) + else: + assert _min == "c" + assert _max == "b" cat = Categorical( [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True ) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == 1 - - _min = cat.min(numeric_only=True) - assert _min == 2 - _max = cat.max(numeric_only=True) - assert _max == 1 + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) + + if skipna is False: + assert np.isnan(_min) + assert np.isnan(_max) + else: + assert _min == 2 + assert _max == 1 + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_deprecate_numeric_only_min_max(self, method): + # GH 25303 + cat = Categorical( + [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True + ) + with tm.assert_produces_warning(expected_warning=FutureWarning): + getattr(cat, method)(numeric_only=True) @pytest.mark.parametrize( "values,categories,exp_mode", diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index b0ef0c58ca65a..80d148c919ab2 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1043,7 +1043,7 @@ def test_min_max(self): ) _min = cat.min() _max = cat.max() - assert np.isnan(_min) + assert _min == "c" assert _max == "b" cat = Series( @@ -1053,30 +1053,24 @@ def test_min_max(self): ) _min = cat.min() _max = cat.max() - assert np.isnan(_min) + assert _min == 2 assert _max == 1 - def test_min_max_numeric_only(self): - # TODO 
deprecate numeric_only argument for Categorical and use - # skipna as well, see GH25303 + @pytest.mark.parametrize("skipna", [True, False]) + def test_min_max_skipna(self, skipna): + # GH 25303 cat = Series( Categorical(["a", "b", np.nan, "a"], categories=["b", "a"], ordered=True) ) + _min = cat.min(skipna=skipna) + _max = cat.max(skipna=skipna) - _min = cat.min() - _max = cat.max() - assert np.isnan(_min) - assert _max == "a" - - _min = cat.min(numeric_only=True) - _max = cat.max(numeric_only=True) - assert _min == "b" - assert _max == "a" - - _min = cat.min(numeric_only=False) - _max = cat.max(numeric_only=False) - assert np.isnan(_min) - assert _max == "a" + if skipna is True: + assert _min == "b" + assert _max == "a" + else: + assert np.isnan(_min) + assert np.isnan(_max) class TestSeriesMode: From 28f4a8ae25deba681bff8a1f9ec049ae2b2eca9a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Dec 2019 04:34:18 -0800 Subject: [PATCH 2/6] CLN: small things in pytables (#29958) --- pandas/io/pytables.py | 49 +++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 39e9d467b652f..5a42df92ddf84 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -312,7 +312,7 @@ def read_hdf(path_or_buf, key=None, mode: str = "r", **kwargs): key : object, optional The group identifier in the store. Can be omitted if the HDF file contains a single pandas object. - mode : {'r', 'r+', 'a'}, optional + mode : {'r', 'r+', 'a'}, default 'r' Mode to use when opening the file. Ignored if path_or_buf is a :class:`pandas.HDFStore`. Default is 'r'. where : list, optional @@ -417,7 +417,7 @@ def read_hdf(path_or_buf, key=None, mode: str = "r", **kwargs): raise -def _is_metadata_of(group, parent_group) -> bool: +def _is_metadata_of(group: "Node", parent_group: "Node") -> bool: """Check if a given group is a metadata group for a given parent_group.""" if group._v_depth <= parent_group._v_depth: return False @@ -932,9 +932,7 @@ def func(_start, _stop, _where): # retrieve the objs, _where is always passed as a set of # coordinates here objs = [ - t.read( - where=_where, columns=columns, start=_start, stop=_stop, **kwargs - ) + t.read(where=_where, columns=columns, start=_start, stop=_stop) for t in tbls ] @@ -957,7 +955,7 @@ def func(_start, _stop, _where): return it.get_result(coordinates=True) - def put(self, key: str, value, format=None, append=False, **kwargs): + def put(self, key: str, value: FrameOrSeries, format=None, append=False, **kwargs): """ Store object in HDFStore. 
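The ``put``/``append`` refactor in the hunks below only changes internal plumbing; the user-facing API is unchanged. A small sketch of that API (assuming the optional PyTables dependency is installed; the file name is illustrative):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})

    with pd.HDFStore("example.h5", mode="w") as store:
        # _validate_format accepts "fixed"/"table" (and their shorthands);
        # anything else now raises TypeError before a write is attempted.
        store.put("df", df, format="table")
        store.append("df", df)  # appending requires the "table" format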
@@ -986,8 +984,8 @@ def put(self, key: str, value, format=None, append=False, **kwargs): """ if format is None: format = get_option("io.hdf.default_format") or "fixed" - kwargs = self._validate_format(format, kwargs) - self._write_to_group(key, value, append=append, **kwargs) + format = self._validate_format(format) + self._write_to_group(key, value, format=format, append=append, **kwargs) def remove(self, key: str, where=None, start=None, stop=None): """ @@ -1046,7 +1044,7 @@ def remove(self, key: str, where=None, start=None, stop=None): def append( self, key: str, - value, + value: FrameOrSeries, format=None, append=True, columns=None, @@ -1096,8 +1094,10 @@ def append( dropna = get_option("io.hdf.dropna_table") if format is None: format = get_option("io.hdf.default_format") or "table" - kwargs = self._validate_format(format, kwargs) - self._write_to_group(key, value, append=append, dropna=dropna, **kwargs) + format = self._validate_format(format) + self._write_to_group( + key, value, format=format, append=append, dropna=dropna, **kwargs + ) def append_to_multiple( self, @@ -1418,17 +1418,16 @@ def _check_if_open(self): if not self.is_open: raise ClosedFileError(f"{self._path} file is not open!") - def _validate_format(self, format: str, kwargs: Dict[str, Any]) -> Dict[str, Any]: - """ validate / deprecate formats; return the new kwargs """ - kwargs = kwargs.copy() + def _validate_format(self, format: str) -> str: + """ validate / deprecate formats """ # validate try: - kwargs["format"] = _FORMAT_MAP[format.lower()] + format = _FORMAT_MAP[format.lower()] except KeyError: raise TypeError(f"invalid HDFStore format specified [{format}]") - return kwargs + return format def _create_storer( self, @@ -1532,7 +1531,7 @@ def error(t): def _write_to_group( self, key: str, - value, + value: FrameOrSeries, format, axes=None, index=True, @@ -1615,10 +1614,10 @@ def _write_to_group( if isinstance(s, Table) and index: s.create_index(columns=index) - def _read_group(self, group: "Node", **kwargs): + def _read_group(self, group: "Node"): s = self._create_storer(group) s.infer_axes() - return s.read(**kwargs) + return s.read() class TableIterator: @@ -2752,28 +2751,22 @@ def f(values, freq=None, tz=None): return klass - def validate_read(self, kwargs: Dict[str, Any]) -> Dict[str, Any]: + def validate_read(self, columns, where): """ - remove table keywords from kwargs and return raise if any keywords are passed which are not-None """ - kwargs = copy.copy(kwargs) - - columns = kwargs.pop("columns", None) if columns is not None: raise TypeError( "cannot pass a column specification when reading " "a Fixed format store. this store must be " "selected in its entirety" ) - where = kwargs.pop("where", None) if where is not None: raise TypeError( "cannot pass a where specification when reading " "from a Fixed format store. 
this store must be " "selected in its entirety" ) - return kwargs @property def is_exists(self) -> bool: @@ -3085,7 +3078,7 @@ def read( start: Optional[int] = None, stop: Optional[int] = None, ): - self.validate_read({"where": where, "columns": columns}) + self.validate_read(columns, where) index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) return Series(values, index=index, name=self.name) @@ -3142,7 +3135,7 @@ def read( stop: Optional[int] = None, ): # start, stop applied to rows, so 0th axis only - self.validate_read({"columns": columns, "where": where}) + self.validate_read(columns, where) select_axis = self.obj_type()._get_block_manager_axis(0) axes = [] From a97abc50cbd50bbacb0f4af6541cb3d5ece35eda Mon Sep 17 00:00:00 2001 From: Koushik <42416901+koushikgk@users.noreply.github.com> Date: Mon, 2 Dec 2019 05:35:15 -0700 Subject: [PATCH 3/6] DOC : Typo fix in userguide/Styling (#29956) --- doc/source/user_guide/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 5e026e3a7d78f..633827eb79f46 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -677,7 +677,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice that you're able share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon." + "Notice that you're able to share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon." ] }, { From 0c2b1db198ce628ca889ad25c1179fff4ab3337c Mon Sep 17 00:00:00 2001 From: MomIsBestFriend <50263213+MomIsBestFriend@users.noreply.github.com> Date: Mon, 2 Dec 2019 14:45:02 +0200 Subject: [PATCH 4/6] repr() (#29959) --- pandas/io/html.py | 26 +++++------- pandas/io/parsers.py | 41 +++++++++---------- pandas/plotting/_core.py | 2 +- pandas/tests/computation/test_eval.py | 8 ++-- pandas/tests/frame/test_alter_axes.py | 4 +- pandas/tests/frame/test_query_eval.py | 2 +- .../indexes/timedeltas/test_timedelta.py | 2 +- pandas/tests/io/parser/test_unsupported.py | 6 +-- pandas/tests/io/test_html.py | 4 +- pandas/tests/test_strings.py | 6 +-- 10 files changed, 45 insertions(+), 56 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 5f38f866e1643..b8cb6679a9562 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -560,9 +560,7 @@ def _parse_tables(self, doc, match, attrs): unique_tables.add(table) if not result: - raise ValueError( - "No tables found matching pattern {patt!r}".format(patt=match.pattern) - ) + raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") return result def _text_getter(self, obj): @@ -618,7 +616,7 @@ def _build_xpath_expr(attrs) -> str: if "class_" in attrs: attrs["class"] = attrs.pop("class_") - s = ["@{key}={val!r}".format(key=k, val=v) for k, v in attrs.items()] + s = [f"@{k}={repr(v)}" for k, v in attrs.items()] return "[{expr}]".format(expr=" and ".join(s)) @@ -661,8 +659,7 @@ def _parse_tables(self, doc, match, kwargs): # 1. check all descendants for the given pattern and only search tables # 2. 
go up the tree until we find a table - query = "//table//*[re:test(text(), {patt!r})]/ancestor::table" - xpath_expr = query.format(patt=pattern) + xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table" # if any table attributes were given build an xpath expression to # search for them @@ -682,9 +679,7 @@ def _parse_tables(self, doc, match, kwargs): elem.getparent().remove(elem) if not tables: - raise ValueError( - "No tables found matching regex {patt!r}".format(patt=pattern) - ) + raise ValueError(f"No tables found matching regex {repr(pattern)}") return tables def _equals_tag(self, obj, tag): @@ -833,8 +828,7 @@ def _parser_dispatch(flavor): valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: raise ValueError( - "{invalid!r} is not a valid flavor, valid flavors " - "are {valid}".format(invalid=flavor, valid=valid_parsers) + f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}" ) if flavor in ("bs4", "html5lib"): @@ -863,13 +857,13 @@ def _validate_flavor(flavor): elif isinstance(flavor, abc.Iterable): if not all(isinstance(flav, str) for flav in flavor): raise TypeError( - "Object of type {typ!r} is not an iterable of " - "strings".format(typ=type(flavor).__name__) + f"Object of type {repr(type(flavor).__name__)} " + f"is not an iterable of strings" ) else: - fmt = "{flavor!r}" if isinstance(flavor, str) else "{flavor}" - fmt += " is not a valid flavor" - raise ValueError(fmt.format(flavor=flavor)) + msg = repr(flavor) if isinstance(flavor, str) else str(flavor) + msg += " is not a valid flavor" + raise ValueError(msg) flavor = tuple(flavor) valid_flavors = set(_valid_parsers) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bbec148b8745d..7403e6d254d03 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -972,10 +972,10 @@ def _clean_options(self, options, engine): elif engine not in ("python", "python-fwf"): # wait until regex engine integrated fallback_reason = ( - "the 'c' engine does not support" - " regex separators (separators > 1 char and" - r" different from '\s+' are" - " interpreted as regex)" + "the 'c' engine does not support " + "regex separators (separators > 1 char and " + r"different from '\s+' are " + "interpreted as regex)" ) engine = "python" elif delim_whitespace: @@ -990,9 +990,9 @@ def _clean_options(self, options, engine): encodeable = False if not encodeable and engine not in ("python", "python-fwf"): fallback_reason = ( - "the separator encoded in {encoding}" - " is > 1 char long, and the 'c' engine" - " does not support such separators".format(encoding=encoding) + "the separator encoded in {encoding} " + "is > 1 char long, and the 'c' engine " + "does not support such separators".format(encoding=encoding) ) engine = "python" @@ -1021,21 +1021,19 @@ def _clean_options(self, options, engine): if "python" in engine: for arg in _python_unsupported: if fallback_reason and result[arg] != _c_parser_defaults[arg]: - msg = ( - "Falling back to the 'python' engine because" - " {reason}, but this causes {option!r} to be" - " ignored as it is not supported by the 'python'" - " engine." - ).format(reason=fallback_reason, option=arg) - raise ValueError(msg) + raise ValueError( + f"Falling back to the 'python' engine because " + f"{fallback_reason}, but this causes {repr(arg)} to be " + f"ignored as it is not supported by the 'python' engine." 
+ ) del result[arg] if fallback_reason: warnings.warn( ( - "Falling back to the 'python' engine because" - " {0}; you can avoid this warning by specifying" - " engine='python'." + "Falling back to the 'python' engine because " + "{0}; you can avoid this warning by specifying " + "engine='python'." ).format(fallback_reason), ParserWarning, stacklevel=5, ) @@ -1056,8 +1054,8 @@ def _clean_options(self, options, engine): depr_default = _deprecated_defaults[arg] msg = ( - "The '{arg}' argument has been deprecated " - "and will be removed in a future version.".format(arg=arg) + f"The {repr(arg)} argument has been deprecated and will be " + f"removed in a future version." ) if result.get(arg, depr_default) != depr_default: @@ -1081,9 +1079,8 @@ def _clean_options(self, options, engine): if converters is not None: if not isinstance(converters, dict): raise TypeError( - "Type converters must be a dict or" - " subclass, input was " - "a {0!r}".format(type(converters).__name__) + f"Type converters must be a dict or subclass, " + f"input was a {repr(type(converters).__name__)}" ) else: converters = {} diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index beb276478070e..375e6fe2b02c7 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -743,7 +743,7 @@ def _get_call_args(backend_name, data, args, kwargs): if args and isinstance(data, ABCSeries): positional_args = str(args)[1:-1] keyword_args = ", ".join( - f"{name}={value!r}" for (name, default), value in zip(arg_def, args) + f"{name}={repr(value)}" for (name, default), value in zip(arg_def, args) ) msg = ( "`Series.plot()` should not be called with positional " diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 1146b486a3eb4..2208fbf933387 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1114,11 +1114,11 @@ def test_performance_warning_for_poor_alignment(self, engine, parser): if not is_python_engine: assert len(w) == 1 msg = str(w[0].message) + logged = np.log10(s.size - df.shape[1]) expected = ( - "Alignment difference on axis {0} is larger" - " than an order of magnitude on term {1!r}, " - "by more than {2:.4g}; performance may suffer" - "".format(1, "df", np.log10(s.size - df.shape[1])) + f"Alignment difference on axis 1 is larger " + f"than an order of magnitude on term 'df', " + f"by more than {logged:.4g}; performance may suffer" ) assert msg == expected diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 6206b333d29e1..b52f24f9e06f1 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -342,7 +342,7 @@ def __init__(self, name, color): self.color = color def __str__(self) -> str: - return "<Thing {self.name!r}>".format(self=self) + return f"<Thing {repr(self.name)}>" # necessary for pretty KeyError __repr__ = __str__ @@ -419,7 +419,7 @@ def __init__(self, name, color): self.color = color def __str__(self) -> str: - return "<Thing {self.name!r}>".format(self=self) + return f"<Thing {repr(self.name)}>" thing1 = Thing("One", "red") thing2 = Thing("Two", "blue") diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index cd1bee356ed8e..abd8ef98ff871 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -27,7 +27,7 @@ def engine(request): def skip_if_no_pandas_parser(parser): if parser != "pandas": - pytest.skip("cannot evaluate with parser {0!r}".format(parser)) + pytest.skip(f"cannot evaluate with parser {repr(parser)}") class TestCompat: diff
--git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index ba0af7dd8136c..d59b6c18f6042 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -234,7 +234,7 @@ def test_pickle(self): def test_hash_error(self): index = timedelta_range("1 days", periods=10) with pytest.raises( - TypeError, match=("unhashable type: {0.__name__!r}".format(type(index))) + TypeError, match=(f"unhashable type: {repr(type(index).__name__)}") ): hash(index) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index b23ddf5bd9292..07ab41b47bf27 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -96,9 +96,9 @@ def test_python_engine(self, python_engine): for default in py_unsupported: msg = ( - "The {default!r} option is not supported with the {python_engine!r}" - " engine" - ).format(default=default, python_engine=python_engine) + f"The {repr(default)} option is not " + f"supported with the {repr(python_engine)} engine" + ) kwargs = {default: object()} with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index d8d617ceeebff..353946a311c1a 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -902,8 +902,8 @@ def test_computer_sales_page(self, datapath): def test_wikipedia_states_table(self, datapath): data = datapath("io", "data", "html", "wikipedia_states.html") - assert os.path.isfile(data), "{data!r} is not a file".format(data=data) - assert os.path.getsize(data), "{data!r} is an empty file".format(data=data) + assert os.path.isfile(data), f"{repr(data)} is not a file" + assert os.path.getsize(data), f"{repr(data)} is an empty file" result = self.read_html(data, "Arizona", header=1)[0] assert result["sq mi"].dtype == np.dtype("float64") diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 3c97b75ecfa0c..0e2f8ee6543e1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -296,10 +296,8 @@ def test_api_per_method( else: # GH 23011, GH 23163 msg = ( - "Cannot use .str.{name} with values of inferred dtype " - "{inferred_dtype!r}.".format( - name=method_name, inferred_dtype=inferred_dtype - ) + f"Cannot use .str.{method_name} with values of " + f"inferred dtype {repr(inferred_dtype)}." ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) From 23bb61bce7e7705788d13d409fdb83c1d3fa3855 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 2 Dec 2019 16:53:39 +0000 Subject: [PATCH 5/6] TYPE: some types for pandas/core/arrays/base.py (#29968) --- pandas/core/arrays/base.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bf50d6e9b50e7..dc1a23e83f981 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -451,7 +451,9 @@ def _values_for_argsort(self) -> np.ndarray: # Note: this is used in `ExtensionArray.argsort`. return np.array(self) - def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): + def argsort( + self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs + ) -> np.ndarray: """ Return the indices that would sort this array. 
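The annotations added to ``argsort`` in the hunks below document existing behaviour rather than changing it. A small sketch (assuming a build with nullable integer arrays):

.. code-block:: python

    import pandas as pd

    arr = pd.array([3, 1, 2], dtype="Int64")
    indexer = arr.argsort()  # a plain numpy.ndarray of positions, per the annotation
    arr[indexer]             # the values in sorted order: 1, 2, 3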
@@ -467,7 +469,7 @@ def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): Returns ------- - index_array : ndarray + ndarray Array of indices that sort ``self``. If NaN values are contained, NaN values are placed at the end. @@ -1198,10 +1200,9 @@ def _maybe_convert(arr): if op.__name__ in {"divmod", "rdivmod"}: a, b = zip(*res) - res = _maybe_convert(a), _maybe_convert(b) - else: - res = _maybe_convert(res) - return res + return _maybe_convert(a), _maybe_convert(b) + + return _maybe_convert(res) op_name = ops._get_op_name(op, True) return set_function_name(_binop, op_name, cls) From 83812e1ba93b7857b8222e6651c823a88223472f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Dec 2019 11:38:48 -0600 Subject: [PATCH 6/6] API: Infer extension types in array (#29799) --- doc/source/user_guide/integer_na.rst | 36 ++++++-- doc/source/whatsnew/v1.0.0.rst | 33 +++++++- pandas/_libs/lib.pyx | 2 +- pandas/core/construction.py | 98 +++++++++++++--------- pandas/tests/arrays/test_array.py | 39 +++++++-- pandas/tests/dtypes/test_inference.py | 9 +- pandas/tests/frame/test_block_internals.py | 4 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/series/test_ufunc.py | 2 +- 9 files changed, 161 insertions(+), 64 deletions(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index f1f3d79eed61e..77568f3bcb244 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -25,8 +25,7 @@ numbers. Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` -implemented within pandas. It is not the default dtype for integers, and will not be inferred; -you must explicitly pass the dtype into :meth:`array` or :class:`Series`: +implemented within pandas. .. ipython:: python @@ -50,17 +49,34 @@ NumPy array. You can also pass the list-like object to the :class:`Series` constructor with the dtype. -.. ipython:: python +.. warning:: - s = pd.Series([1, 2, np.nan], dtype="Int64") - s + Currently :meth:`pandas.array` and :meth:`pandas.Series` use different + rules for dtype inference. :meth:`pandas.array` will infer a nullable- + integer dtype -By default (if you don't specify ``dtype``), NumPy is used, and you'll end -up with a ``float64`` dtype Series: + .. ipython:: python -.. ipython:: python + pd.array([1, None]) + pd.array([1, 2]) + + For backwards-compatibility, :class:`Series` infers these as either + integer or float dtype + + .. ipython:: python + + pd.Series([1, None]) + pd.Series([1, 2]) - pd.Series([1, 2, np.nan]) + We recommend explicitly providing the dtype to avoid confusion. + + .. ipython:: python + + pd.array([1, None], dtype="Int64") + pd.Series([1, None], dtype="Int64") + + In the future, we may provide an option for :class:`Series` to infer a + nullable-integer dtype. Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another @@ -68,6 +84,8 @@ dtype if needed. .. 
ipython:: python + s = pd.Series([1, 2, None], dtype="Int64") + # arithmetic s + 1 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b45bec37e84eb..470209a7f4a33 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -303,6 +303,38 @@ The following methods now also correctly output values for unobserved categories df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() +:meth:`pandas.array` inference changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`pandas.array` now infers pandas' new extension types in several cases (:issue:`29791`): + +1. String data (including missing values) now returns a :class:`arrays.StringArray`. +2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`. +3. Boolean data (including missing values) now returns the new :class:`arrays.BooleanArray`. + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.array(["a", None]) + <PandasArray> + ['a', None] + Length: 2, dtype: object + + >>> pd.array([1, None]) + <PandasArray> + [1, None] + Length: 2, dtype: object + + +*pandas 1.0.0* + +.. ipython:: python + + pd.array(["a", None]) + pd.array([1, None]) + +As a reminder, you can specify the ``dtype`` to disable all inference. By default :meth:`Categorical.min` now returns the minimum instead of np.nan ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -408,7 +440,6 @@ Other API changes - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) -- .. _whatsnew_1000.api.documentation: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 41c15ab4de5e1..eb08a22b8c34f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1313,7 +1313,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: elif isinstance(val, str): if is_string_array(values, skipna=skipna): - return 'string' + return "string" elif isinstance(val, bytes): if is_bytes_array(values, skipna=skipna): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c0b08beead0ca..dc537d50b3419 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -94,10 +94,19 @@ def array( :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` + :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`str` :class:`pandas.arrays.StringArray` + :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. + .. versionchanged:: 1.0.0 + + Pandas infers nullable-integer dtype for integer data, + string dtype for string data, and nullable-boolean dtype + for boolean data. + copy : bool, default True Whether to copy the data, even if not necessary.
Depending on the type of `data`, creating the new array may require @@ -154,14 +163,6 @@ def array( ['a', 'b'] Length: 2, dtype: str32 - Or use the dedicated constructor for the array you're expecting, and - wrap that in a PandasArray - - >>> pd.array(np.array(['a', 'b'], dtype='<U1')) - <PandasArray> - ['a', 'b'] - Length: 2, dtype: str32 Finally, Pandas has arrays that mostly overlap with NumPy * :class:`arrays.DatetimeArray` @@ -184,20 +185,28 @@ def array( Examples -------- - If a dtype is not specified, `data` is passed through to - :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. + If a dtype is not specified, pandas will infer the best dtype from the values. + See the description of `dtype` for the types pandas infers for. >>> pd.array([1, 2]) - <PandasArray> + <IntegerArray> [1, 2] - Length: 2, dtype: int64 + Length: 2, dtype: Int64 - Or the NumPy dtype can be specified + >>> pd.array([1, 2, np.nan]) + <IntegerArray> + [1, 2, NaN] + Length: 3, dtype: Int64 - >>> pd.array([1, 2], dtype=np.dtype("int32")) - <PandasArray> - [1, 2] - Length: 2, dtype: int32 + >>> pd.array(["a", None, "c"]) + <StringArray> + ['a', nan, 'c'] + Length: 3, dtype: string + + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + <PeriodArray> + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] You can use the string alias for `dtype` @@ -212,29 +221,24 @@ def array( [a, b, a] Categories (3, object): [a < b < c] - Because omitting the `dtype` passes the data through to NumPy, - a mixture of valid integers and NA will return a floating-point - NumPy array. + If pandas does not infer a dedicated extension type, a + :class:`arrays.PandasArray` is returned. - >>> pd.array([1, 2, np.nan]) + >>> pd.array([1.1, 2.2]) <PandasArray> - [1.0, 2.0, nan] - Length: 3, dtype: float64 - - To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify - the dtype: + [1.1, 2.2] + Length: 2, dtype: float64 - >>> pd.array([1, 2, np.nan], dtype='Int64') - <IntegerArray> - [1, 2, NaN] - Length: 3, dtype: Int64 + As mentioned in the "Notes" section, new extension types may be added + in the future (by pandas or 3rd party libraries), causing the return + value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype` + as a NumPy dtype if you need to ensure there's no future change in + behavior. - Pandas will infer an ExtensionArray for some types of data: - - >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) - <PeriodArray> - ['2000-01-01', '2000-01-01'] - Length: 2, dtype: period[D] + >>> pd.array([1, 2], dtype=np.dtype("int32")) + <PandasArray> + [1, 2] + Length: 2, dtype: int32 `data` must be 1-dimensional. A ValueError is raised when the input has the wrong dimensionality. @@ -246,21 +250,26 @@ def array( """ from pandas.core.arrays import ( period_array, + BooleanArray, + IntegerArray, IntervalArray, PandasArray, DatetimeArray, TimedeltaArray, + StringArray, ) if lib.is_scalar(data): msg = "Cannot pass scalar '{}' to 'pandas.array'." raise ValueError(msg.format(data)) - data = extract_array(data, extract_numpy=True) - - if dtype is None and isinstance(data, ABCExtensionArray): + if dtype is None and isinstance( + data, (ABCSeries, ABCIndexClass, ABCExtensionArray) + ): dtype = data.dtype + data = extract_array(data, extract_numpy=True) + # this returns None for not-found dtypes.
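# For context, an illustrative sketch (not part of this patch) of what the
# registry lookup below resolves: registered extension-dtype strings are
# found here, while plain NumPy dtype strings come back as None and fall
# through to the NumPy-based handling further down, e.g.
#   >>> from pandas.core.dtypes.dtypes import registry
#   >>> registry.find("Int64")    # Int64Dtype() -- a registered extension dtype
#   >>> registry.find("float32")  # None -- NumPy dtype strings are not registered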
if isinstance(dtype, str): dtype = registry.find(dtype) or dtype @@ -270,7 +279,7 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=False) + inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == "period": try: return period_array(data, copy=copy) @@ -298,7 +307,14 @@ def array( # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) - # TODO(BooleanArray): handle this type + elif inferred_dtype == "string": + return StringArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "integer": + return IntegerArray._from_sequence(data, copy=copy) + + elif inferred_dtype == "boolean": + return BooleanArray._from_sequence(data, copy=copy) # Pandas overrides NumPy for # 1. datetime64[ns] diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 6f443f1841dcc..479f8dbad0418 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -19,14 +19,18 @@ "data, dtype, expected", [ # Basic NumPy defaults. - ([1, 2], None, PandasArray(np.array([1, 2]))), + ([1, 2], None, pd.arrays.IntegerArray._from_sequence([1, 2])), ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ( [1, 2], np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), + ( + np.array([1, 2], dtype="int64"), + None, + pd.arrays.IntegerArray._from_sequence([1, 2]), + ), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -113,6 +117,20 @@ # IntegerNA ([1, None], "Int16", integer_array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # String + (["a", None], "string", pd.arrays.StringArray._from_sequence(["a", None])), + ( + ["a", None], + pd.StringDtype(), + pd.arrays.StringArray._from_sequence(["a", None]), + ), + # Boolean + ([True, None], "boolean", pd.arrays.BooleanArray._from_sequence([True, None])), + ( + [True, None], + pd.BooleanDtype(), + pd.arrays.BooleanArray._from_sequence([True, None]), + ), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -139,15 +157,15 @@ def test_array(data, dtype, expected): def test_array_copy(): a = np.array([1, 2]) # default is to copy - b = pd.array(a) + b = pd.array(a, dtype=a.dtype) assert np.shares_memory(a, b._ndarray) is False # copy=True - b = pd.array(a, copy=True) + b = pd.array(a, dtype=a.dtype, copy=True) assert np.shares_memory(a, b._ndarray) is False # copy=False - b = pd.array(a, copy=False) + b = pd.array(a, dtype=a.dtype, copy=False) assert np.shares_memory(a, b._ndarray) is True @@ -211,6 +229,15 @@ def test_array_copy(): np.array([1, 2], dtype="m8[us]"), pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), ), + # integer + ([1, 2], pd.arrays.IntegerArray._from_sequence([1, 2])), + ([1, None], pd.arrays.IntegerArray._from_sequence([1, None])), + # string + (["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])), + (["a", None], pd.arrays.StringArray._from_sequence(["a", None])), + # Boolean + ([True, False], pd.arrays.BooleanArray._from_sequence([True, False])), + ([True, None], pd.arrays.BooleanArray._from_sequence([True, None])), ], ) def test_array_inference(data, expected): @@ -241,7 +268,7 @@ def test_array_inference_fails(data): @pytest.mark.parametrize("data", [np.array([[1, 2], [3, 
4]]), [[1, 2], [3, 4]]]) def test_nd_raises(data): with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"): - pd.array(data) + pd.array(data, dtype="int64") def test_scalar_raises(): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 53e979d12a56d..75e86a2ee7ecc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -732,12 +732,17 @@ def test_string(self): def test_unicode(self): arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) + # This currently returns "mixed", but it's not clear that's optimal. + # This could also return "string" or "mixed-string" assert result == "mixed" arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) - expected = "string" - assert result == expected + assert result == "string" + + arr = ["a", "c"] + result = lib.infer_dtype(arr, skipna=False) + assert result == "string" @pytest.mark.parametrize( "dtype, missing, skipna, expected", diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index d491e9f25c897..b27e7c217c4c2 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -615,12 +615,12 @@ def test_constructor_no_pandas_array(self): def test_add_column_with_pandas_array(self): # GH 26390 df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - df["c"] = pd.array([1, 2, None, 3]) + df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) df2 = pd.DataFrame( { "a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], - "c": pd.array([1, 2, None, 3]), + "c": pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)), } ) assert type(df["c"]._data.blocks[0]) == ObjectBlock diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index abe2ddf955ad8..551782d0b363a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1268,7 +1268,7 @@ def test_block_shape(): def test_make_block_no_pandas_array(): # https://github.com/pandas-dev/pandas/pull/24866 - arr = pd.array([1, 2]) + arr = pd.arrays.PandasArray(np.array([1, 2])) # PandasArray, no dtype result = make_block(arr, slice(len(arr))) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 977e7ded1f1a7..92d72706f3dec 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -251,7 +251,7 @@ def __add__(self, other): @pytest.mark.parametrize( "values", [ - pd.array([1, 3, 2]), + pd.array([1, 3, 2], dtype="int64"), pd.array([1, 10, 0], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"),
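Taken together, a minimal sketch of the inference change this last patch makes to ``pandas.array`` (assuming a build that includes it; the expected results follow the docstring and whatsnew examples above):

.. code-block:: python

    import numpy as np
    import pandas as pd

    pd.array([1, None])     # IntegerArray, dtype Int64
    pd.array(["a", None])   # StringArray, dtype string
    pd.array([True, None])  # BooleanArray, dtype boolean
    pd.array([1.1, 2.2])    # PandasArray, float64: no dedicated extension type

    # An explicit NumPy dtype still opts out of the new inference entirely:
    pd.array([1, 2], dtype=np.dtype("int64"))  # PandasArray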