From 31dcbb75dbcd315af9baf4ff9bd25cbf483046c0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 26 Jan 2019 09:01:55 +0100 Subject: [PATCH 01/30] DOC: Minor what's new fix (#24933) --- doc/source/whatsnew/v0.24.0.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index fc963fce37a5b..16319a3b83ca4 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -6,7 +6,8 @@ What's New in 0.24.0 (January 25, 2019) .. warning:: The 0.24.x series of releases will be the last to support Python 2. Future feature - releases will support Python 3 only. See :ref:`install.dropping-27` for more. + releases will support Python 3 only. See :ref:`install.dropping-27` for more + details. {{ header }} @@ -244,7 +245,7 @@ the new extension arrays that back interval and period data. Joining with two multi-indexes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`DataFrame.merge` and :func:`DataFrame.join` can now be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels (:issue:`6360`) +:func:`DataFrame.merge` and :func:`DataFrame.join` can now be used to join multi-indexed ``Dataframe`` instances on the overlapping index levels (:issue:`6360`) See the :ref:`Merge, join, and concatenate ` documentation section. 
From 84056c52e3f20ab44921b86a8e0f05275bf8ddb4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 26 Jan 2019 09:47:00 -0800 Subject: [PATCH 02/30] Backport PR #24916: BUG-24212 fix regression in #24897 (#24951) --- doc/source/whatsnew/v0.24.1.rst | 3 ++ pandas/core/reshape/merge.py | 45 ++++++++++++++++++++++-- pandas/tests/reshape/merge/test_merge.py | 31 ++++++++-------- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index ee4b7ab62b31a..3ac2ed73ea53f 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -63,6 +63,9 @@ Bug Fixes - - +**Reshaping** + +- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) **Other** diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e11847d2b8ce2..1dd19a7c1514e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,13 +757,21 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self.left.index.take(left_indexer) + join_index = self._create_join_index(self.left.index, + self.right.index, + left_indexer, + right_indexer, + how='right') else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self.right.index.take(right_indexer) + join_index = self._create_join_index(self.right.index, + self.left.index, + right_indexer, + left_indexer, + how='left') else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -774,6 +782,39 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer + def _create_join_index(self, index, other_index, indexer, + other_indexer, how='left'): + """ + Create a join index by 
rearranging one index to match another + + Parameters + ---------- + index: Index being rearranged + other_index: Index used to supply values not found in index + indexer: how to rearrange index + how: replacement is only necessary if indexer based on other_index + + Returns + ------- + join_index + """ + join_index = index.take(indexer) + if (self.how in (how, 'outer') and + not isinstance(other_index, MultiIndex)): + # if final index requires values in other_index but not target + # index, indexer may hold missing (-1) values, causing Index.take + # to take the final value in target index + mask = indexer == -1 + if np.any(mask): + # if values missing (-1) from target index, + # take from other_index instead + join_list = join_index.to_numpy() + other_list = other_index.take(other_indexer).to_numpy() + join_list[mask] = other_list[mask] + join_index = Index(join_list, dtype=join_index.dtype, + name=join_index.name) + return join_index + def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0a3ddc8ce8a4..c17c301968269 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -939,25 +939,22 @@ def test_merge_two_empty_df_no_division_error(self): with np.errstate(divide='raise'): merge(a, a, on=('a', 'b')) - @pytest.mark.parametrize('how', ['left', 'outer']) - @pytest.mark.xfail(reason="GH-24897") + @pytest.mark.parametrize('how', ['right', 'outer']) def test_merge_on_index_with_more_values(self, how): # GH 24212 - # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is - # interpreted as a missing value instead of the last element - df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], - columns=['a', 'b']) - df2 = pd.DataFrame([[3, 30], [4, 40]], - columns=['a', 'c']) - df1.set_index('a', drop=False, inplace=True) - df2.set_index('a', inplace=True) - result = pd.merge(df1, df2, 
left_index=True, right_on='a', how=how) - expected = pd.DataFrame([[1, 2, np.nan], - [2, 4, np.nan], - [3, 6, 30.0], - [4, 8, 40.0]], - columns=['a', 'b', 'c']) - expected.set_index('a', drop=False, inplace=True) + # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that + # -1 is interpreted as a missing value instead of the last element + df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}) + df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) + result = df1.merge(df2, left_on='key', right_index=True, how=how) + expected = pd.DataFrame([[1.0, 0, 1], + [2.0, 2, 3], + [3.0, 2, 3], + [np.nan, 1, 2], + [np.nan, 3, 4], + [np.nan, 4, 5]], + columns=['a', 'key', 'b']) + expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True) assert_frame_equal(result, expected) def test_merge_right_index_right(self): From e22a6c848637a807c415936c7ae8a7bd53758985 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 28 Jan 2019 15:01:32 +0100 Subject: [PATCH 03/30] Revert "Backport PR #24916: BUG-24212 fix regression in #24897 (#24951)" This reverts commit 84056c52e3f20ab44921b86a8e0f05275bf8ddb4. 
--- doc/source/whatsnew/v0.24.1.rst | 3 -- pandas/core/reshape/merge.py | 45 ++---------------------- pandas/tests/reshape/merge/test_merge.py | 31 ++++++++-------- 3 files changed, 19 insertions(+), 60 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 3ac2ed73ea53f..ee4b7ab62b31a 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -63,9 +63,6 @@ Bug Fixes - - -**Reshaping** - -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) **Other** diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1dd19a7c1514e..e11847d2b8ce2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -757,21 +757,13 @@ def _get_join_info(self): if self.right_index: if len(self.left) > 0: - join_index = self._create_join_index(self.left.index, - self.right.index, - left_indexer, - right_indexer, - how='right') + join_index = self.left.index.take(left_indexer) else: join_index = self.right.index.take(right_indexer) left_indexer = np.array([-1] * len(join_index)) elif self.left_index: if len(self.right) > 0: - join_index = self._create_join_index(self.right.index, - self.left.index, - right_indexer, - left_indexer, - how='left') + join_index = self.right.index.take(right_indexer) else: join_index = self.left.index.take(left_indexer) right_indexer = np.array([-1] * len(join_index)) @@ -782,39 +774,6 @@ def _get_join_info(self): join_index = join_index.astype(object) return join_index, left_indexer, right_indexer - def _create_join_index(self, index, other_index, indexer, - other_indexer, how='left'): - """ - Create a join index by rearranging one index to match another - - Parameters - ---------- - index: Index being rearranged - other_index: Index used to supply values not found in index - indexer: how to rearrange index - how: replacement is only necessary if indexer based on other_index - - 
Returns - ------- - join_index - """ - join_index = index.take(indexer) - if (self.how in (how, 'outer') and - not isinstance(other_index, MultiIndex)): - # if final index requires values in other_index but not target - # index, indexer may hold missing (-1) values, causing Index.take - # to take the final value in target index - mask = indexer == -1 - if np.any(mask): - # if values missing (-1) from target index, - # take from other_index instead - join_list = join_index.to_numpy() - other_list = other_index.take(other_indexer).to_numpy() - join_list[mask] = other_list[mask] - join_index = Index(join_list, dtype=join_index.dtype, - name=join_index.name) - return join_index - def _get_merge_keys(self): """ Note: has side effects (copy/delete key columns) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c17c301968269..f0a3ddc8ce8a4 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -939,22 +939,25 @@ def test_merge_two_empty_df_no_division_error(self): with np.errstate(divide='raise'): merge(a, a, on=('a', 'b')) - @pytest.mark.parametrize('how', ['right', 'outer']) + @pytest.mark.parametrize('how', ['left', 'outer']) + @pytest.mark.xfail(reason="GH-24897") def test_merge_on_index_with_more_values(self, how): # GH 24212 - # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that - # -1 is interpreted as a missing value instead of the last element - df1 = pd.DataFrame({'a': [1, 2, 3], 'key': [0, 2, 2]}) - df2 = pd.DataFrame({'b': [1, 2, 3, 4, 5]}) - result = df1.merge(df2, left_on='key', right_index=True, how=how) - expected = pd.DataFrame([[1.0, 0, 1], - [2.0, 2, 3], - [3.0, 2, 3], - [np.nan, 1, 2], - [np.nan, 3, 4], - [np.nan, 4, 5]], - columns=['a', 'key', 'b']) - expected.set_index(Int64Index([0, 1, 2, 1, 3, 4]), inplace=True) + # pd.merge gets [-1, -1, 0, 1] as right_indexer, ensure that -1 is + # interpreted as a missing value instead of the last 
element + df1 = pd.DataFrame([[1, 2], [2, 4], [3, 6], [4, 8]], + columns=['a', 'b']) + df2 = pd.DataFrame([[3, 30], [4, 40]], + columns=['a', 'c']) + df1.set_index('a', drop=False, inplace=True) + df2.set_index('a', inplace=True) + result = pd.merge(df1, df2, left_index=True, right_on='a', how=how) + expected = pd.DataFrame([[1, 2, np.nan], + [2, 4, np.nan], + [3, 6, 30.0], + [4, 8, 40.0]], + columns=['a', 'b', 'c']) + expected.set_index('a', drop=False, inplace=True) assert_frame_equal(result, expected) def test_merge_right_index_right(self): From 638ac195af5974983c45b74a3de1332f5322d501 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 28 Jan 2019 08:17:36 -0800 Subject: [PATCH 04/30] Backport PR #24965: Fixed itertuples usage in to_dict (#24978) --- doc/source/whatsnew/v0.24.1.rst | 7 +++++++ pandas/core/frame.py | 15 +++++++++------ pandas/tests/frame/test_convert_to.py | 14 ++++++++++++++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index ee4b7ab62b31a..16044fb00d4f5 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -15,6 +15,13 @@ Whats New in 0.24.1 (February XX, 2019) These are the changes in pandas 0.24.1. See :ref:`release` for a full changelog including other versions of pandas. +.. _whatsnew_0241.regressions: + +Fixed Regressions +^^^^^^^^^^^^^^^^^ + +- Bug in :meth:`DataFrame.itertuples` with ``records`` orient raising an ``AttributeError`` when the ``DataFrame`` contained more than 255 columns (:issue:`24939`) +- Bug in :meth:`DataFrame.itertuples` orient converting integer column names to strings prepended with an underscore (:issue:`24940`) .. 
_whatsnew_0241.enhancements: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b4f79bda25517..28c6f3c23a3ce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -847,7 +847,7 @@ def itertuples(self, index=True, name="Pandas"): ---------- index : bool, default True If True, return the index as the first element of the tuple. - name : str, default "Pandas" + name : str or None, default "Pandas" The name of the returned namedtuples or None to return regular tuples. @@ -1290,23 +1290,26 @@ def to_dict(self, orient='dict', into=dict): ('columns', self.columns.tolist()), ('data', [ list(map(com.maybe_box_datetimelike, t)) - for t in self.itertuples(index=False)] - ))) + for t in self.itertuples(index=False, name=None) + ]))) elif orient.lower().startswith('s'): return into_c((k, com.maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): + columns = self.columns.tolist() + rows = (dict(zip(columns, row)) + for row in self.itertuples(index=False, name=None)) return [ into_c((k, com.maybe_box_datetimelike(v)) - for k, v in compat.iteritems(row._asdict())) - for row in self.itertuples(index=False)] + for k, v in compat.iteritems(row)) + for row in rows] elif orient.lower().startswith('i'): if not self.index.is_unique: raise ValueError( "DataFrame index must be unique for orient='index'." 
) return into_c((t[0], dict(zip(self.columns, t[1:]))) - for t in self.itertuples()) + for t in self.itertuples(name=None)) else: raise ValueError("orient '{o}' not understood".format(o=orient)) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index ddf85136126a1..7b98395dd6dec 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -488,3 +488,17 @@ def test_to_dict_index_dtypes(self, into, expected): result = DataFrame.from_dict(result, orient='index')[cols] expected = DataFrame.from_dict(expected, orient='index')[cols] tm.assert_frame_equal(result, expected) + + def test_to_dict_numeric_names(self): + # https://github.com/pandas-dev/pandas/issues/24940 + df = DataFrame({str(i): [i] for i in range(5)}) + result = set(df.to_dict('records')[0].keys()) + expected = set(df.columns) + assert result == expected + + def test_to_dict_wide(self): + # https://github.com/pandas-dev/pandas/issues/24939 + df = DataFrame({('A_{:d}'.format(i)): [i] for i in range(256)}) + result = df.to_dict('records')[0] + expected = {'A_{:d}'.format(i): i for i in range(256)} + assert result == expected From 72dc33fa64459d3e3f4a95925fe86c8e95be7908 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Jan 2019 05:26:16 -0800 Subject: [PATCH 05/30] Backport PR #24989: DOC: Document breaking change to read_csv (#24996) --- doc/source/user_guide/io.rst | 30 +++++++++++++++++++++ doc/source/whatsnew/v0.24.0.rst | 46 +++++++++++++++++++++++++++++++++ pandas/io/parsers.py | 11 +++++--- 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 58e1b2370c7c8..b23a0f10e9e2b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -989,6 +989,36 @@ a single date rather than the entire array. os.remove('tmp.csv') + +.. 
_io.csv.mixed_timezones: + +Parsing a CSV with mixed Timezones +++++++++++++++++++++++++++++++++++ + +Pandas cannot natively represent a column or index with mixed timezones. If your CSV +file contains columns with a mixture of timezones, the default result will be +an object-dtype column with strings, even with ``parse_dates``. + + +.. ipython:: python + + content = """\ + a + 2000-01-01T00:00:00+05:00 + 2000-01-01T00:00:00+06:00""" + df = pd.read_csv(StringIO(content), parse_dates=['a']) + df['a'] + +To parse the mixed-timezone values as a datetime column, pass a partially-applied +:func:`to_datetime` with ``utc=True`` as the ``date_parser``. + +.. ipython:: python + + df = pd.read_csv(StringIO(content), parse_dates=['a'], + date_parser=lambda col: pd.to_datetime(col, utc=True)) + df['a'] + + .. _io.dayfirst: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 16319a3b83ca4..a49ea2cf493a6 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -648,6 +648,52 @@ that the dates have been converted to UTC pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) + +.. _whatsnew_0240.api_breaking.read_csv_mixed_tz: + +Parsing mixed-timezones with :func:`read_csv` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_csv` no longer silently converts mixed-timezone columns to UTC (:issue:`24987`). + +*Previous Behavior* + +.. code-block:: python + + >>> import io + >>> content = """\ + ... a + ... 2000-01-01T00:00:00+05:00 + ... 2000-01-01T00:00:00+06:00""" + >>> df = pd.read_csv(io.StringIO(content), parse_dates=['a']) + >>> df.a + 0 1999-12-31 19:00:00 + 1 1999-12-31 18:00:00 + Name: a, dtype: datetime64[ns] + +*New Behavior* + +.. 
ipython:: python + + import io + content = """\ + a + 2000-01-01T00:00:00+05:00 + 2000-01-01T00:00:00+06:00""" + df = pd.read_csv(io.StringIO(content), parse_dates=['a']) + df.a + +As can be seen, the ``dtype`` is object; each value in the column is a string. +To convert the strings to an array of datetimes, the ``date_parser`` argument + +.. ipython:: python + + df = pd.read_csv(io.StringIO(content), parse_dates=['a'], + date_parser=lambda col: pd.to_datetime(col, utc=True)) + df.a + +See :ref:`whatsnew_0240.api.timezone_offset_parsing` for more. + .. _whatsnew_0240.api_breaking.period_end_time: Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b31d3f665f47f..4163a571df800 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -203,9 +203,14 @@ * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call result 'foo' - If a column or index contains an unparseable date, the entire column or - index will be returned unaltered as an object data type. For non-standard - datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv`` + If a column or index cannot be represented as an array of datetimes, + say because of an unparseable value or a mixture of timezones, the column + or index will be returned unaltered as an object data type. For + non-standard datetime parsing, use ``pd.to_datetime`` after + ``pd.read_csv``. To parse an index or column with a mixture of timezones, + specify ``date_parser`` to be a partially-applied + :func:`pandas.to_datetime` with ``utc=True``. See + :ref:`io.csv.mixed_timezones` for more. Note: A fast-path exists for iso8601-formatted dates. 
infer_datetime_format : bool, default False From fd1c66c74f87ee64b1c492f4f0d174a6f0b8ea0a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 29 Jan 2019 05:26:29 -0800 Subject: [PATCH 06/30] Backport PR #24964: DEPR: Fixed warning for implicit registration (#24997) --- doc/source/whatsnew/v0.24.1.rst | 5 +++++ pandas/plotting/_core.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 16044fb00d4f5..7647e199030d2 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -71,6 +71,11 @@ Bug Fixes - +**Visualization** + +- Fixed the warning for implicitly registered matplotlib converters not showing. See :ref:`whatsnew_0211.converters` for more (:issue:`24963`). + + **Other** - diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index e543ab88f53b2..85549bafa8dc0 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -39,7 +39,7 @@ else: _HAS_MPL = True if get_option('plotting.matplotlib.register_converters'): - _converter.register(explicit=True) + _converter.register(explicit=False) def _raise_if_no_mpl(): From d54c3a5944502ac1e880211bef0c46498424edeb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 29 Jan 2019 10:46:51 -0600 Subject: [PATCH 07/30] Backport PR #24973: fix for BUG: grouping with tz-aware: Values falls after last bin (#25005) --- doc/source/whatsnew/v0.24.1.rst | 3 ++ pandas/core/resample.py | 31 ++++++++++---------- pandas/tests/resample/test_datetime_index.py | 15 ++++++++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 7647e199030d2..8f4c3982c745f 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -70,6 +70,9 @@ Bug Fixes - - +**Reshaping** + +- Bug in :meth:`DataFrame.groupby` with :class:`Grouper` when there is a 
time change (DST) and grouping frequency is ``'1d'`` (:issue:`24972`) **Visualization** diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6822225273906..7723827ff478a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -30,8 +30,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.tseries.frequencies import to_offset -from pandas.tseries.offsets import ( - DateOffset, Day, Nano, Tick, delta_to_nanoseconds) +from pandas.tseries.offsets import DateOffset, Day, Nano, Tick _shared_docs_kwargs = dict() @@ -1613,20 +1612,20 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): A tuple of length 2, containing the adjusted pd.Timestamp objects. """ if isinstance(offset, Tick): - is_day = isinstance(offset, Day) - day_nanos = delta_to_nanoseconds(timedelta(1)) - - # #1165 and #24127 - if (is_day and not offset.nanos % day_nanos) or not is_day: - first, last = _adjust_dates_anchored(first, last, offset, - closed=closed, base=base) - if is_day and first.tz is not None: - # _adjust_dates_anchored assumes 'D' means 24H, but first/last - # might contain a DST transition (23H, 24H, or 25H). - # Ensure first/last snap to midnight. - first = first.normalize() - last = last.normalize() - return first, last + if isinstance(offset, Day): + # _adjust_dates_anchored assumes 'D' means 24H, but first/last + # might contain a DST transition (23H, 24H, or 25H). 
+ # So "pretend" the dates are naive when adjusting the endpoints + tz = first.tz + first = first.tz_localize(None) + last = last.tz_localize(None) + + first, last = _adjust_dates_anchored(first, last, offset, + closed=closed, base=base) + if isinstance(offset, Day): + first = first.tz_localize(tz) + last = last.tz_localize(tz) + return first, last else: first = first.normalize() diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 73995cbe79ecd..b743aeecdc756 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1276,6 +1276,21 @@ def test_resample_across_dst(): assert_frame_equal(result, expected) +def test_groupby_with_dst_time_change(): + # GH 24972 + index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], + tz='UTC').tz_convert('America/Chicago') + + df = pd.DataFrame([1, 2], index=index) + result = df.groupby(pd.Grouper(freq='1d')).last() + expected_index_values = pd.date_range('2016-11-02', '2016-11-24', + freq='d', tz='America/Chicago') + + index = pd.DatetimeIndex(expected_index_values) + expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) + assert_frame_equal(result, expected) + + def test_resample_dst_anchor(): # 5172 dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern') From e3cc0b18ea933613af27fefeb07431cc305030a7 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 30 Jan 2019 04:50:03 -0800 Subject: [PATCH 08/30] Backport PR #24967: REGR: Preserve order by default in Index.difference (#25013) --- doc/source/whatsnew/v0.24.1.rst | 1 + pandas/core/indexes/base.py | 8 ++++++-- pandas/core/indexes/datetimes.py | 10 +++++++++- pandas/core/indexes/interval.py | 6 +++--- pandas/core/indexes/multi.py | 6 +++++- pandas/core/indexes/range.py | 6 +++++- pandas/tests/indexes/test_base.py | 22 +++++++++++++++++----- 7 files changed, 46 insertions(+), 
13 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 8f4c3982c745f..828c35c10e958 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -22,6 +22,7 @@ Fixed Regressions - Bug in :meth:`DataFrame.itertuples` with ``records`` orient raising an ``AttributeError`` when the ``DataFrame`` contained more than 255 columns (:issue:`24939`) - Bug in :meth:`DataFrame.itertuples` orient converting integer column names to strings prepended with an underscore (:issue:`24940`) +- Fixed regression in :class:`Index.intersection` incorrectly sorting the values by default (:issue:`24959`). .. _whatsnew_0241.enhancements: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 767da81c5c43a..3d176012df22b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2333,7 +2333,7 @@ def union(self, other, sort=True): def _wrap_setop_result(self, other, result): return self._constructor(result, name=get_op_result_name(self, other)) - def intersection(self, other, sort=True): + def intersection(self, other, sort=False): """ Form the intersection of two Index objects. @@ -2342,11 +2342,15 @@ def intersection(self, other, sort=True): Parameters ---------- other : Index or array-like - sort : bool, default True + sort : bool, default False Sort the resulting index if possible .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default from ``True`` to ``False``. 
+ Returns ------- intersection : Index diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index cc373c06efcc9..ef941ab87ba12 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -594,7 +594,7 @@ def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) return self._shallow_copy(result, name=name, freq=None, tz=self.tz) - def intersection(self, other, sort=True): + def intersection(self, other, sort=False): """ Specialized intersection for DatetimeIndex objects. May be much faster than Index.intersection @@ -602,6 +602,14 @@ def intersection(self, other, sort=True): Parameters ---------- other : DatetimeIndex or array-like + sort : bool, default True + Sort the resulting index if possible. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default from ``True`` to ``False``. Returns ------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 0210560aaa21f..736de94991181 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1093,8 +1093,8 @@ def equals(self, other): def overlaps(self, other): return self._data.overlaps(other) - def _setop(op_name): - def func(self, other, sort=True): + def _setop(op_name, sort=True): + def func(self, other, sort=sort): other = self._as_like_interval_index(other) # GH 19016: ensure set op will not return a prohibited dtype @@ -1128,7 +1128,7 @@ def is_all_dates(self): return False union = _setop('union') - intersection = _setop('intersection') + intersection = _setop('intersection', sort=False) difference = _setop('difference') symmetric_difference = _setop('symmetric_difference') diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e4d01a40bd181..16af3fe8eef26 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2910,7 +2910,7 @@ def union(self, other, sort=True): return 
MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def intersection(self, other, sort=True): + def intersection(self, other, sort=False): """ Form the intersection of two MultiIndex objects. @@ -2922,6 +2922,10 @@ def intersection(self, other, sort=True): .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default from ``True`` to ``False``. + Returns ------- Index diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index ebf5b279563cf..e17a6a682af40 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -343,7 +343,7 @@ def equals(self, other): return super(RangeIndex, self).equals(other) - def intersection(self, other, sort=True): + def intersection(self, other, sort=False): """ Form the intersection of two Index objects. @@ -355,6 +355,10 @@ def intersection(self, other, sort=True): .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default from ``True`` to ``False``. + Returns ------- intersection : Index diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f3e9d835c7391..20e439de46bde 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -765,6 +765,11 @@ def test_intersect_str_dates(self, sort): assert len(result) == 0 + def test_intersect_nosort(self): + result = pd.Index(['c', 'b', 'a']).intersection(['b', 'a']) + expected = pd.Index(['b', 'a']) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("sort", [True, False]) def test_chained_union(self, sort): # Chained unions handles names correctly @@ -1595,20 +1600,27 @@ def test_drop_tuple(self, values, to_drop): for drop_me in to_drop[1], [to_drop[1]]: pytest.raises(KeyError, removed.drop, drop_me) - @pytest.mark.parametrize("method,expected", [ + @pytest.mark.parametrize("method,expected,sort", [ + ('intersection', np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], + dtype=[('num', int), ('let', 'a1')]), + 
False), + ('intersection', np.array([(1, 'A'), (1, 'B'), (2, 'A'), (2, 'B')], - dtype=[('num', int), ('let', 'a1')])), + dtype=[('num', int), ('let', 'a1')]), + True), + ('union', np.array([(1, 'A'), (1, 'B'), (1, 'C'), (2, 'A'), (2, 'B'), - (2, 'C')], dtype=[('num', int), ('let', 'a1')])) + (2, 'C')], dtype=[('num', int), ('let', 'a1')]), + True) ]) - def test_tuple_union_bug(self, method, expected): + def test_tuple_union_bug(self, method, expected, sort): index1 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], dtype=[('num', int), ('let', 'a1')])) index2 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B'), (1, 'C'), (2, 'C')], dtype=[('num', int), ('let', 'a1')])) - result = getattr(index1, method)(index2) + result = getattr(index1, method)(index2, sort=sort) assert result.ndim == 1 expected = Index(expected) From c228597c78978dcada5db260a69f2d3230ee51d9 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 30 Jan 2019 05:18:38 -0800 Subject: [PATCH 09/30] Backport PR #24961: fix+test to_timedelta('NaT', box=False) (#25025) --- doc/source/whatsnew/v0.24.1.rst | 2 +- pandas/core/tools/timedeltas.py | 3 ++- pandas/tests/scalar/timedelta/test_timedelta.py | 9 +++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 828c35c10e958..57fdff041db28 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -66,7 +66,7 @@ Bug Fixes - **Timedelta** - +- Bug in :func:`to_timedelta` with `box=False` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`) - - - diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index e3428146b91d8..ddd21d0f62d08 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -120,7 +120,8 @@ def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): 
try: result = Timedelta(r, unit) if not box: - result = result.asm8 + # explicitly view as timedelta64 for case when result is pd.NaT + result = result.asm8.view('timedelta64[ns]') except ValueError: if errors == 'raise': raise diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 9b5fdfb06a9fa..e1838e0160fec 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -309,8 +309,13 @@ def test_iso_conversion(self): assert to_timedelta('P0DT0H0M1S') == expected def test_nat_converters(self): - assert to_timedelta('nat', box=False).astype('int64') == iNaT - assert to_timedelta('nan', box=False).astype('int64') == iNaT + result = to_timedelta('nat', box=False) + assert result.dtype.kind == 'm' + assert result.astype('int64') == iNaT + + result = to_timedelta('nan', box=False) + assert result.dtype.kind == 'm' + assert result.astype('int64') == iNaT @pytest.mark.parametrize('units, np_unit', [(['Y', 'y'], 'Y'), From 795653371c0fe83a077a630d18f561cf00524a1a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 30 Jan 2019 14:27:19 -0800 Subject: [PATCH 10/30] Backport PR #25033: BUG: Fixed merging on tz-aware (#25041) --- doc/source/whatsnew/v0.24.1.rst | 1 + pandas/core/internals/concat.py | 6 ++++-- pandas/tests/reshape/merge/test_merge.py | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 57fdff041db28..047404e93914b 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -23,6 +23,7 @@ Fixed Regressions - Bug in :meth:`DataFrame.itertuples` with ``records`` orient raising an ``AttributeError`` when the ``DataFrame`` contained more than 255 columns (:issue:`24939`) - Bug in :meth:`DataFrame.itertuples` orient converting integer column names to strings 
prepended with an underscore (:issue:`24940`) - Fixed regression in :class:`Index.intersection` incorrectly sorting the values by default (:issue:`24959`). +- Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`). .. _whatsnew_0241.enhancements: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 4a16707a376e9..640587b7f9f31 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -183,7 +183,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): is_datetime64tz_dtype(empty_dtype)): if self.block is None: array = empty_dtype.construct_array_type() - return array(np.full(self.shape[1], fill_value), + return array(np.full(self.shape[1], fill_value.value), dtype=empty_dtype) pass elif getattr(self.block, 'is_categorical', False): @@ -335,8 +335,10 @@ def get_empty_dtype_and_na(join_units): elif 'category' in upcast_classes: return np.dtype(np.object_), np.nan elif 'datetimetz' in upcast_classes: + # GH-25014. We use NaT instead of iNaT, since this eventually + # ends up in DatetimeArray.take, which does not allow iNaT. 
dtype = upcast_classes['datetimetz'] - return dtype[0], tslibs.iNaT + return dtype[0], tslibs.NaT elif 'datetime' in upcast_classes: return np.dtype('M8[ns]'), tslibs.iNaT elif 'timedelta' in upcast_classes: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f0a3ddc8ce8a4..1e60fdbebfeb3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -616,6 +616,24 @@ def test_merge_on_datetime64tz(self): assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]' assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]' + def test_merge_on_datetime64tz_empty(self): + # https://github.com/pandas-dev/pandas/issues/25014 + dtz = pd.DatetimeTZDtype(tz='UTC') + right = pd.DataFrame({'date': [pd.Timestamp('2018', tz=dtz.tz)], + 'value': [4.0], + 'date2': [pd.Timestamp('2019', tz=dtz.tz)]}, + columns=['date', 'value', 'date2']) + left = right[:0] + result = left.merge(right, on='date') + expected = pd.DataFrame({ + 'value_x': pd.Series(dtype=float), + 'date2_x': pd.Series(dtype=dtz), + 'date': pd.Series(dtype=dtz), + 'value_y': pd.Series(dtype=float), + 'date2_y': pd.Series(dtype=dtz), + }, columns=['value_x', 'date2_x', 'date', 'value_y', 'date2_y']) + tm.assert_frame_equal(result, expected) + def test_merge_datetime64tz_with_dst_transition(self): # GH 18885 df1 = pd.DataFrame(pd.date_range( From 722bb79ebe20ceabaedf79b0ae22486a15dc7805 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 30 Jan 2019 14:28:45 -0800 Subject: [PATCH 11/30] Backport PR #24993: Test nested PandasArray (#25042) --- pandas/core/arrays/numpy_.py | 2 +- pandas/tests/extension/numpy_/__init__.py | 0 pandas/tests/extension/numpy_/conftest.py | 38 +++ .../extension/{ => numpy_}/test_numpy.py | 36 +-- .../extension/numpy_/test_numpy_nested.py | 286 ++++++++++++++++++ 5 files changed, 326 insertions(+), 36 deletions(-) create mode 100644 
pandas/tests/extension/numpy_/__init__.py create mode 100644 pandas/tests/extension/numpy_/conftest.py rename pandas/tests/extension/{ => numpy_}/test_numpy.py (84%) create mode 100644 pandas/tests/extension/numpy_/test_numpy_nested.py diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 47517782e2bbf..791ff44303e96 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -222,7 +222,7 @@ def __getitem__(self, item): item = item._ndarray result = self._ndarray[item] - if not lib.is_scalar(result): + if not lib.is_scalar(item): result = type(self)(result) return result diff --git a/pandas/tests/extension/numpy_/__init__.py b/pandas/tests/extension/numpy_/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/numpy_/conftest.py b/pandas/tests/extension/numpy_/conftest.py new file mode 100644 index 0000000000000..daa93571c2957 --- /dev/null +++ b/pandas/tests/extension/numpy_/conftest.py @@ -0,0 +1,38 @@ +import numpy as np +import pytest + +from pandas.core.arrays.numpy_ import PandasArray + + +@pytest.fixture +def allow_in_pandas(monkeypatch): + """ + A monkeypatch to tell pandas to let us in. + + By default, passing a PandasArray to an index / series / frame + constructor will unbox that PandasArray to an ndarray, and treat + it as a non-EA column. We don't want people using EAs without + reason. + + The mechanism for this is a check against ABCPandasArray + in each constructor. + + But, for testing, we need to allow them in pandas. So we patch + the _typ of PandasArray, so that we evade the ABCPandasArray + check. 
+ """ + with monkeypatch.context() as m: + m.setattr(PandasArray, '_typ', 'extension') + yield + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def na_cmp(): + def cmp(a, b): + return np.isnan(a) and np.isnan(b) + return cmp diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/numpy_/test_numpy.py similarity index 84% rename from pandas/tests/extension/test_numpy.py rename to pandas/tests/extension/numpy_/test_numpy.py index 7ca6882c7441b..4c93d5ee0b9d7 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/numpy_/test_numpy.py @@ -6,7 +6,7 @@ from pandas.core.arrays.numpy_ import PandasArray, PandasDtype import pandas.util.testing as tm -from . import base +from .. import base @pytest.fixture @@ -14,28 +14,6 @@ def dtype(): return PandasDtype(np.dtype('float')) -@pytest.fixture -def allow_in_pandas(monkeypatch): - """ - A monkeypatch to tells pandas to let us in. - - By default, passing a PandasArray to an index / series / frame - constructor will unbox that PandasArray to an ndarray, and treat - it as a non-EA column. We don't want people using EAs without - reason. - - The mechanism for this is a check against ABCPandasArray - in each constructor. - - But, for testing, we need to allow them in pandas. So we patch - the _typ of PandasArray, so that we evade the ABCPandasArray - check. - """ - with monkeypatch.context() as m: - m.setattr(PandasArray, '_typ', 'extension') - yield - - @pytest.fixture def data(allow_in_pandas, dtype): return PandasArray(np.arange(1, 101, dtype=dtype._dtype)) @@ -46,18 +24,6 @@ def data_missing(allow_in_pandas): return PandasArray(np.array([np.nan, 1.0])) -@pytest.fixture -def na_value(): - return np.nan - - -@pytest.fixture -def na_cmp(): - def cmp(a, b): - return np.isnan(a) and np.isnan(b) - return cmp - - @pytest.fixture def data_for_sorting(allow_in_pandas): """Length-3 array with a known sort order. 
diff --git a/pandas/tests/extension/numpy_/test_numpy_nested.py b/pandas/tests/extension/numpy_/test_numpy_nested.py new file mode 100644 index 0000000000000..cf9b34dd08798 --- /dev/null +++ b/pandas/tests/extension/numpy_/test_numpy_nested.py @@ -0,0 +1,286 @@ +""" +Tests for PandasArray with nested data. Users typically won't create +these objects via `pd.array`, but they can show up through `.array` +on a Series with nested data. + +We partition these tests into their own file, as many of the base +tests fail, as they aren't appropriate for nested data. It is easier +to have a seperate file with its own data generating fixtures, than +trying to skip based upon the value of a fixture. +""" +import pytest + +import pandas as pd +from pandas.core.arrays.numpy_ import PandasArray, PandasDtype + +from .. import base + +# For NumPy <1.16, np.array([np.nan, (1,)]) raises +# ValueError: setting an array element with a sequence. +np = pytest.importorskip('numpy', minversion='1.16.0') + + +@pytest.fixture +def dtype(): + return PandasDtype(np.dtype('object')) + + +@pytest.fixture +def data(allow_in_pandas, dtype): + return pd.Series([(i,) for i in range(100)]).array + + +@pytest.fixture +def data_missing(allow_in_pandas): + return PandasArray(np.array([np.nan, (1,)])) + + +@pytest.fixture +def data_for_sorting(allow_in_pandas): + """Length-3 array with a known sort order. + + This should be three items [B, C, A] with + A < B < C + """ + # Use an empty tuple for first element, then remove, + # to disable np.array's shape inference. + return PandasArray( + np.array([(), (2,), (3,), (1,)])[1:] + ) + + +@pytest.fixture +def data_missing_for_sorting(allow_in_pandas): + """Length-3 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + return PandasArray( + np.array([(1,), np.nan, (0,)]) + ) + + +@pytest.fixture +def data_for_grouping(allow_in_pandas): + """Data for factorization, grouping, and unique tests. 
+ + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + a, b, c = (1,), (2,), (3,) + return PandasArray(np.array( + [b, b, np.nan, np.nan, a, a, b, c] + )) + + +skip_nested = pytest.mark.skip(reason="Skipping for nested PandasArray") + + +class BaseNumPyTests(object): + pass + + +class TestCasting(BaseNumPyTests, base.BaseCastingTests): + + @skip_nested + def test_astype_str(self, data): + pass + + +class TestConstructors(BaseNumPyTests, base.BaseConstructorsTests): + @pytest.mark.skip(reason="We don't register our dtype") + # We don't want to register. This test should probably be split in two. + def test_from_dtype(self, data): + pass + + @skip_nested + def test_array_from_scalars(self, data): + pass + + +class TestDtype(BaseNumPyTests, base.BaseDtypeTests): + + @pytest.mark.skip(reason="Incorrect expected.") + # we unsurprisingly clash with a NumPy name. + def test_check_dtype(self, data): + pass + + +class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): + + @skip_nested + def test_getitem_scalar(self, data): + pass + + @skip_nested + def test_take_series(self, data): + pass + + +class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): + @skip_nested + def test_groupby_extension_apply(self, data_for_grouping, op): + pass + + +class TestInterface(BaseNumPyTests, base.BaseInterfaceTests): + @skip_nested + def test_array_interface(self, data): + # NumPy array shape inference + pass + + +class TestMethods(BaseNumPyTests, base.BaseMethodsTests): + + @pytest.mark.skip(reason="TODO: remove?") + def test_value_counts(self, all_data, dropna): + pass + + @pytest.mark.skip(reason="Incorrect expected") + # We have a bool dtype, so the result is an ExtensionArray + # but expected is not + def test_combine_le(self, data_repeated): + super(TestMethods, self).test_combine_le(data_repeated) + + @skip_nested + def test_combine_add(self, data_repeated): + # Not numeric + pass + + @skip_nested + def test_shift_fill_value(self, data): + 
# np.array shape inference. Shift implementation fails. + super().test_shift_fill_value(data) + + @skip_nested + def test_unique(self, data, box, method): + # Fails creating expected + pass + + @skip_nested + def test_fillna_copy_frame(self, data_missing): + # The "scalar" for this array isn't a scalar. + pass + + @skip_nested + def test_fillna_copy_series(self, data_missing): + # The "scalar" for this array isn't a scalar. + pass + + @skip_nested + def test_hash_pandas_object_works(self, data, as_frame): + # ndarray of tuples not hashable + pass + + @skip_nested + def test_searchsorted(self, data_for_sorting, as_series): + # Test setup fails. + pass + + @skip_nested + def test_where_series(self, data, na_value, as_frame): + # Test setup fails. + pass + + @skip_nested + def test_repeat(self, data, repeats, as_series, use_numpy): + # Fails creating expected + pass + + +class TestPrinting(BaseNumPyTests, base.BasePrintingTests): + pass + + +class TestMissing(BaseNumPyTests, base.BaseMissingTests): + + @skip_nested + def test_fillna_scalar(self, data_missing): + # Non-scalar "scalar" values. + pass + + @skip_nested + def test_fillna_series_method(self, data_missing, method): + # Non-scalar "scalar" values. + pass + + @skip_nested + def test_fillna_series(self, data_missing): + # Non-scalar "scalar" values. + pass + + @skip_nested + def test_fillna_frame(self, data_missing): + # Non-scalar "scalar" values. + pass + + +class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): + + @pytest.mark.skip("Incorrect parent test") + # not actually a mixed concat, since we concat int and int. 
+ def test_concat_mixed_dtypes(self, data): + super(TestReshaping, self).test_concat_mixed_dtypes(data) + + @skip_nested + def test_merge(self, data, na_value): + # Fails creating expected + pass + + @skip_nested + def test_merge_on_extension_array(self, data): + # Fails creating expected + pass + + @skip_nested + def test_merge_on_extension_array_duplicates(self, data): + # Fails creating expected + pass + + +class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): + + @skip_nested + def test_setitem_scalar_series(self, data, box_in_series): + pass + + @skip_nested + def test_setitem_sequence(self, data, box_in_series): + pass + + @skip_nested + def test_setitem_sequence_mismatched_length_raises(self, data, as_array): + pass + + @skip_nested + def test_setitem_sequence_broadcasts(self, data, box_in_series): + pass + + @skip_nested + def test_setitem_loc_scalar_mixed(self, data): + pass + + @skip_nested + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + pass + + @skip_nested + def test_setitem_iloc_scalar_mixed(self, data): + pass + + @skip_nested + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + pass + + @skip_nested + def test_setitem_mask_broadcast(self, data, setter): + pass + + @skip_nested + def test_setitem_scalar_key_sequence_raise(self, data): + pass + + +# Skip Arithmetics, NumericReduce, BooleanReduce, Parsing From e3634b1fcc483e5dda3a7b5b166690893a0ac4da Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Jan 2019 12:17:46 -0800 Subject: [PATCH 12/30] Backport PR #25039: BUG: avoid usage in_qtconsole for recent IPython versions (#25054) --- doc/source/whatsnew/v0.24.1.rst | 2 +- pandas/core/frame.py | 13 ++++++++++--- pandas/tests/io/formats/test_format.py | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 047404e93914b..521319c55a503 100644 --- 
a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -83,7 +83,7 @@ Bug Fixes **Other** -- +- Fixed AttributeError when printing a DataFrame's HTML repr after accessing the IPython config object (:issue:`25036`) - .. _whatsnew_0.241.contributors: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 28c6f3c23a3ce..5b462b949abf9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -17,6 +17,7 @@ import itertools import sys import warnings +from distutils.version import LooseVersion from textwrap import dedent import numpy as np @@ -646,9 +647,15 @@ def _repr_html_(self): # XXX: In IPython 3.x and above, the Qt console will not attempt to # display HTML, so this check can be removed when support for # IPython 2.x is no longer needed. - if console.in_qtconsole(): - # 'HTML output is disabled in QtConsole' - return None + try: + import IPython + except ImportError: + pass + else: + if LooseVersion(IPython.__version__) < LooseVersion('3.0'): + if console.in_qtconsole(): + # 'HTML output is disabled in QtConsole' + return None if self._info_repr(): buf = StringIO(u("")) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 5d922ccaf1fd5..b0cf5a2f17609 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -12,6 +12,7 @@ import os import re import sys +import textwrap import warnings import dateutil @@ -2777,3 +2778,17 @@ def test_format_percentiles(): fmt.format_percentiles([2, 0.1, 0.5]) with pytest.raises(ValueError, match=msg): fmt.format_percentiles([0.1, 0.5, 'a']) + + +def test_repr_html_ipython_config(ip): + code = textwrap.dedent("""\ + import pandas as pd + df = pd.DataFrame({"A": [1, 2]}) + df._repr_html_() + + cfg = get_ipython().config + cfg['IPKernelApp']['parent_appname'] + df._repr_html_() + """) + result = ip.run_cell(code) + assert not result.error_in_exec From 4f865c5cb505650cd55a23946fbb44ca9d577c55 Mon Sep 17 00:00:00 
2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 31 Jan 2019 15:31:16 -0800 Subject: [PATCH 13/30] Backport PR #25024: REGR: fix read_sql delegation for queries on MySQL/pymysql (#25064) --- doc/source/whatsnew/v0.24.1.rst | 1 + pandas/io/sql.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 521319c55a503..222963a7ff71a 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -22,6 +22,7 @@ Fixed Regressions - Bug in :meth:`DataFrame.itertuples` with ``records`` orient raising an ``AttributeError`` when the ``DataFrame`` contained more than 255 columns (:issue:`24939`) - Bug in :meth:`DataFrame.itertuples` orient converting integer column names to strings prepended with an underscore (:issue:`24940`) +- Fixed regression in :func:`read_sql` when passing certain queries with MySQL/pymysql (:issue:`24988`). - Fixed regression in :class:`Index.intersection` incorrectly sorting the values by default (:issue:`24959`). - Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`). 
diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 5d1163b3e0024..aaface5415384 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -381,7 +381,8 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, try: _is_table_name = pandas_sql.has_table(sql) - except (ImportError, AttributeError): + except Exception: + # using generic exception to catch errors from sql drivers (GH24988) _is_table_name = False if _is_table_name: From c21d32f48e6b6efc9257e187793a19e81d2ac306 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 1 Feb 2019 11:06:13 -0800 Subject: [PATCH 14/30] Backport PR #25069: REGR: rename_axis with None should remove axis name (#25079) --- doc/source/whatsnew/v0.24.1.rst | 1 + pandas/core/generic.py | 22 ++++++++++++++++------ pandas/tests/frame/test_alter_axes.py | 20 ++++++++++++++++++++ pandas/tests/series/test_alter_axes.py | 11 +++++++++++ 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 222963a7ff71a..c8b5417a5b77f 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -25,6 +25,7 @@ Fixed Regressions - Fixed regression in :func:`read_sql` when passing certain queries with MySQL/pymysql (:issue:`24988`). - Fixed regression in :class:`Index.intersection` incorrectly sorting the values by default (:issue:`24959`). - Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`). +- Fixed regression in :meth:`Series.rename_axis` and :meth:`DataFrame.rename_axis` where passing ``None`` failed to remove the axis name (:issue:`25034`) .. 
_whatsnew_0241.enhancements: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2b97661fe9ec3..1d8077873e1ea 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -61,6 +61,10 @@ by : str or list of str Name or list of names to sort by""") +# sentinel value to use as kwarg in place of None when None has special meaning +# and needs to be distinguished from a user explicitly passing None. +sentinel = object() + def _single_replace(self, to_replace, method, inplace, limit): """ @@ -290,11 +294,16 @@ def _construct_axes_dict_for_slice(self, axes=None, **kwargs): d.update(kwargs) return d - def _construct_axes_from_arguments(self, args, kwargs, require_all=False): + def _construct_axes_from_arguments( + self, args, kwargs, require_all=False, sentinel=None): """Construct and returns axes if supplied in args/kwargs. If require_all, raise if all axis arguments are not supplied return a tuple of (axes, kwargs). + + sentinel specifies the default parameter when an axis is not + supplied; useful to distinguish when a user explicitly passes None + in scenarios where None has special meaning. """ # construct the args @@ -322,7 +331,7 @@ def _construct_axes_from_arguments(self, args, kwargs, require_all=False): raise TypeError("not enough/duplicate arguments " "specified!") - axes = {a: kwargs.pop(a, None) for a in self._AXIS_ORDERS} + axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS} return axes, kwargs @classmethod @@ -1089,7 +1098,7 @@ def rename(self, *args, **kwargs): @rewrite_axis_style_signature('mapper', [('copy', True), ('inplace', False)]) - def rename_axis(self, mapper=None, **kwargs): + def rename_axis(self, mapper=sentinel, **kwargs): """ Set the name of the axis for the index or columns. 
@@ -1218,7 +1227,8 @@ class name cat 4 0 monkey 2 2 """ - axes, kwargs = self._construct_axes_from_arguments((), kwargs) + axes, kwargs = self._construct_axes_from_arguments( + (), kwargs, sentinel=sentinel) copy = kwargs.pop('copy', True) inplace = kwargs.pop('inplace', False) axis = kwargs.pop('axis', 0) @@ -1231,7 +1241,7 @@ class name inplace = validate_bool_kwarg(inplace, 'inplace') - if (mapper is not None): + if (mapper is not sentinel): # Use v0.23 behavior if a scalar or list non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not is_dict_like(mapper)) @@ -1254,7 +1264,7 @@ class name for axis in lrange(self._AXIS_LEN): v = axes.get(self._AXIS_NAMES[axis]) - if v is None: + if v is sentinel: continue non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index c2355742199dc..2d1afa2281d44 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -600,6 +600,26 @@ def test_rename_axis_mapper(self): with pytest.raises(TypeError, match='bogus'): df.rename_axis(bogus=None) + @pytest.mark.parametrize('kwargs, rename_index, rename_columns', [ + ({'mapper': None, 'axis': 0}, True, False), + ({'mapper': None, 'axis': 1}, False, True), + ({'index': None}, True, False), + ({'columns': None}, False, True), + ({'index': None, 'columns': None}, True, True), + ({}, False, False)]) + def test_rename_axis_none(self, kwargs, rename_index, rename_columns): + # GH 25034 + index = Index(list('abc'), name='foo') + columns = Index(['col1', 'col2'], name='bar') + data = np.arange(6).reshape(3, 2) + df = DataFrame(data, index, columns) + + result = df.rename_axis(**kwargs) + expected_index = index.rename(None) if rename_index else index + expected_columns = columns.rename(None) if rename_columns else columns + expected = DataFrame(data, expected_index, expected_columns) + tm.assert_frame_equal(result, expected) + def 
test_rename_multiindex(self): tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 04c54bcf8c22c..73adc7d4bf82f 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -258,6 +258,17 @@ def test_rename_axis_inplace(self, datetime_series): assert no_return is None tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('kwargs', [{'mapper': None}, {'index': None}, {}]) + def test_rename_axis_none(self, kwargs): + # GH 25034 + index = Index(list('abc'), name='foo') + df = Series([1, 2, 3], index=index) + + result = df.rename_axis(**kwargs) + expected_index = index.rename(None) if kwargs else index + expected = Series([1, 2, 3], index=expected_index) + tm.assert_series_equal(result, expected) + def test_set_axis_inplace_axes(self, axis_series): # GH14636 ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64') From 5cb622a8a6bf36726ccaa6f39aa63302af533fd9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 1 Feb 2019 14:09:00 -0600 Subject: [PATCH 15/30] DOC: 0.24.1 whatsnew (#25027) --- doc/source/index.rst.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 51487c0d325b5..df2a29a76f3c5 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library. {% endif %} {% if not single_doc -%} - What's New in 0.24.0 + What's New in 0.24.1 install getting_started/index user_guide/index From c397839705603f82f02bc2c32961352b160b80ec Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 1 Feb 2019 21:28:10 +0100 Subject: [PATCH 16/30] Revert "DOC: update DF.set_index (#24762)" This reverts commit e984947ab42b2c95c5acbe37cf2e24786a51980d. 
--- doc/source/whatsnew/v0.24.0.rst | 4 +- pandas/core/frame.py | 64 +++++++++++---------------- pandas/tests/frame/test_alter_axes.py | 53 +++++++++------------- 3 files changed, 48 insertions(+), 73 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a49ea2cf493a6..0777cbaf934b3 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1195,8 +1195,8 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types, - and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) +- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, + has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b462b949abf9..c77782f926a13 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4052,16 +4052,12 @@ def set_index(self, keys, drop=True, append=False, inplace=False, Set the DataFrame index using existing columns. Set the DataFrame index (row labels) using one or more existing - columns or arrays (of the correct length). The index can replace the - existing index or expand on it. + columns. The index can replace the existing index or expand on it. Parameters ---------- - keys : label or array-like or list of labels/arrays - This parameter can be either a single column key, a single array of - the same length as the calling DataFrame, or a list containing an - arbitrary combination of column keys and arrays. Here, "array" - encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. + keys : label or list of label + Name or names of the columns that will be used as the index. drop : bool, default True Delete columns to be used as the new index. 
append : bool, default False @@ -4106,7 +4102,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 7 2013 84 10 2014 31 - Create a MultiIndex using columns 'year' and 'month': + Create a multi-index using columns 'year' and 'month': >>> df.set_index(['year', 'month']) sale @@ -4116,51 +4112,35 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 2013 7 84 2014 10 31 - Create a MultiIndex using an Index and a column: + Create a multi-index using a set of values and a column: - >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) + >>> df.set_index([[1, 2, 3, 4], 'year']) month sale year 1 2012 1 55 2 2014 4 40 3 2013 7 84 4 2014 10 31 - - Create a MultiIndex using two Series: - - >>> s = pd.Series([1, 2, 3, 4]) - >>> df.set_index([s, s**2]) - month year sale - 1 1 1 2012 55 - 2 4 4 2014 40 - 3 9 7 2013 84 - 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, 'inplace') - - err_msg = ('The parameter "keys" may be a column key, one-dimensional ' - 'array, or a list containing only valid column keys and ' - 'one-dimensional arrays.') - - if (is_scalar(keys) or isinstance(keys, tuple) - or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))): - # make sure we have a container of keys/arrays we can iterate over - # tuples can appear as valid column keys! 
+ if not isinstance(keys, list): keys = [keys] - elif not isinstance(keys, list): - raise ValueError(err_msg) missing = [] for col in keys: - if (is_scalar(col) or isinstance(col, tuple)): - # if col is a valid column key, everything is fine - # tuples are always considered keys, never as list-likes - if col not in self: - missing.append(col) - elif (not isinstance(col, (ABCIndexClass, ABCSeries, - np.ndarray, list)) + if (is_scalar(col) or isinstance(col, tuple)) and col in self: + # tuples can be both column keys or list-likes + # if they are valid column keys, everything is fine + continue + elif is_scalar(col) and col not in self: + # tuples that are not column keys are considered list-like, + # not considered missing + missing.append(col) + elif (not is_list_like(col, allow_sets=False) or getattr(col, 'ndim', 1) > 1): - raise ValueError(err_msg) + raise TypeError('The parameter "keys" may only contain a ' + 'combination of valid column keys and ' + 'one-dimensional list-likes') if missing: raise KeyError('{}'.format(missing)) @@ -4193,6 +4173,12 @@ def set_index(self, keys, drop=True, append=False, inplace=False, elif isinstance(col, (list, np.ndarray)): arrays.append(col) names.append(None) + elif (is_list_like(col) + and not (isinstance(col, tuple) and col in self)): + # all other list-likes (but avoid valid column keys) + col = list(col) # ensure iterator do not get read twice etc. 
+ arrays.append(col) + names.append(None) # from here, col can only be a column label else: arrays.append(frame[col]._values) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 2d1afa2281d44..4faff8e75a7d2 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -118,7 +118,7 @@ def test_set_index_after_mutation(self): # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) @pytest.mark.parametrize('box', [Series, Index, np.array, - list, lambda x: [list(x)], + list, tuple, iter, lambda x: [list(x)], lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), (True, 'test'), (False, None)]) @@ -135,7 +135,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, with pytest.raises(KeyError, match=msg): df.set_index(key, drop=drop, append=append) else: - # np.array/list-of-list "forget" the name of B + # np.array/tuple/iter/list-of-list "forget" the name of B name_mi = getattr(key, 'names', None) name = [getattr(key, 'name', None)] if name_mi is None else name_mi @@ -150,7 +150,8 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) - @pytest.mark.parametrize('box', [Series, Index, np.array, list, + @pytest.mark.parametrize('box', [Series, Index, np.array, + list, tuple, iter, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'A'), (True, 'B'), @@ -162,7 +163,7 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, df.index.name = index_name keys = ['A', box(df['B'])] - # np.array/list "forget" the name of B + # np.array/list/tuple/iter "forget" the name of B names = ['A', None if box in [np.array, list, tuple, iter] else 
'B'] result = df.set_index(keys, drop=drop, append=append) @@ -178,10 +179,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, list, + @pytest.mark.parametrize('box2', [Series, Index, np.array, + list, tuple, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, list, + @pytest.mark.parametrize('box1', [Series, Index, np.array, + list, tuple, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -195,6 +198,9 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, keys = [box1(df['A']), box2(df['A'])] result = df.set_index(keys, drop=drop, append=append) + # if either box was iter, the content has been consumed; re-read it + keys = [box1(df['A']), box2(df['A'])] + # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers @@ -202,7 +208,7 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise - # box = list would be interpreted as keys + # box = list would be illegal expected = df.set_index([keys[0]], drop=first_drop, append=append) expected = expected.set_index([keys[1]], drop=drop, append=True) tm.assert_frame_equal(result, expected) @@ -232,7 +238,7 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_raise_keys(self, frame_of_index_cols, 
drop, append): + def test_set_index_raise(self, frame_of_index_cols, drop, append): df = frame_of_index_cols with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): @@ -243,31 +249,14 @@ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): with pytest.raises(KeyError, match='X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) - msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]" - # tuples always raise KeyError - with pytest.raises(KeyError, match=msg): - df.set_index(tuple(df['A']), drop=drop, append=append) - - # also within a list - with pytest.raises(KeyError, match=msg): - df.set_index(['A', df['A'], tuple(df['A'])], - drop=drop, append=append) - - @pytest.mark.parametrize('append', [True, False]) - @pytest.mark.parametrize('drop', [True, False]) - @pytest.mark.parametrize('box', [set, iter]) - def test_set_index_raise_on_type(self, frame_of_index_cols, box, - drop, append): - df = frame_of_index_cols - - msg = 'The parameter "keys" may be a column key, .*' - # forbidden type, e.g. set/tuple/iter - with pytest.raises(ValueError, match=msg): - df.set_index(box(df['A']), drop=drop, append=append) + msg = 'The parameter "keys" may only contain a combination of.*' + # forbidden type, e.g. set + with pytest.raises(TypeError, match=msg): + df.set_index(set(df['A']), drop=drop, append=append) - # forbidden type in list, e.g. set/tuple/iter - with pytest.raises(ValueError, match=msg): - df.set_index(['A', df['A'], box(df['A'])], + # forbidden type in list, e.g. set + with pytest.raises(TypeError, match=msg): + df.set_index(['A', df['A'], set(df['A'])], drop=drop, append=append) def test_construction_with_categorical_index(self): From 4a211e9c24b3efad942610a6097745fd331953dd Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 1 Feb 2019 21:42:39 +0100 Subject: [PATCH 17/30] Revert "API: better error-handling for df.set_index (#22486)" This reverts commit 145c2275e3560edc30ff418a57df67ba3c4c30d6. 
--- doc/source/whatsnew/v0.24.0.txt | 1141 +++++++++++++++++++++++++ pandas/core/frame.py | 56 +- pandas/tests/frame/conftest.py | 7 +- pandas/tests/frame/test_alter_axes.py | 68 +- 4 files changed, 1187 insertions(+), 85 deletions(-) create mode 100644 doc/source/whatsnew/v0.24.0.txt diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt new file mode 100644 index 0000000000000..c1a2aae7767c3 --- /dev/null +++ b/doc/source/whatsnew/v0.24.0.txt @@ -0,0 +1,1141 @@ +.. _whatsnew_0240: + +v0.24.0 (Month XX, 2018) +------------------------ + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. _whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) + + +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) + +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing +the user to override the engine's default behavior to include or omit the +dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) + + +.. _whatsnew_0240.enhancements.extension_array_operators: + +``ExtensionArray`` operator support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison +operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: + +1. Define each of the operators on your ``ExtensionArray`` subclass. +2. 
Use an operator implementation from pandas that depends on operators that are already defined + on the underlying elements (scalars) of the ``ExtensionArray``. + +See the :ref:`ExtensionArray Operator Support +` documentation section for details on both +ways of adding operator support. + +.. _whatsnew_0240.enhancements.intna: + +Optional Integer NA Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. +Here is an example of the usage. + +We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value +marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) + +.. ipython:: python + + s = pd.Series([1, 2, np.nan], dtype='Int64') + s + + +Operations on these dtypes will propagate ``NaN`` as other pandas operations. + +.. ipython:: python + + # arithmetic + s + 1 + + # comparison + s == 1 + + # indexing + s.iloc[1:3] + + # operate with other dtypes + s + s.iloc[1:3].astype('Int8') + + # coerce when needed + s + 0.01 + +These dtypes can operate as part of a ``DataFrame``. + +.. ipython:: python + + df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) + df + df.dtypes + + +These dtypes can be merged & reshaped & cast. + +.. ipython:: python + + pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes + df['A'].astype(float) + +Reduction and groupby operations such as 'sum' work. + +.. ipython:: python + + df.sum() + df.groupby('B').A.sum() + +.. warning:: + + The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. + +..
_whatsnew_0240.enhancements.read_html: + +``read_html`` Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as sequences of cells with the same +value. (:issue:`17054`) + +.. ipython:: python + + result = pd.read_html(""" + + + + + + + + + + + +
ABC
12
""") + +Previous Behavior: + +.. code-block:: ipython + + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. ipython:: python + + result + + +.. _whatsnew_0240.enhancements.interval: + +Storing Interval Data in Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an +:class:`IntervalIndex` like previously (:issue:`19453`). + +.. ipython:: python + + ser = pd.Series(pd.interval_range(0, 5)) + ser + ser.dtype + +Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, +this should result in better performance when storing an array of intervals in +a :class:`Series`. + +Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy +array, but rather an ``ExtensionArray``: + +.. ipython:: python + + ser.values + +This is the same behavior as ``Series.values`` for categorical data. See +:ref:`whatsnew_0240.api_breaking.interval_values` for more. + + +.. _whatsnew_0240.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ +- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) +- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) +- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) +- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) +- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) +- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) +- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to + reflect changes from the `Pandas-GBQ library version 0.6.0 + `__. + (:issue:`21627`, :issue:`22557`) +- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) +- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) +- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) +- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). + The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). +- :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`) +- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) +- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information.
(:issue:`21358`) +- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) +- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). +- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). +- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, + all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) +- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). +- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). +- Compatibility with Matplotlib 3.0 (:issue:`22790`). + +.. _whatsnew_0240.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) + +.. _whatsnew_0240.api_breaking.deps: + +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`21242`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| numpy | 1.12.0 | X | ++-----------------+-----------------+----------+ +| bottleneck | 1.2.0 | | ++-----------------+-----------------+----------+ +| matplotlib | 2.0.0 | | ++-----------------+-----------------+----------+ +| numexpr | 2.6.1 | | ++-----------------+-----------------+----------+ +| pytables | 3.4.2 | | ++-----------------+-----------------+----------+ +| scipy | 0.18.1 | | ++-----------------+-----------------+----------+ + +..
_whatsnew_0240.api_breaking.csv_line_terminator: + +`os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'`` + for the default line terminator (:issue:`20353`). +This change only affects when running on Windows, where ``'\r\n'`` was used for line terminator +even when ``'\n'`` was passed in ``line_terminator``. + +Previous Behavior on Windows: + +.. code-block:: ipython + +In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + +In [2]: # When passing file PATH to to_csv, line_terminator does not work, and csv is saved with '\r\n'. + ...: # Also, this converts all '\n's in the data to '\r\n'. + ...: data.to_csv("test.csv", index=False, line_terminator='\n') + +In [3]: with open("test.csv", mode='rb') as f: + ...: print(f.read()) +b'string_with_lf,string_with_crlf\r\n"a\r\nbc","a\r\r\nbc"\r\n' + +In [4]: # When passing file OBJECT with newline option to to_csv, line_terminator works. + ...: with open("test2.csv", mode='w', newline='\n') as f: + ...: data.to_csv(f, index=False, line_terminator='\n') + +In [5]: with open("test2.csv", mode='rb') as f: + ...: print(f.read()) +b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n' + + +New Behavior on Windows: + +- By passing ``line_terminator`` explicitly, line terminator is set to that character. +- The value of ``line_terminator`` only affects the line terminator of CSV, + so it does not change the value inside the data. + +.. 
code-block:: ipython + +In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + +In [2]: data.to_csv("test.csv", index=False, line_terminator='\n') + +In [3]: with open("test.csv", mode='rb') as f: + ...: print(f.read()) +b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n' + + +- On Windows, the value of ``os.linesep`` is ``'\r\n'``, + so if ``line_terminator`` is not set, ``'\r\n'`` is used for line terminator. +- Again, it does not affect the value inside the data. + +.. code-block:: ipython + +In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + +In [2]: data.to_csv("test.csv", index=False) + +In [3]: with open("test.csv", mode='rb') as f: + ...: print(f.read()) +b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n' + + +- For files objects, specifying ``newline`` is not sufficient to set the line terminator. + You must pass in the ``line_terminator`` explicitly, even in this case. + +.. code-block:: ipython + +In [1]: data = pd.DataFrame({ + ...: "string_with_lf": ["a\nbc"], + ...: "string_with_crlf": ["a\r\nbc"] + ...: }) + +In [2]: with open("test2.csv", mode='w', newline='\n') as f: + ...: data.to_csv(f, index=False) + +In [3]: with open("test2.csv", mode='rb') as f: + ...: print(f.read()) +b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n' + +.. _whatsnew_0240.api_breaking.interval_values: + +``IntervalIndex.values`` is now an ``IntervalArray`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). + +Previous Behavior: + +.. 
code-block:: ipython + + In [1]: idx = pd.interval_range(0, 4) + + In [2]: idx.values + Out[2]: + array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), + Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], + dtype=object) + +New Behavior: + +.. ipython:: python + + idx = pd.interval_range(0, 4) + idx.values + +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. + +For situations where you need an ``ndarray`` of ``Interval`` objects, use +:meth:`numpy.asarray` or ``idx.astype(object)``. + +.. ipython:: python + + np.asarray(idx) + idx.values.astype(object) + +.. _whatsnew_0240.api.timezone_offset_parsing: + +Parsing Datetime Strings with Timezone Offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` +or :class:`DatetimeIndex` would automatically convert the datetime to UTC +without timezone localization. This is inconsistent from parsing the same +datetime string with :class:`Timestamp` which would preserve the UTC +offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC +offset in the ``tz`` attribute when all the datetime strings have the same +UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) + +*Previous Behavior*: + +.. code-block:: ipython + + In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") + Out[2]: Timestamp('2015-11-18 10:00:00') + + In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") + Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') + + # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) + In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) + +*Current Behavior*: + +.. 
ipython:: python + + pd.to_datetime("2015-11-18 15:30:00+05:30") + pd.Timestamp("2015-11-18 15:30:00+05:30") + +Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) + +Parsing datetime strings with different UTC offsets will now create an Index of +``datetime.datetime`` objects with different UTC offsets + +.. ipython:: python + + idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + idx + idx[0] + idx[1] + +Passing ``utc=True`` will mimic the previous behavior but will correctly indicate +that the dates have been converted to UTC + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) + +.. _whatsnew_0240.api_breaking.calendarday: + +CalendarDay Offset +^^^^^^^^^^^^^^^^^^ + +:class:`Day` and associated frequency alias ``'D'`` were documented to represent +a calendar day; however, arithmetic and operations with :class:`Day` sometimes +respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + + # Respects calendar arithmetic + In [3]: pd.date_range(start=ts, freq='D', periods=3) + Out[3]: + DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', + '2016-11-01 00:00:00+02:00'], + dtype='datetime64[ns, Europe/Helsinki]', freq='D') + + # Respects absolute arithmetic + In [4]: ts + pd.tseries.frequencies.to_offset('D') + Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') + +:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available +and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` +will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) +See the :ref:`documentation here ` for more information. + +Addition with :class:`CalendarDay` across a daylight savings time transition: + +.. ipython:: python + + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts + pd.offsets.Day(1) + ts + pd.offsets.CalendarDay(1) + +.. _whatsnew_0240.api_breaking.period_end_time: + +Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The time values in :class:`Period` and :class:`PeriodIndex` objects are now set +to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, +:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, +or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) + +Previous Behavior: + +.. 
code-block:: ipython + + In [2]: p = pd.Period('2017-01-01', 'D') + In [3]: pi = pd.PeriodIndex([p]) + + In [4]: pd.Series(pi).dt.end_time[0] + Out[4]: Timestamp(2017-01-01 00:00:00) + + In [5]: p.end_time + Out[5]: Timestamp(2017-01-01 23:59:59.999999999) + +Current Behavior: + +Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as +is the case with :attr:`Period.end_time`, for example + +.. ipython:: python + + p = pd.Period('2017-01-01', 'D') + pi = pd.PeriodIndex([p]) + + pd.Series(pi).dt.end_time[0] + + p.end_time + +.. _whatsnew_0240.api_breaking.sparse_values: + +Sparse Data Structure Refactor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, +is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). +To conform to this interface and for consistency with the rest of pandas, some API breaking +changes were made: + +- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. +- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. +- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) +- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): + + * The default value of ``allow_fill`` has changed from ``False`` to ``True``. + * The ``out`` and ``mode`` parameters are now longer accepted (previously, this raised if they were specified). + * Passing a scalar for ``indices`` is no longer allowed. + +- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. 
+- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. +- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. + + +Some new warnings are issued for operations that require or are likely to materialize a large dense array: + +- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. +- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. + +In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. + +.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: + +Raise ValueError in ``DataFrame.to_dict(orient='index')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with +``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) + +.. ipython:: python + :okexcept: + + df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) + df + + df.to_dict(orient='index') + +.. _whatsnew_0240.api.datetimelike.normalize: + +Tick DateOffset Normalize Restrictions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, +:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with +``normalize=True`` is no longer supported. This prevents unexpected behavior +where addition could fail to be monotone or associative. (:issue:`21427`) + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') + + In [3]: ts + Out[3]: Timestamp('2018-06-11 18:01:14') + + In [4]: tic = pd.offsets.Hour(n=2, normalize=True) + ...: + + In [5]: tic + Out[5]: <2 * Hours> + + In [6]: ts + tic + Out[6]: Timestamp('2018-06-11 00:00:00') + + In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) + Out[7]: False + +*Current Behavior*: + +.. ipython:: python + + ts = pd.Timestamp('2018-06-11 18:01:14') + tic = pd.offsets.Hour(n=2) + ts + tic + tic + tic == ts + (tic + tic + tic) + + +.. _whatsnew_0240.api.datetimelike: + + +.. _whatsnew_0240.api.period_subtraction: + +Period Subtraction +^^^^^^^^^^^^^^^^^^ + +Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset`` +instead of an integer (:issue:`21314`) + +.. ipython:: python + + june = pd.Period('June 2018') + april = pd.Period('April 2018') + june - april + +Previous Behavior: + +.. code-block:: ipython + + In [2]: june = pd.Period('June 2018') + + In [3]: april = pd.Period('April 2018') + + In [4]: june - april + Out [4]: 2 + +Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return +an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` + +.. ipython:: python + + pi = pd.period_range('June 2018', freq='M', periods=3) + pi - pi[0] + +Previous Behavior: + +.. code-block:: ipython + + In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) + + In [3]: pi - pi[0] + Out[3]: Int64Index([0, 1, 2], dtype='int64') + + +.. _whatsnew_0240.api.timedelta64_subtract_nan: + +Addition/Subtraction of ``NaN`` from :class:`DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Adding or subtracting ``NaN`` from a :class:`DataFrame` column with +``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning +all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and +``Series`` behavior (:issue:`22163`) + +..
ipython:: python + :okexcept: + + df = pd.DataFrame([pd.Timedelta(days=1)]) + df - np.nan + +Previous Behavior: + +.. code-block:: ipython + + In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) + + In [5]: df - np.nan + Out[5]: + 0 + 0 NaT + + +.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: + +DataFrame Arithmetic Operations Broadcasting Changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:class:`DataFrame` arithmetic operations when operating with 2-dimensional +``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s +broadcast. (:issue:`23000`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: arr = np.arange(6).reshape(3, 2) + In [4]: df = pd.DataFrame(arr) + In [5]: df + arr[[0], :] # 1 row, 2 columns + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) + In [6]: df + arr[:, [1]] # 1 column, 3 rows + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) + +*Current Behavior*: + +.. ipython:: python + arr = np.arange(6).reshape(3, 2) + df = pd.DataFrame(arr) + df + +.. ipython:: python + df + arr[[0], :] # 1 row, 2 columns + df + arr[:, [1]] # 1 column, 3 rows + + +.. _whatsnew_0240.api.extension: + +ExtensionType Changes +^^^^^^^^^^^^^^^^^^^^^ + +**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** + +Pandas now requires that extension dtypes be hashable. The base class implements +a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should +update the ``ExtensionDtype._metadata`` tuple to match the signature of your +``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). + +**Other changes** + +- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) +- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore + the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) +- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). +- The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) +- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) +- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) +- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) +- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) +- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). +- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) +- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`) +- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) +- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) +- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). 
+- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) + +.. _whatsnew_0240.api.incompatibilities: + +Series and Index Data-Dtype Incompatibilities +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Series`` and ``Index`` constructors now raise when the +data is incompatible with a passed ``dtype=`` (:issue:`15832`) + +Previous Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + 0 18446744073709551615 + dtype: uint64 + +Current Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + ... + OverflowError: Trying to coerce negative values to unsigned integers + +.. _whatsnew_0240.api.crosstab_dtypes: + +Crosstab Preserves Dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`crosstab` will now preserve dtypes in some cases that previously would +cast from integer dtype to floating dtype (:issue:`22019`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + Out[4]: + b 3 4 + a + 1 0.5 0.0 + 2 0.5 1.0 + +Current Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + +Datetimelike API Changes +^^^^^^^^^^^^^^^^^^^^^^^^ + +- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) +- :class:`DateOffset` objects are now immutable.
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) +- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) +- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) +- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) + +.. _whatsnew_0240.api.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) +- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in + :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of + a ``KeyError`` (:issue:`21678`). +- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) +- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) +- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) +- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) +- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) +- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) + +.. _whatsnew_0240.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) +- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) +- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) +- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) +- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) +- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) +- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) +- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain + many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) +- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) +- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. 
Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) +- The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). + +.. _whatsnew_0240.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) +- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) +- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) +- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) +- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) +- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) +- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) +- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) +- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) +- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) +- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) +- :meth:`SparseArray.get_values` and :meth:`SparseArray.to_dense` have dropped the ``fill`` parameter (:issue:`14686`) +- :meth:`SparseSeries.to_dense` has dropped the ``sparse_only`` parameter (:issue:`14686`) + +.. 
_whatsnew_0240.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, + both when indexing by label (using .loc) and position (.iloc). + Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) +- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) +- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` + (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` + is likewise much faster (:issue:`21369`, :issue:`21508`) +- Improved performance of :meth:`HDFStore.groups` (and dependent functions like + :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) + (:issue:`21372`) +- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) +- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) +- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) + + +.. _whatsnew_0240.docs: + +Documentation Changes +~~~~~~~~~~~~~~~~~~~~~ + +- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) +- +- + +.. _whatsnew_0240.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. 
+- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). +- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) +- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). + +Datetimelike +^^^^^^^^^^^^ + +- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) +- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) +- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) +- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype 
instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) +- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) +- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) +- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) +- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) +- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) +- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) +- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) +- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) +- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) + +Timedelta +^^^^^^^^^ +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) +- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype 
incorrectly raising (:issue:`22390`) +- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) +- Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) +- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) +- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- + +Timezones +^^^^^^^^^ + +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) +- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) +- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) +- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) +- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) +- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did 
not localize integer data correctly (:issue:`20964`) +- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) +- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) +- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware ``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) +- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) +- Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) +- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp`s constructed with the ``replace`` method across DST (:issue:`18785`) +- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) +- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) +- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) + +Offsets +^^^^^^^ + +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) +- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. 
Passing these will now raise ``ValueError`` (:issue:`19398`) +- + +Numeric +^^^^^^^ + +- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) +- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) +- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) +- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, + when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), + a ``TypeError`` was wrongly raised. For all three methods such calculations are now done correctly. (:issue:`16679`). +- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) +- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) +- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) +- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`) +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`) +- Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`). 
+ +Strings +^^^^^^^ + +- +- +- + +Interval +^^^^^^^^ + +- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) +- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) +- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) + +Indexing +^^^^^^^^ + +- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) +- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :class:`DataFrame` when setting values with ``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) +- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) +- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) +- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) +- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) +- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) +- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. 
(:issue:`19087`) +- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) +- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) +- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) + +Missing +^^^^^^^ + +- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) +- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) +- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) + + +MultiIndex +^^^^^^^^^^ + +- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) +- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) +- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) + +I/O +^^^ + +.. 
_whatsnew_0240.bug_fixes.nan_with_str_dtype: + +Proper handling of `np.NaN` in a string data-typed column with the Python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There was a bug in :func:`read_excel` and :func:`read_csv` with the Python +engine, where missing values turned to ``'nan'`` with ``dtype=str`` and +``na_filter=True``. Now, these missing values are converted to the string +missing indicator, ``np.nan``. (:issue:`20377`) + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +Previous Behavior: + +.. code-block:: ipython + + In [5]: data = 'a,b,c\n1,,3\n4,5,6' + In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) + In [7]: df.loc[0, 'b'] + Out[7]: + 'nan' + +Current Behavior: + +.. ipython:: python + + data = 'a,b,c\n1,,3\n4,5,6' + df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) + df.loc[0, 'b'] + +Notice how we now instead output ``np.nan`` itself instead of a stringified form of it. + +- :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) +- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) +- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) +- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. 
(:issue:`21616`) +- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) +- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) +- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) + +Plotting +^^^^^^^^ + +- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) +- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) + +Groupby/Resample/Rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) +- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a + ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). +- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a + datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) +- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). +- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). 
+- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). +- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`) +- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) + +Reshaping +^^^^^^^^^ + +- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) +- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) +- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) +- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) +- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) +- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) +- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) +- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) +- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) +- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that 
stubname (:issue:`22468`) +- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) +- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) +- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`) +- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) + +.. _whatsnew_0240.bug_fixes.sparse: + +Sparse +^^^^^^ + +- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) +- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) +- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. +- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. +- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. +- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) +- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) +- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`) + +Build Changes +^^^^^^^^^^^^^ + +- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) +- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here `_, and a pandas-specific introduction :ref:`in the contributing guide `. 
(:issue:`22280`) +- + +Other +^^^^^ + +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`) +- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) +- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) +- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. 
+- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) +- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c77782f926a13..3e0d93d9dd091 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -71,7 +71,7 @@ is_iterator, is_sequence, is_named_tuple) -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex +from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms @@ -4126,25 +4126,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not isinstance(keys, list): keys = [keys] - missing = [] - for col in keys: - if (is_scalar(col) or isinstance(col, tuple)) and col in self: - # tuples can be both column keys or list-likes - # if they are valid column keys, everything is fine - continue - elif is_scalar(col) and col not in self: - # tuples that are not column keys are considered list-like, - # not considered missing - missing.append(col) - elif (not is_list_like(col, allow_sets=False) - or getattr(col, 'ndim', 1) > 1): - raise TypeError('The parameter "keys" may only contain a ' - 'combination of valid column keys and ' - 'one-dimensional list-likes') - - if missing: - raise KeyError('{}'.format(missing)) - if inplace: frame = self else: @@ -4154,7 +4135,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, names = [] if append: names = [x for x in self.index.names] - if isinstance(self.index, ABCMultiIndex): + if isinstance(self.index, MultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: @@ -4162,29 +4143,29 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove = [] for col in keys: - if isinstance(col, ABCMultiIndex): - for n in 
range(col.nlevels): + if isinstance(col, MultiIndex): + # append all but the last column so we don't have to modify + # the end of this loop + for n in range(col.nlevels - 1): arrays.append(col._get_level_values(n)) + + level = col._get_level_values(col.nlevels - 1) names.extend(col.names) - elif isinstance(col, (ABCIndexClass, ABCSeries)): - # if Index then not MultiIndex (treated above) - arrays.append(col) + elif isinstance(col, Series): + level = col._values names.append(col.name) - elif isinstance(col, (list, np.ndarray)): - arrays.append(col) - names.append(None) - elif (is_list_like(col) - and not (isinstance(col, tuple) and col in self)): - # all other list-likes (but avoid valid column keys) - col = list(col) # ensure iterator do not get read twice etc. - arrays.append(col) + elif isinstance(col, Index): + level = col + names.append(col.name) + elif isinstance(col, (list, np.ndarray, Index)): + level = col names.append(None) - # from here, col can only be a column label else: - arrays.append(frame[col]._values) + level = frame[col]._values names.append(col) if drop: to_remove.append(col) + arrays.append(level) index = ensure_index_from_sequences(arrays, names) @@ -4193,8 +4174,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, raise ValueError('Index has duplicate keys: {dup}'.format( dup=duplicates)) - # use set to handle duplicate column names gracefully in case of drop - for c in set(to_remove): + for c in to_remove: del frame[c] # clear up memory usage diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 377e737a53158..1838339ae0ed8 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -209,13 +209,12 @@ def frame_of_index_cols(): """ Fixture for DataFrame of columns that can be used for indexing - Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; - 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. 
+ Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but + are jointly unique), the rest are unique. """ df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], 'B': ['one', 'two', 'three', 'one', 'two'], 'C': ['a', 'b', 'c', 'd', 'e'], 'D': np.random.randn(5), - 'E': np.random.randn(5), - ('tuple', 'as', 'label'): np.random.randn(5)}) + 'E': np.random.randn(5)}) return df diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 4faff8e75a7d2..aa2311762c571 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -47,8 +47,7 @@ def test_set_index_cast(self): tm.assert_frame_equal(df, df2) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) @pytest.mark.parametrize('inplace', [True, False]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_drop_inplace(self, frame_of_index_cols, @@ -71,8 +70,7 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_append(self, frame_of_index_cols, drop, keys): df = frame_of_index_cols @@ -88,8 +86,7 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], - ('tuple', 'as', 'label')]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys): @@ -118,7 +115,7 @@ def test_set_index_after_mutation(self): # Add list-of-list 
constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) @pytest.mark.parametrize('box', [Series, Index, np.array, - list, tuple, iter, lambda x: [list(x)], + list, lambda x: [list(x)], lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), (True, 'test'), (False, None)]) @@ -131,11 +128,11 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, key = box(df['B']) if box == list: # list of strings gets interpreted as list of keys - msg = "['one', 'two', 'three', 'one', 'two']" + msg = "'one'" with pytest.raises(KeyError, match=msg): df.set_index(key, drop=drop, append=append) else: - # np.array/tuple/iter/list-of-list "forget" the name of B + # np.array/list-of-list "forget" the name of B name_mi = getattr(key, 'names', None) name = [getattr(key, 'name', None)] if name_mi is None else name_mi @@ -150,8 +147,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) - @pytest.mark.parametrize('box', [Series, Index, np.array, - list, tuple, iter, + @pytest.mark.parametrize('box', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'A'), (True, 'B'), @@ -163,8 +159,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, df.index.name = index_name keys = ['A', box(df['B'])] - # np.array/list/tuple/iter "forget" the name of B - names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] + # np.array and list "forget" the name of B + names = ['A', None if box in [np.array, list] else 'B'] result = df.set_index(keys, drop=drop, append=append) @@ -179,12 +175,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We 
also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, - list, tuple, iter, + @pytest.mark.parametrize('box2', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, - list, tuple, iter, + @pytest.mark.parametrize('box1', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -196,22 +190,21 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, df.index.name = index_name keys = [box1(df['A']), box2(df['A'])] - result = df.set_index(keys, drop=drop, append=append) - # if either box was iter, the content has been consumed; re-read it - keys = [box1(df['A']), box2(df['A'])] + # == gives ambiguous Boolean for Series + if drop and keys[0] is 'A' and keys[1] is 'A': + with tm.assert_raises_regex(KeyError, '.*'): + df.set_index(keys, drop=drop, append=append) + else: + result = df.set_index(keys, drop=drop, append=append) - # need to adapt first drop for case that both keys are 'A' -- - # cannot drop the same column twice; - # use "is" because == would give ambiguous Boolean error for containers - first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop + # to test against already-tested behavior, we add sequentially, + # hence second append always True; must wrap in list, otherwise + # list-box will be illegal + expected = df.set_index([keys[0]], drop=drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) - # to test against already-tested behaviour, we add sequentially, - # hence second append always True; must wrap keys in list, otherwise - # box = list would be illegal - expected = df.set_index([keys[0]], drop=first_drop, append=append) - expected = expected.set_index([keys[1]], 
drop=drop, append=True) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) @@ -241,24 +234,13 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): def test_set_index_raise(self, frame_of_index_cols, drop, append): df = frame_of_index_cols - with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): - # column names are A-E, as well as one tuple + with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) # non-existent key in list with arrays - with pytest.raises(KeyError, match='X'): + with tm.assert_raises_regex(KeyError, '.*'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) - msg = 'The parameter "keys" may only contain a combination of.*' - # forbidden type, e.g. set - with pytest.raises(TypeError, match=msg): - df.set_index(set(df['A']), drop=drop, append=append) - - # forbidden type in list, e.g. set - with pytest.raises(TypeError, match=msg): - df.set_index(['A', df['A'], set(df['A'])], - drop=drop, append=append) - def test_construction_with_categorical_index(self): ci = tm.makeCategoricalIndex(10) ci.name = 'B' From 103a0929acbbf86d833ce42cee605fb5ccf4c7f7 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 1 Feb 2019 21:45:03 +0100 Subject: [PATCH 18/30] Replace deprecated assert_raises_regex --- pandas/tests/frame/test_alter_axes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index aa2311762c571..2fa505f8eb389 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -193,7 +193,7 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # == gives ambiguous Boolean for Series if drop and keys[0] is 'A' and keys[1] is 'A': - with tm.assert_raises_regex(KeyError, '.*'): + with pytest.raises(KeyError, match='.*'): df.set_index(keys, drop=drop, append=append) else: result = df.set_index(keys, drop=drop, append=append) @@ -234,11 +234,11 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): def test_set_index_raise(self, frame_of_index_cols, drop, append): df = frame_of_index_cols - with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E + with pytest.raises(KeyError, match='.*'): # column names are A-E df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) # non-existent key in list with arrays - with tm.assert_raises_regex(KeyError, '.*'): + with pytest.raises(KeyError, match='.*'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) def test_construction_with_categorical_index(self): From 8086f397e3caffc417d7a06ce97f3c5e650a2126 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Fri, 1 Feb 2019 21:47:20 +0100 Subject: [PATCH 19/30] Re-migrate 0.24.0 extension (.txt -> .rst) --- doc/source/whatsnew/v0.24.0.rst | 2 - doc/source/whatsnew/v0.24.0.txt | 1141 ------------------------------- 2 files changed, 1143 deletions(-) delete mode 100644 doc/source/whatsnew/v0.24.0.txt diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0777cbaf934b3..711b35f854035 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1195,8 +1195,6 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, - has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt deleted file mode 100644 index c1a2aae7767c3..0000000000000 --- a/doc/source/whatsnew/v0.24.0.txt +++ /dev/null @@ -1,1141 +0,0 @@ -.. _whatsnew_0240: - -v0.24.0 (Month XX, 2018) ------------------------- - -.. 
warning:: - - Starting January 1, 2019, pandas feature releases will support Python 3 only. - See :ref:`install.dropping-27` for more. - -.. _whatsnew_0240.enhancements: - -New features -~~~~~~~~~~~~ -- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) - - -- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) - -- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing -the user to override the engine's default behavior to include or omit the -dataframe's indexes from the resulting Parquet file. (:issue:`20768`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - - -.. _whatsnew_0240.enhancements.extension_array_operators: - -``ExtensionArray`` operator support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison -operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: - -1. Define each of the operators on your ``ExtensionArray`` subclass. -2. Use an operator implementation from pandas that depends on operators that are already defined - on the underlying elements (scalars) of the ``ExtensionArray``. - -See the :ref:`ExtensionArray Operator Support -` documentation section for details on both -ways of adding operator support. - -.. _whatsnew_0240.enhancements.intna: - -Optional Integer NA Support -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. -Here is an example of the usage. 
- -We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value -marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) - -.. ipython:: python - - s = pd.Series([1, 2, np.nan], dtype='Int64') - s - - -Operations on these dtypes will propagate ``NaN`` as other pandas operations. - -.. ipython:: python - - # arithmetic - s + 1 - - # comparison - s == 1 - - # indexing - s.iloc[1:3] - - # operate with other dtypes - s + s.iloc[1:3].astype('Int8') - - # coerce when needed - s + 0.01 - -These dtypes can operate as part of of ``DataFrame``. - -.. ipython:: python - - df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) - df - df.dtypes - - -These dtypes can be merged & reshaped & casted. - -.. ipython:: python - - pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes - df['A'].astype(float) - -Reduction and groupby operations such as 'sum' work. - -.. ipython:: python - - df.sum() - df.groupby('B').A.sum() - -.. warning:: - - The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. - -.. _whatsnew_0240.enhancements.read_html: - -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. -Now it understands them, treating them as sequences of cells with the same -value. (:issue:`17054`) - -.. ipython:: python - - result = pd.read_html(""" - - - - - - - - - - - -
ABC
12
""") - -Previous Behavior: - -.. code-block:: ipython - - In [13]: result - Out [13]: - [ A B C - 0 1 2 NaN] - -Current Behavior: - -.. ipython:: python - - result - - -.. _whatsnew_0240.enhancements.interval: - -Storing Interval Data in Series and DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an -:class:`IntervalIndex` like previously (:issue:`19453`). - -.. ipython:: python - - ser = pd.Series(pd.interval_range(0, 5)) - ser - ser.dtype - -Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, -this should result in better performance when storing an array of intervals in -a :class:`Series`. - -Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy -array, but rather an ``ExtensionArray``: - -.. ipython:: python - - ser.values - -This is the same behavior as ``Series.values`` for categorical data. See -:ref:`whatsnew_0240.api_breaking.interval_values` for more. - - -.. _whatsnew_0240.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ -- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) -- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) -- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) -- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) -- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) -- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.6.0 - `__. - (:issue:`21627`, :issue:`22557`) -- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) -- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) -- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) -- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) -- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). - The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). -- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) -- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) -- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) -- :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) -- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). -- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). -- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, - all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) -- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). -- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). -- Compatibility with Matplotlib 3.0 (:issue:`22790`). - -.. _whatsnew_0240.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - -.. _whatsnew_0240.api_breaking.deps: - -Dependencies have increased minimum versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We have updated our minimum supported versions of dependencies (:issue:`21242`). -If installed, we now require: - -+-----------------+-----------------+----------+ -| Package | Minimum Version | Required | -+=================+=================+==========+ -| numpy | 1.12.0 | X | -+-----------------+-----------------+----------+ -| bottleneck | 1.2.0 | | -+-----------------+-----------------+----------+ -| matplotlib | 2.0.0 | | -+-----------------+-----------------+----------+ -| numexpr | 2.6.1 | | -+-----------------+-----------------+----------+ -| pytables | 3.4.2 | | -+-----------------+-----------------+----------+ -| scipy | 0.18.1 | | -+-----------------+-----------------+----------+ - -.. 
_whatsnew_0240.api_breaking.csv_line_terminator: - -`os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'`` - for the default line terminator (:issue:`20353`). -This change only affects when running on Windows, where ``'\r\n'`` was used for line terminator -even when ``'\n'`` was passed in ``line_terminator``. - -Previous Behavior on Windows: - -.. code-block:: ipython - -In [1]: data = pd.DataFrame({ - ...: "string_with_lf": ["a\nbc"], - ...: "string_with_crlf": ["a\r\nbc"] - ...: }) - -In [2]: # When passing file PATH to to_csv, line_terminator does not work, and csv is saved with '\r\n'. - ...: # Also, this converts all '\n's in the data to '\r\n'. - ...: data.to_csv("test.csv", index=False, line_terminator='\n') - -In [3]: with open("test.csv", mode='rb') as f: - ...: print(f.read()) -b'string_with_lf,string_with_crlf\r\n"a\r\nbc","a\r\r\nbc"\r\n' - -In [4]: # When passing file OBJECT with newline option to to_csv, line_terminator works. - ...: with open("test2.csv", mode='w', newline='\n') as f: - ...: data.to_csv(f, index=False, line_terminator='\n') - -In [5]: with open("test2.csv", mode='rb') as f: - ...: print(f.read()) -b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n' - - -New Behavior on Windows: - -- By passing ``line_terminator`` explicitly, line terminator is set to that character. -- The value of ``line_terminator`` only affects the line terminator of CSV, - so it does not change the value inside the data. - -.. 
code-block:: ipython - -In [1]: data = pd.DataFrame({ - ...: "string_with_lf": ["a\nbc"], - ...: "string_with_crlf": ["a\r\nbc"] - ...: }) - -In [2]: data.to_csv("test.csv", index=False, line_terminator='\n') - -In [3]: with open("test.csv", mode='rb') as f: - ...: print(f.read()) -b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n' - - -- On Windows, the value of ``os.linesep`` is ``'\r\n'``, - so if ``line_terminator`` is not set, ``'\r\n'`` is used for line terminator. -- Again, it does not affect the value inside the data. - -.. code-block:: ipython - -In [1]: data = pd.DataFrame({ - ...: "string_with_lf": ["a\nbc"], - ...: "string_with_crlf": ["a\r\nbc"] - ...: }) - -In [2]: data.to_csv("test.csv", index=False) - -In [3]: with open("test.csv", mode='rb') as f: - ...: print(f.read()) -b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n' - - -- For files objects, specifying ``newline`` is not sufficient to set the line terminator. - You must pass in the ``line_terminator`` explicitly, even in this case. - -.. code-block:: ipython - -In [1]: data = pd.DataFrame({ - ...: "string_with_lf": ["a\nbc"], - ...: "string_with_crlf": ["a\r\nbc"] - ...: }) - -In [2]: with open("test2.csv", mode='w', newline='\n') as f: - ...: data.to_csv(f, index=False) - -In [3]: with open("test2.csv", mode='rb') as f: - ...: print(f.read()) -b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n' - -.. _whatsnew_0240.api_breaking.interval_values: - -``IntervalIndex.values`` is now an ``IntervalArray`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an -``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). - -Previous Behavior: - -.. 
code-block:: ipython - - In [1]: idx = pd.interval_range(0, 4) - - In [2]: idx.values - Out[2]: - array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), - Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], - dtype=object) - -New Behavior: - -.. ipython:: python - - idx = pd.interval_range(0, 4) - idx.values - -This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. - -For situations where you need an ``ndarray`` of ``Interval`` objects, use -:meth:`numpy.asarray` or ``idx.astype(object)``. - -.. ipython:: python - - np.asarray(idx) - idx.values.astype(object) - -.. _whatsnew_0240.api.timezone_offset_parsing: - -Parsing Datetime Strings with Timezone Offsets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` -or :class:`DatetimeIndex` would automatically convert the datetime to UTC -without timezone localization. This is inconsistent from parsing the same -datetime string with :class:`Timestamp` which would preserve the UTC -offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC -offset in the ``tz`` attribute when all the datetime strings have the same -UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) - -*Previous Behavior*: - -.. code-block:: ipython - - In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") - Out[2]: Timestamp('2015-11-18 10:00:00') - - In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") - Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') - - # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) - In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) - -*Current Behavior*: - -.. 
ipython:: python - - pd.to_datetime("2015-11-18 15:30:00+05:30") - pd.Timestamp("2015-11-18 15:30:00+05:30") - -Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) - -Parsing datetime strings with different UTC offsets will now create an Index of -``datetime.datetime`` objects with different UTC offsets - -.. ipython:: python - - idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - idx - idx[0] - idx[1] - -Passing ``utc=True`` will mimic the previous behavior but will correctly indicate -that the dates have been converted to UTC - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) - -.. _whatsnew_0240.api_breaking.calendarday: - -CalendarDay Offset -^^^^^^^^^^^^^^^^^^ - -:class:`Day` and associated frequency alias ``'D'`` were documented to represent -a calendar day; however, arithmetic and operations with :class:`Day` sometimes -respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). - -*Previous Behavior*: - -.. 
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - - # Respects calendar arithmetic - In [3]: pd.date_range(start=ts, freq='D', periods=3) - Out[3]: - DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', - '2016-11-01 00:00:00+02:00'], - dtype='datetime64[ns, Europe/Helsinki]', freq='D') - - # Respects absolute arithmetic - In [4]: ts + pd.tseries.frequencies.to_offset('D') - Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') - -:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available -and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` -will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) -See the :ref:`documentation here ` for more information. - -Addition with :class:`CalendarDay` across a daylight savings time transition: - -.. ipython:: python - - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - ts + pd.offsets.Day(1) - ts + pd.offsets.CalendarDay(1) - -.. _whatsnew_0240.api_breaking.period_end_time: - -Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The time values in :class:`Period` and :class:`PeriodIndex` objects are now set -to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, -:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, -or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) - -Previous Behavior: - -.. 
code-block:: ipython - - In [2]: p = pd.Period('2017-01-01', 'D') - In [3]: pi = pd.PeriodIndex([p]) - - In [4]: pd.Series(pi).dt.end_time[0] - Out[4]: Timestamp(2017-01-01 00:00:00) - - In [5]: p.end_time - Out[5]: Timestamp(2017-01-01 23:59:59.999999999) - -Current Behavior: - -Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as -is the case with :attr:`Period.end_time`, for example - -.. ipython:: python - - p = pd.Period('2017-01-01', 'D') - pi = pd.PeriodIndex([p]) - - pd.Series(pi).dt.end_time[0] - - p.end_time - -.. _whatsnew_0240.api_breaking.sparse_values: - -Sparse Data Structure Refactor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, -is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). -To conform to this interface and for consistency with the rest of pandas, some API breaking -changes were made: - -- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. -- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. -- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) -- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): - - * The default value of ``allow_fill`` has changed from ``False`` to ``True``. - * The ``out`` and ``mode`` parameters are now longer accepted (previously, this raised if they were specified). - * Passing a scalar for ``indices`` is no longer allowed. - -- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. 
-- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. -- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. - - -Some new warnings are issued for operations that require or are likely to materialize a large dense array: - -- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. -- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. - -In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. - -.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: - -Raise ValueError in ``DataFrame.to_dict(orient='index')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with -``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) - -.. ipython:: python - :okexcept: - - df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) - df - - df.to_dict(orient='index') - -.. _whatsnew_0240.api.datetimelike.normalize: - -Tick DateOffset Normalize Restrictions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, -:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with -``normalize=True`` is no longer supported. This prevents unexpected behavior -where addition could fail to be monotone or associative. (:issue:`21427`) - -*Previous Behavior*: - -.. 
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') - - In [3]: ts - Out[3]: Timestamp('2018-06-11 18:01:14') - - In [4]: tic = pd.offsets.Hour(n=2, normalize=True) - ...: - - In [5]: tic - Out[5]: <2 * Hours> - - In [6]: ts + tic - Out[6]: Timestamp('2018-06-11 00:00:00') - - In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) - Out[7]: False - -*Current Behavior*: - -.. ipython:: python - - ts = pd.Timestamp('2018-06-11 18:01:14') - tic = pd.offsets.Hour(n=2) - ts + tic + tic + tic == ts + (tic + tic + tic) - - -.. _whatsnew_0240.api.datetimelike: - - -.. _whatsnew_0240.api.period_subtraction: - -Period Subtraction -^^^^^^^^^^^^^^^^^^ - -Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset``. -instead of an integer (:issue:`21314`) - -.. ipython:: python - - june = pd.Period('June 2018') - april = pd.Period('April 2018') - june - april - -Previous Behavior: - -.. code-block:: ipython - - In [2]: june = pd.Period('June 2018') - - In [3]: april = pd.Period('April 2018') - - In [4]: june - april - Out [4]: 2 - -Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return -an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` - -.. ipython:: python - - pi = pd.period_range('June 2018', freq='M', periods=3) - pi - pi[0] - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) - - In [3]: pi - pi[0] - Out[3]: Int64Index([0, 1, 2], dtype='int64') - - -.. _whatsnew_0240.api.timedelta64_subtract_nan: - -Addition/Subtraction of ``NaN`` from :class:`DataFrame` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Adding or subtracting ``NaN`` from a :class:`DataFrame` column with -``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning -all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and -``Series`` behavior (:issue:`22163`) - -.. 
ipython:: python - :okexcept: - - df = pd.DataFrame([pd.Timedelta(days=1)]) - df - np.nan - -Previous Behavior: - -.. code-block:: ipython - - In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) - - In [5]: df - np.nan - Out[5]: - 0 - 0 NaT - - -.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: - -DataFrame Arithmetic Operations Broadcasting Changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`DataFrame` arithmetic operations when operating with 2-dimensional -``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s -broadcast. (:issue:`23000`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: arr = np.arange(6).reshape(3, 2) - In [4]: df = pd.DataFrame(arr) - In [5]: df + arr[[0], :] # 1 row, 2 columns - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) - In [6]: df + arr[:, [1]] # 1 column, 3 rows - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) - -*Current Behavior*: - -.. ipython:: python - arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr) - df - -.. ipython:: python - df + arr[[0], :] # 1 row, 2 columns - df + arr[:, [1]] # 1 column, 3 rows - - -.. _whatsnew_0240.api.extension: - -ExtensionType Changes -^^^^^^^^^^^^^^^^^^^^^ - -**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** - -Pandas now requires that extension dtypes be hashable. The base class implements -a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should -update the ``ExtensionDtype._metadata`` tuple to match the signature of your -``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). - -**Other changes** - -- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) -- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore - the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) -- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) -- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). -- The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) -- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) -- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) -- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). -- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) -- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`) -- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) -- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) -- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). 
-- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) - -.. _whatsnew_0240.api.incompatibilities: - -Series and Index Data-Dtype Incompatibilities -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series`` and ``Index`` constructors now raise when the -data is incompatible with a passed ``dtype=`` (:issue:`15832`) - -Previous Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - 0 18446744073709551615 - dtype: uint64 - -Current Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - ... - OverflowError: Trying to coerce negative values to unsigned integers - -.. _whatsnew_0240.api.crosstab_dtypes - -Crosstab Preserves Dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`crosstab` will preserve now dtypes in some cases that previously would -cast from integer dtype to floating dtype (:issue:`22019`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - Out[4]: - b 3 4 - a - 1 0.5 0.0 - 2 0.5 1.0 - -Current Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - -Datetimelike API Changes -^^^^^^^^^^^^^^^^^^^^^^^^ - -- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) -- :class:`DateOffset` objects are now immutable. 
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) -- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) -- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) -- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) - -.. _whatsnew_0240.api.other: - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) -- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in - :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of - a ``KeyError`` (:issue:`21678`). -- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) -- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) -- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) -- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) -- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) -- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - -.. _whatsnew_0240.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) -- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) -- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) -- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) -- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) -- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) -- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) -- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain - many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) -- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) -- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. 
Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) -- The ``fastpath`` keyword of the different Index constructors is deprecated (:issue:`23110`). - -.. _whatsnew_0240.prior_deprecations: - -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) -- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) -- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) -- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) -- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) -- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) -- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) -- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) -- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) -- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) -- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) -- :meth:`SparseArray.get_values` and :meth:`SparseArray.to_dense` have dropped the ``fill`` parameter (:issue:`14686`) -- :meth:`SparseSeries.to_dense` has dropped the ``sparse_only`` parameter (:issue:`14686`) - -.. 
_whatsnew_0240.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`) -- Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) -- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) -- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` - (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` - is likewise much faster (:issue:`21369`, :issue:`21508`) -- Improved performance of :meth:`HDFStore.groups` (and dependent functions like - :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) - (:issue:`21372`) -- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) -- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) -- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) - - -.. _whatsnew_0240.docs: - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) -- -- - -.. _whatsnew_0240.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. 
-- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`). -- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) -- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). - -Datetimelike -^^^^^^^^^^^^ - -- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) -- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) -- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) -- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype 
instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) -- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) -- Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) -- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) -- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) -- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) -- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) -- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) -- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) -- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) - -Timedelta -^^^^^^^^^ -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) -- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype 
incorrectly raising (:issue:`22390`) -- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) -- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) -- Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) -- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) -- - -Timezones -^^^^^^^^^ - -- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) -- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) -- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) -- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) -- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) -- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) -- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) -- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) -- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did 
not localize integer data correctly (:issue:`20964`) -- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) -- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) -- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware ``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) -- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) -- Bug in :func:`Dataframe.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) -- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp`s constructed with the ``replace`` method across DST (:issue:`18785`) -- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) -- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) -- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) - -Offsets -^^^^^^^ - -- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) -- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. 
Passing these will now raise ``ValueError`` (:issue:`19398`) -- - -Numeric -^^^^^^^ - -- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) -- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) -- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) -- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, - when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), - a ``TypeError`` was wrongly raised. For all three methods such calculation are now done correctly. (:issue:`16679`). -- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) -- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) -- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) -- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`) -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the narray as ``timedelta64[ns]`` dtype (:issue:`23114`) -- Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`). 
- -Strings -^^^^^^^ - -- -- -- - -Interval -^^^^^^^^ - -- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) -- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) -- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) -- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) - -Indexing -^^^^^^^^ - -- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) -- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) -- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) -- Bug in :class:`DataFrame` when setting values with ``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) -- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) -- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) -- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) -- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) -- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) -- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. 
(:issue:`19087`) -- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) -- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) -- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) - -Missing -^^^^^^^ - -- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) -- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) -- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) -- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) - - -MultiIndex -^^^^^^^^^^ - -- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) -- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) -- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) - -I/O -^^^ - -.. 
_whatsnew_0240.bug_fixes.nan_with_str_dtype: - -Proper handling of `np.NaN` in a string data-typed column with the Python engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -There was bug in :func:`read_excel` and :func:`read_csv` with the Python -engine, where missing values turned to ``'nan'`` with ``dtype=str`` and -``na_filter=True``. Now, these missing values are converted to the string -missing indicator, ``np.nan``. (:issue `20377`) - -.. ipython:: python - :suppress: - - from pandas.compat import StringIO - -Previous Behavior: - -.. code-block:: ipython - - In [5]: data = 'a,b,c\n1,,3\n4,5,6' - In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) - In [7]: df.loc[0, 'b'] - Out[7]: - 'nan' - -Current Behavior: - -.. ipython:: python - - data = 'a,b,c\n1,,3\n4,5,6' - df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True) - df.loc[0, 'b'] - -Notice how we now instead output ``np.nan`` itself instead of a stringified form of it. - -- :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) -- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) -- :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) -- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. 
(:issue:`21616`) -- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) -- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) -- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`) -- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) - -Plotting -^^^^^^^^ - -- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) -- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) - -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) -- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) -- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a - ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). -- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a - datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`). -- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). 
-- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). -- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. (:issue:`22487`) -- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) - -Reshaping -^^^^^^^^^ - -- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) -- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) -- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) -- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) -- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) -- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) -- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) -- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) -- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that 
stubname (:issue:`22468`) -- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) -- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) -- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) -- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) - -.. _whatsnew_0240.bug_fixes.sparse: - -Sparse -^^^^^^ - -- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) -- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) -- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. -- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. -- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. -- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) -- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) -- Bug in :meth:`SparseArary.unique` not returning the unique values (:issue:`19595`) - -Build Changes -^^^^^^^^^^^^^ - -- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) -- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here `_, and a pandas-specific introduction :ref:`in the contributing guide `. 
(:issue:`22280`) -- - -Other -^^^^^ - -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`) -- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) -- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) -- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. -- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) -- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) From 999295e4ca10c83d9a23b958c4d17076dc46a2c3 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 1 Feb 2019 21:53:57 +0100 Subject: [PATCH 20/30] Re-add docstring clarifications --- pandas/core/frame.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e0d93d9dd091..f0be419f0f32e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4052,12 +4052,16 @@ def set_index(self, keys, drop=True, append=False, inplace=False, Set the DataFrame index using existing columns. Set the DataFrame index (row labels) using one or more existing - columns. 
The index can replace the existing index or expand on it. + columns or arrays (of the correct length). The index can replace the + existing index or expand on it. Parameters ---------- - keys : label or list of label - Name or names of the columns that will be used as the index. + keys : label or array-like or list of labels/arrays + This parameter can be either a single column key, a single array of + the same length as the calling DataFrame, or a list containing an + arbitrary combination of column keys and arrays. Here, "array" + encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. drop : bool, default True Delete columns to be used as the new index. append : bool, default False @@ -4102,7 +4106,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 7 2013 84 10 2014 31 - Create a multi-index using columns 'year' and 'month': + Create a MultiIndex using columns 'year' and 'month': >>> df.set_index(['year', 'month']) sale @@ -4112,15 +4116,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 2013 7 84 2014 10 31 - Create a multi-index using a set of values and a column: + Create a MultiIndex using an Index and a column: - >>> df.set_index([[1, 2, 3, 4], 'year']) + >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) month sale year 1 2012 1 55 2 2014 4 40 3 2013 7 84 4 2014 10 31 + + Create a MultiIndex using two Series: + + >>> s = pd.Series([1, 2, 3, 4]) + >>> df.set_index([s, s**2]) + month year sale + 1 1 1 2012 55 + 2 4 4 2014 40 + 3 9 7 2013 84 + 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, 'inplace') if not isinstance(keys, list): From c24df004ef531d0b4d7784d31add048ace13eba7 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 1 Feb 2019 13:41:08 -0800 Subject: [PATCH 21/30] Backport PR #25063: API: change Index set ops sort=True -> sort=None (#25083) --- doc/source/whatsnew/v0.24.1.rst | 34 +++- pandas/_libs/lib.pyx | 3 +- 
pandas/core/indexes/api.py | 2 +- pandas/core/indexes/base.py | 82 ++++++-- pandas/core/indexes/datetimes.py | 6 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 43 +++- pandas/core/indexes/range.py | 8 +- pandas/tests/indexes/common.py | 2 +- pandas/tests/indexes/datetimes/test_setops.py | 12 +- .../tests/indexes/interval/test_interval.py | 24 +-- pandas/tests/indexes/multi/test_set_ops.py | 155 ++++++++++++-- pandas/tests/indexes/period/test_period.py | 2 +- pandas/tests/indexes/period/test_setops.py | 20 +- pandas/tests/indexes/test_base.py | 189 ++++++++++++++---- pandas/tests/indexes/test_range.py | 2 +- .../indexes/timedeltas/test_timedelta.py | 8 +- 17 files changed, 465 insertions(+), 129 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index c8b5417a5b77f..1ee4397960b0b 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -15,10 +15,40 @@ Whats New in 0.24.1 (February XX, 2019) These are the changes in pandas 0.24.1. See :ref:`release` for a full changelog including other versions of pandas. +.. _whatsnew_0241.api: + +API Changes +~~~~~~~~~~~ + +Changing the ``sort`` parameter for :class:`Index` set operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default ``sort`` value for :meth:`Index.union` has changed from ``True`` to ``None`` (:issue:`24959`). +The default *behavior*, however, remains the same: the result is sorted, unless + +1. ``self`` and ``other`` are identical +2. ``self`` or ``other`` is empty +3. ``self`` or ``other`` contain values that can not be compared (a ``RuntimeWarning`` is raised). + +This change will allow ``sort=True`` to mean "always sort" in a future release. + +The same change applies to :meth:`Index.difference` and :meth:`Index.symmetric_difference`, which +would not sort the result when the values could not be compared. 
+ +The `sort` option for :meth:`Index.intersection` has changed in three ways. + +1. The default has changed from ``True`` to ``False``, to restore the + pandas 0.23.4 and earlier behavior of not sorting by default. +2. The behavior of ``sort=True`` can now be obtained with ``sort=None``. + This will sort the result only if the values in ``self`` and ``other`` + are not identical. +3. The value ``sort=True`` is no longer allowed. A future version of pandas + will properly support ``sort=True`` meaning "always sort". + .. _whatsnew_0241.regressions: Fixed Regressions -^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~ - Bug in :meth:`DataFrame.itertuples` with ``records`` orient raising an ``AttributeError`` when the ``DataFrame`` contained more than 255 columns (:issue:`24939`) - Bug in :meth:`DataFrame.itertuples` orient converting integer column names to strings prepended with an underscore (:issue:`24940`) @@ -30,7 +60,7 @@ Fixed Regressions .. _whatsnew_0241.enhancements: Enhancements -^^^^^^^^^^^^ +~~~~~~~~~~~~ .. _whatsnew_0241.bug_fixes: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f845a5437ded4..333626b309e3a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -231,10 +231,11 @@ def fast_unique_multiple(list arrays, sort: bool=True): if val not in table: table[val] = stub uniques.append(val) - if sort: + if sort is None: try: uniques.sort() except Exception: + # TODO: RuntimeWarning? 
pass return uniques diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 684a19c56c92f..6299fc482d0df 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -112,7 +112,7 @@ def _get_combined_index(indexes, intersect=False, sort=False): elif intersect: index = indexes[0] for other in indexes[1:]: - index = index.intersection(other, sort=sort) + index = index.intersection(other) else: index = _union_indexes(indexes, sort=sort) index = ensure_index(index) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3d176012df22b..2fa034670e885 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2245,18 +2245,37 @@ def _get_reconciled_name_object(self, other): return self._shallow_copy(name=name) return self - def union(self, other, sort=True): + def _validate_sort_keyword(self, sort): + if sort not in [None, False]: + raise ValueError("The 'sort' keyword only takes the values of " + "None or False; {0} was passed.".format(sort)) + + def union(self, other, sort=None): """ Form the union of two Index objects. Parameters ---------- other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). 
+ Returns ------- union : Index @@ -2269,6 +2288,7 @@ def union(self, other, sort=True): >>> idx1.union(idx2) Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other = ensure_index(other) @@ -2319,7 +2339,7 @@ def union(self, other, sort=True): else: result = lvals - if sort: + if sort is None: try: result = sorting.safe_sort(result) except TypeError as e: @@ -2342,14 +2362,19 @@ def intersection(self, other, sort=False): Parameters ---------- other : Index or array-like - sort : bool, default False - Sort the resulting index if possible + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 - Changed the default from ``True`` to ``False``. + Changed the default from ``True`` to ``False``, to match + the behaviour of 0.23.4 and earlier. Returns ------- @@ -2363,6 +2388,7 @@ def intersection(self, other, sort=False): >>> idx1.intersection(idx2) Int64Index([3, 4], dtype='int64') """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other = ensure_index(other) @@ -2402,7 +2428,7 @@ def intersection(self, other, sort=False): taken = other.take(indexer) - if sort: + if sort is None: taken = sorting.safe_sort(taken.values) if self.name != other.name: name = None @@ -2415,7 +2441,7 @@ def intersection(self, other, sort=False): return taken - def difference(self, other, sort=True): + def difference(self, other, sort=None): """ Return a new Index with elements from the index that are not in `other`. @@ -2425,11 +2451,22 @@ def difference(self, other, sort=True): Parameters ---------- other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + sort : False or None, default None + Whether to sort the resulting index. 
By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). + Returns ------- difference : Index @@ -2444,6 +2481,7 @@ def difference(self, other, sort=True): >>> idx1.difference(idx2, sort=False) Int64Index([2, 1], dtype='int64') """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) if self.equals(other): @@ -2460,7 +2498,7 @@ def difference(self, other, sort=True): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) - if sort: + if sort is None: try: the_diff = sorting.safe_sort(the_diff) except TypeError: @@ -2468,7 +2506,7 @@ def difference(self, other, sort=True): return this._shallow_copy(the_diff, name=result_name, freq=None) - def symmetric_difference(self, other, result_name=None, sort=True): + def symmetric_difference(self, other, result_name=None, sort=None): """ Compute the symmetric difference of two Index objects. @@ -2476,11 +2514,22 @@ def symmetric_difference(self, other, result_name=None, sort=True): ---------- other : Index or array-like result_name : str - sort : bool, default True - Sort the resulting index if possible + sort : False or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). 
+ Returns ------- symmetric_difference : Index @@ -2504,6 +2553,7 @@ def symmetric_difference(self, other, result_name=None, sort=True): >>> idx1 ^ idx2 Int64Index([1, 5], dtype='int64') """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, result_name_update = self._convert_can_do_setop(other) if result_name is None: @@ -2524,7 +2574,7 @@ def symmetric_difference(self, other, result_name=None, sort=True): right_diff = other.values.take(right_indexer) the_diff = _concat._concat_compat([left_diff, right_diff]) - if sort: + if sort is None: try: the_diff = sorting.safe_sort(the_diff) except TypeError: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ef941ab87ba12..9c46860eb49d6 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -602,19 +602,21 @@ def intersection(self, other, sort=False): Parameters ---------- other : DatetimeIndex or array-like - sort : bool, default True + sort : False or None, default False Sort the resulting index if possible. .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 - Changed the default from ``True`` to ``False``. + Changed the default to ``False`` to match the behaviour + from before 0.24.0. 
Returns ------- y : Index or DatetimeIndex """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) if self.equals(other): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 736de94991181..2c63fe33c57fe 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1093,7 +1093,7 @@ def equals(self, other): def overlaps(self, other): return self._data.overlaps(other) - def _setop(op_name, sort=True): + def _setop(op_name, sort=None): def func(self, other, sort=sort): other = self._as_like_interval_index(other) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 16af3fe8eef26..14975dbbefa63 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2879,30 +2879,47 @@ def equal_levels(self, other): return False return True - def union(self, other, sort=True): + def union(self, other, sort=None): """ Form the union of two MultiIndex objects Parameters ---------- other : MultiIndex or array / Index of tuples - sort : bool, default True - Sort the resulting MultiIndex if possible + sort : False or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). + Returns ------- Index >>> index.union(index2) """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): return self + # TODO: Index.union returns other when `len(self)` is 0. 
+ uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, other._ndarray_values], sort=sort) @@ -2917,19 +2934,21 @@ def intersection(self, other, sort=False): Parameters ---------- other : MultiIndex or array / Index of tuples - sort : bool, default True + sort : False or None, default False Sort the resulting MultiIndex if possible .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 - Changed the default from ``True`` to ``False``. + Changed the default from ``True`` to ``False``, to match + behaviour from before 0.24.0 Returns ------- Index """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, result_names = self._convert_can_do_setop(other) @@ -2940,7 +2959,7 @@ def intersection(self, other, sort=False): other_tuples = other._ndarray_values uniq_tuples = set(self_tuples) & set(other_tuples) - if sort: + if sort is None: uniq_tuples = sorted(uniq_tuples) if len(uniq_tuples) == 0: @@ -2951,22 +2970,28 @@ def intersection(self, other, sort=False): return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def difference(self, other, sort=True): + def difference(self, other, sort=None): """ Compute set difference of two MultiIndex objects Parameters ---------- other : MultiIndex - sort : bool, default True + sort : False or None, default None Sort the resulting MultiIndex if possible .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default value from ``True`` to ``None`` + (without change in behaviour). 
+ Returns ------- diff : MultiIndex """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) other, result_names = self._convert_can_do_setop(other) @@ -2986,7 +3011,7 @@ def difference(self, other, sort=True): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) difference = this.values.take(label_diff) - if sort: + if sort is None: difference = sorted(difference) if len(difference) == 0: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e17a6a682af40..5aafe9734b6a0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -350,19 +350,21 @@ def intersection(self, other, sort=False): Parameters ---------- other : Index or array-like - sort : bool, default True + sort : False or None, default False Sort the resulting index if possible .. versionadded:: 0.24.0 .. versionchanged:: 0.24.1 - Changed the default from ``True`` to ``False``. + Changed the default to ``False`` to match the behaviour + from before 0.24.0. 
Returns ------- intersection : Index """ + self._validate_sort_keyword(sort) if self.equals(other): return self._get_reconciled_name_object(other) @@ -405,7 +407,7 @@ def intersection(self, other, sort=False): if (self._step < 0 and other._step < 0) is not (new_index._step < 0): new_index = new_index[::-1] - if sort: + if sort is None: new_index = new_index.sort_values() return new_index diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 499f01f0e7f7b..e0259b83a8768 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -478,7 +478,7 @@ def test_union_base(self): with pytest.raises(TypeError, match=msg): first.union([1, 2, 3]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_base(self, sort): for name, idx in compat.iteritems(self.indices): first = idx[2:] diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index bd37cc815d0f7..19009e45ee83a 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -138,7 +138,7 @@ def test_intersection2(self): @pytest.mark.parametrize("tz", [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Pacific']) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, tz, sort): # GH 4690 (with tz) base = date_range('6/1/2000', '6/30/2000', freq='D', name='idx') @@ -187,7 +187,7 @@ def test_intersection(self, tz, sort): for (rng, expected) in [(rng2, expected2), (rng3, expected3), (rng4, expected4)]: result = base.intersection(rng, sort=sort) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) assert result.name == expected.name @@ -212,7 +212,7 @@ def test_intersection_bug_1708(self): assert len(result) == 0 @pytest.mark.parametrize("tz", tz) - @pytest.mark.parametrize("sort", [True, 
False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, tz, sort): rng_dates = ['1/2/2000', '1/3/2000', '1/1/2000', '1/4/2000', '1/5/2000'] @@ -233,11 +233,11 @@ def test_difference(self, tz, sort): (rng2, other2, expected2), (rng3, other3, expected3)]: result_diff = rng.difference(other, sort) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result_diff, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): # GH14323: difference of DatetimeIndex should not preserve frequency @@ -254,7 +254,7 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal('freq', idx_diff, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_datetimeindex_diff(self, sort): dti1 = date_range(freq='Q-JAN', start=datetime(1997, 12, 31), periods=100) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index db69258c1d3d2..f1fd06c9cab6e 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -783,19 +783,19 @@ def test_non_contiguous(self, closed): assert 1.5 not in index - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union(self, closed, sort): index = self.create_index(closed=closed) other = IntervalIndex.from_breaks(range(5, 13), closed=closed) expected = IntervalIndex.from_breaks(range(13), closed=closed) result = index[::-1].union(other, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) result = other[::-1].union(index, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) @@ -812,19 +812,19 @@ def 
test_union(self, closed, sort): result = index.union(other, sort=sort) tm.assert_index_equal(result, index) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, closed, sort): index = self.create_index(closed=closed) other = IntervalIndex.from_breaks(range(5, 13), closed=closed) expected = IntervalIndex.from_breaks(range(5, 11), closed=closed) result = index[::-1].intersection(other, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) result = other[::-1].intersection(index, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) @@ -842,14 +842,14 @@ def test_intersection(self, closed, sort): result = index.intersection(other, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference(self, closed, sort): index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], closed=closed) result = index.difference(index[:1], sort=sort) expected = index[1:] - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) @@ -864,19 +864,19 @@ def test_difference(self, closed, sort): result = index.difference(other, sort=sort) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(self, closed, sort): index = self.create_index(closed=closed) result = index[1:].symmetric_difference(index[:-1], sort=sort) expected = IntervalIndex([index[0], index[-1]]) - if sort: + if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) # GH 19101: empty result, same dtype result = index.symmetric_difference(index, sort=sort) expected = IntervalIndex(np.array([], dtype='int64'), 
closed=closed) - if sort: + if sort is None: tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) @@ -888,7 +888,7 @@ def test_symmetric_difference(self, closed, sort): @pytest.mark.parametrize('op_name', [ 'union', 'intersection', 'difference', 'symmetric_difference']) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_set_operation_errors(self, closed, op_name, sort): index = self.create_index(closed=closed) set_op = getattr(index, op_name) diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index 208d6cf1c639f..41a0e1e59e8a5 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize("case", [0.5, "xxx"]) -@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("sort", [None, False]) @pytest.mark.parametrize("method", ["intersection", "union", "difference", "symmetric_difference"]) def test_set_ops_error_cases(idx, case, sort, method): @@ -19,13 +19,13 @@ def test_set_ops_error_cases(idx, case, sort, method): getattr(idx, method)(case, sort=sort) -@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("sort", [None, False]) def test_intersection_base(idx, sort): first = idx[:5] second = idx[:3] intersect = first.intersection(second, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(intersect, second.sort_values()) assert tm.equalContents(intersect, second) @@ -34,7 +34,7 @@ def test_intersection_base(idx, sort): for klass in [np.array, Series, list]] for case in cases: result = first.intersection(case, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, second.sort_values()) assert tm.equalContents(result, second) @@ -43,13 +43,13 @@ def test_intersection_base(idx, sort): first.intersection([1, 2, 3], sort=sort) -@pytest.mark.parametrize("sort", [True, False]) 
+@pytest.mark.parametrize("sort", [None, False]) def test_union_base(idx, sort): first = idx[3:] second = idx[:5] everything = idx union = first.union(second, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(union, everything.sort_values()) assert tm.equalContents(union, everything) @@ -58,7 +58,7 @@ def test_union_base(idx, sort): for klass in [np.array, Series, list]] for case in cases: result = first.union(case, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, everything.sort_values()) assert tm.equalContents(result, everything) @@ -67,13 +67,13 @@ def test_union_base(idx, sort): first.union([1, 2, 3], sort=sort) -@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("sort", [None, False]) def test_difference_base(idx, sort): second = idx[4:] answer = idx[:4] result = idx.difference(second, sort=sort) - if sort: + if sort is None: answer = answer.sort_values() assert result.equals(answer) @@ -91,14 +91,14 @@ def test_difference_base(idx, sort): idx.difference([1, 2, 3], sort=sort) -@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(idx, sort): first = idx[1:] second = idx[:-1] answer = idx[[-1, 0]] result = first.symmetric_difference(second, sort=sort) - if sort: + if sort is None: answer = answer.sort_values() tm.assert_index_equal(result, answer) @@ -121,14 +121,14 @@ def test_empty(idx): assert idx[:0].empty -@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("sort", [None, False]) def test_difference(idx, sort): first = idx result = first.difference(idx[-3:], sort=sort) vals = idx[:-3].values - if sort: + if sort is None: vals = sorted(vals) expected = MultiIndex.from_tuples(vals, @@ -189,14 +189,62 @@ def test_difference(idx, sort): first.difference([1, 2, 3, 4, 5], sort=sort) -@pytest.mark.parametrize("sort", [True, False]) +def test_difference_sort_special(): + # GH-24959 + idx = 
pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + # sort=None, the default + result = idx.difference([]) + tm.assert_index_equal(result, idx) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_difference_sort_special_true(): + # TODO decide on True behaviour + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + result = idx.difference([], sort=True) + expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(result, expected) + + +def test_difference_sort_incomparable(): + # GH-24959 + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], + ['a', 'b']]) + + other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], + ['c', 'd']]) + # sort=None, the default + # MultiIndex.difference deviates here from other difference + # implementations in not catching the TypeError + with pytest.raises(TypeError): + result = idx.difference(other) + + # sort=False + result = idx.difference(other, sort=False) + tm.assert_index_equal(result, idx) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_difference_sort_incomparable_true(): + # TODO decide on True behaviour + # # sort=True, raises + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], + ['a', 'b']]) + other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], + ['c', 'd']]) + + with pytest.raises(TypeError): + idx.difference(other, sort=True) + + +@pytest.mark.parametrize("sort", [None, False]) def test_union(idx, sort): piece1 = idx[:5][::-1] piece2 = idx[3:] the_union = piece1.union(piece2, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(the_union, idx.sort_values()) assert tm.equalContents(the_union, idx) @@ -225,14 +273,14 @@ def test_union(idx, sort): # assert result.equals(result2) -@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("sort", [None, False]) def test_intersection(idx, sort): piece1 = idx[:5][::-1] piece2 = idx[3:] the_int = piece1.intersection(piece2, sort=sort) - if sort: + if sort 
is None: tm.assert_index_equal(the_int, idx[3:5]) assert tm.equalContents(the_int, idx[3:5]) @@ -249,3 +297,76 @@ def test_intersection(idx, sort): # tuples = _index.values # result = _index & tuples # assert result.equals(tuples) + + +def test_intersect_equal_sort(): + # GH-24959 + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=None), idx) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_intersect_equal_sort_true(): + # TODO decide on True behaviour + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + sorted_ = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + + +@pytest.mark.parametrize('slice_', [slice(None), slice(0)]) +def test_union_sort_other_empty(slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + # MultiIndex does not special case empty.union(idx) + # tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + +@pytest.mark.xfail(reason="Not implemented.") +def test_union_sort_other_empty_sort(slice_): + # TODO decide on True behaviour + # # sort=True + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + other = idx[:0] + result = idx.union(other, sort=True) + expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(result, expected) + + +def test_union_sort_other_incomparable(): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) + + # default, sort=None + result = idx.union(idx[:1]) + tm.assert_index_equal(result, idx) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) + 
+ +@pytest.mark.xfail(reason="Not implemented.") +def test_union_sort_other_incomparable_sort(): + # TODO decide on True behaviour + # # sort=True + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) + with pytest.raises(TypeError, match='Cannot compare'): + idx.union(idx[:1], sort=True) + + +@pytest.mark.parametrize("method", ['union', 'intersection', 'difference', + 'symmetric_difference']) +def test_setops_disallow_true(method): + idx1 = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) + idx2 = pd.MultiIndex.from_product([['b', 'c'], [1, 2]]) + + with pytest.raises(ValueError, match="The 'sort' keyword only takes"): + getattr(idx1, method)(idx2, sort=True) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 464ff7aa5d58d..dc9a32d75d272 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -77,7 +77,7 @@ def test_no_millisecond_field(self): with pytest.raises(AttributeError): DatetimeIndex([]).millisecond - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): # GH14323: difference of Period MUST preserve frequency # but the ability to union results must be preserved diff --git a/pandas/tests/indexes/period/test_setops.py b/pandas/tests/indexes/period/test_setops.py index a97ab47bcda16..bf29edad4841e 100644 --- a/pandas/tests/indexes/period/test_setops.py +++ b/pandas/tests/indexes/period/test_setops.py @@ -38,7 +38,7 @@ def test_join_does_not_recur(self): df.columns[0], df.columns[1]], object) tm.assert_index_equal(res, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): # union other1 = pd.period_range('1/1/2000', freq='D', periods=5) @@ -97,11 +97,11 @@ def test_union(self, sort): (rng8, other8, expected8)]: result_union = rng.union(other, sort=sort) - if sort: + if 
sort is None: expected = expected.sort_values() tm.assert_index_equal(result_union, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union_misc(self, sort): index = period_range('1/1/2000', '1/20/2000', freq='D') @@ -110,7 +110,7 @@ def test_union_misc(self, sort): # not in order result = _permute(index[:-5]).union(_permute(index[10:]), sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, index) assert tm.equalContents(result, index) @@ -139,7 +139,7 @@ def test_union_dataframe_index(self): exp = pd.period_range('1/1/1980', '1/1/2012', freq='M') tm.assert_index_equal(df.index, exp) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): index = period_range('1/1/2000', '1/20/2000', freq='D') @@ -150,7 +150,7 @@ def test_intersection(self, sort): left = _permute(index[:-5]) right = _permute(index[10:]) result = left.intersection(right, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, index[10:-5]) assert tm.equalContents(result, index[10:-5]) @@ -164,7 +164,7 @@ def test_intersection(self, sort): with pytest.raises(period.IncompatibleFrequency): index.intersection(index3, sort=sort) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection_cases(self, sort): base = period_range('6/1/2000', '6/30/2000', freq='D', name='idx') @@ -210,7 +210,7 @@ def test_intersection_cases(self, sort): for (rng, expected) in [(rng2, expected2), (rng3, expected3), (rng4, expected4)]: result = base.intersection(rng, sort=sort) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) assert result.name == expected.name @@ -224,7 +224,7 @@ def test_intersection_cases(self, sort): result = rng.intersection(rng[0:0]) assert len(result) == 0 - @pytest.mark.parametrize("sort", [True, False]) + 
@pytest.mark.parametrize("sort", [None, False]) def test_difference(self, sort): # diff period_rng = ['1/3/2000', '1/2/2000', '1/1/2000', '1/5/2000', @@ -276,6 +276,6 @@ def test_difference(self, sort): (rng6, other6, expected6), (rng7, other7, expected7), ]: result_difference = rng.difference(other, sort=sort) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result_difference, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 20e439de46bde..c99007cef90d4 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -3,6 +3,7 @@ from collections import defaultdict from datetime import datetime, timedelta import math +import operator import sys import numpy as np @@ -684,12 +685,12 @@ def test_empty_fancy_raises(self, attr): # np.ndarray only accepts ndarray of int & bool dtypes, so should Index pytest.raises(IndexError, index.__getitem__, empty_farr) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): first = self.strIndex[:20] second = self.strIndex[:10] intersect = first.intersection(second, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(intersect, second.sort_values()) assert tm.equalContents(intersect, second) @@ -701,7 +702,7 @@ def test_intersection(self, sort): (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name (Index([3, 4, 5, 6, 7], name="other"), False), # drop diff names (Index([3, 4, 5, 6, 7]), False)]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection_name_preservation(self, index2, keeps_name, sort): index1 = Index([1, 2, 3, 4, 5], name='index') expected = Index([3, 4, 5]) @@ -715,7 +716,7 @@ def test_intersection_name_preservation(self, index2, keeps_name, sort): @pytest.mark.parametrize("first_name,second_name,expected_name", [ ('A', 'A', 'A'), ('A', 
'B', None), (None, 'B', None)]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection_name_preservation2(self, first_name, second_name, expected_name, sort): first = self.strIndex[5:20] @@ -728,7 +729,7 @@ def test_intersection_name_preservation2(self, first_name, second_name, @pytest.mark.parametrize("index2,keeps_name", [ (Index([4, 7, 6, 5, 3], name='index'), True), (Index([4, 7, 6, 5, 3], name='other'), False)]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection_monotonic(self, index2, keeps_name, sort): index1 = Index([5, 3, 2, 4, 1], name='index') expected = Index([5, 3, 4]) @@ -737,25 +738,25 @@ def test_intersection_monotonic(self, index2, keeps_name, sort): expected.name = "index" result = index1.intersection(index2, sort=sort) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) @pytest.mark.parametrize("index2,expected_arr", [ (Index(['B', 'D']), ['B']), (Index(['B', 'D', 'A']), ['A', 'B', 'A'])]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique index1 = Index(['A', 'B', 'A', 'C']) expected = Index(expected_arr, dtype='object') result = index1.intersection(index2, sort=sort) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersect_str_dates(self, sort): dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)] @@ -770,7 +771,19 @@ def test_intersect_nosort(self): expected = pd.Index(['b', 'a']) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + def test_intersection_equal_sort(self): + idx = pd.Index(['c', 
'a', 'b']) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=None), idx) + + @pytest.mark.xfail(reason="Not implemented") + def test_intersection_equal_sort_true(self): + # TODO decide on True behaviour + idx = pd.Index(['c', 'a', 'b']) + sorted_ = pd.Index(['a', 'b', 'c']) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + + @pytest.mark.parametrize("sort", [None, False]) def test_chained_union(self, sort): # Chained unions handles names correctly i1 = Index([1, 2], name='i1') @@ -787,7 +800,7 @@ def test_chained_union(self, sort): expected = j1.union(j2, sort=sort).union(j3, sort=sort) tm.assert_index_equal(union, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union(self, sort): # TODO: Replace with fixturesult first = self.strIndex[5:20] @@ -795,13 +808,65 @@ def test_union(self, sort): everything = self.strIndex[:20] union = first.union(second, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(union, everything.sort_values()) assert tm.equalContents(union, everything) + @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + def test_union_sort_other_special(self, slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + + idx = pd.Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + @pytest.mark.xfail(reason="Not implemented") + @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + def test_union_sort_special_true(self, slice_): + # TODO decide on True behaviour + # sort=True + idx = pd.Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + + result = idx.union(other, sort=True) + expected = pd.Index([0, 1, 2]) + tm.assert_index_equal(result, expected) + + def 
test_union_sort_other_incomparable(self): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.Index([1, pd.Timestamp('2000')]) + # default (sort=None) + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1]) + + tm.assert_index_equal(result, idx) + + # sort=None + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1], sort=None) + tm.assert_index_equal(result, idx) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) + + @pytest.mark.xfail(reason="Not implemented") + def test_union_sort_other_incomparable_true(self): + # TODO decide on True behaviour + # sort=True + idx = pd.Index([1, pd.Timestamp('2000')]) + with pytest.raises(TypeError, match='.*'): + idx.union(idx[:1], sort=True) + @pytest.mark.parametrize("klass", [ np.array, Series, list]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union_from_iterables(self, klass, sort): # GH 10149 # TODO: Replace with fixturesult @@ -811,29 +876,30 @@ def test_union_from_iterables(self, klass, sort): case = klass(second.values) result = first.union(case, sort=sort) - if sort: + if sort is None: tm.assert_index_equal(result, everything.sort_values()) assert tm.equalContents(result, everything) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union_identity(self, sort): # TODO: replace with fixturesult first = self.strIndex[5:20] union = first.union(first, sort=sort) - assert union is first + # i.e. 
identity is not preserved when sort is True + assert (union is first) is (not sort) union = first.union([], sort=sort) - assert union is first + assert (union is first) is (not sort) union = Index([]).union(first, sort=sort) - assert union is first + assert (union is first) is (not sort) @pytest.mark.parametrize("first_list", [list('ba'), list()]) @pytest.mark.parametrize("second_list", [list('ab'), list()]) @pytest.mark.parametrize("first_name, second_name, expected_name", [ ('A', 'B', None), (None, 'B', None), ('A', None, None)]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union_name_preservation(self, first_list, second_list, first_name, second_name, expected_name, sort): first = Index(first_list, name=first_name) @@ -842,14 +908,14 @@ def test_union_name_preservation(self, first_list, second_list, first_name, vals = set(first_list).union(second_list) - if sort and len(first_list) > 0 and len(second_list) > 0: + if sort is None and len(first_list) > 0 and len(second_list) > 0: expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: expected = Index(vals, name=expected_name) assert tm.equalContents(union, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_union_dt_as_obj(self, sort): # TODO: Replace with fixturesult firstCat = self.strIndex.union(self.dateIndex) @@ -866,6 +932,15 @@ def test_union_dt_as_obj(self, sort): tm.assert_contains_all(self.strIndex, secondCat) tm.assert_contains_all(self.dateIndex, firstCat) + @pytest.mark.parametrize("method", ['union', 'intersection', 'difference', + 'symmetric_difference']) + def test_setops_disallow_true(self, method): + idx1 = pd.Index(['a', 'b']) + idx2 = pd.Index(['b', 'c']) + + with pytest.raises(ValueError, match="The 'sort' keyword only takes"): + getattr(idx1, method)(idx2, sort=True) + def test_map_identity_mapping(self): # GH 12766 # TODO: 
replace with fixture @@ -987,7 +1062,7 @@ def test_append_empty_preserve_name(self, name, expected): @pytest.mark.parametrize("second_name,expected", [ (None, None), ('name', 'name')]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_name_preservation(self, second_name, expected, sort): # TODO: replace with fixturesult first = self.strIndex[5:20] @@ -1005,7 +1080,7 @@ def test_difference_name_preservation(self, second_name, expected, sort): else: assert result.name == expected - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_empty_arg(self, sort): first = self.strIndex[5:20] first.name == 'name' @@ -1014,7 +1089,7 @@ def test_difference_empty_arg(self, sort): assert tm.equalContents(result, first) assert result.name == first.name - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_identity(self, sort): first = self.strIndex[5:20] first.name == 'name' @@ -1023,7 +1098,7 @@ def test_difference_identity(self, sort): assert len(result) == 0 assert result.name == first.name - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_sort(self, sort): first = self.strIndex[5:20] second = self.strIndex[:10] @@ -1031,12 +1106,12 @@ def test_difference_sort(self, sort): result = first.difference(second, sort) expected = self.strIndex[10:20] - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference(self, sort): # smoke index1 = Index([5, 2, 3, 4], name='index1') @@ -1045,7 +1120,7 @@ def test_symmetric_difference(self, sort): expected = Index([5, 1]) assert tm.equalContents(result, expected) assert result.name is None - if sort: + if 
sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) @@ -1054,13 +1129,43 @@ def test_symmetric_difference(self, sort): assert tm.equalContents(result, expected) assert result.name is None - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + def test_difference_incomparable(self, opname): + a = pd.Index([3, pd.Timestamp('2000'), 1]) + b = pd.Index([2, pd.Timestamp('1999'), 1]) + op = operator.methodcaller(opname, b) + + # sort=None, the default + result = op(a) + expected = pd.Index([3, pd.Timestamp('2000'), 2, pd.Timestamp('1999')]) + if opname == 'difference': + expected = expected[:2] + tm.assert_index_equal(result, expected) + + # sort=False + op = operator.methodcaller(opname, b, sort=False) + result = op(a) + tm.assert_index_equal(result, expected) + + @pytest.mark.xfail(reason="Not implemented") + @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + def test_difference_incomparable_true(self, opname): + # TODO decide on True behaviour + # # sort=True, raises + a = pd.Index([3, pd.Timestamp('2000'), 1]) + b = pd.Index([2, pd.Timestamp('1999'), 1]) + op = operator.methodcaller(opname, b, sort=True) + + with pytest.raises(TypeError, match='Cannot compare'): + op(a) + + @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_mi(self, sort): index1 = MultiIndex.from_tuples(self.tuples) index2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)]) result = index1.symmetric_difference(index2, sort=sort) expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) assert tm.equalContents(result, expected) @@ -1068,18 +1173,18 @@ def test_symmetric_difference_mi(self, sort): @pytest.mark.parametrize("index2,expected", [ (Index([0, 1, np.nan]), Index([2.0, 3.0, 0.0])), (Index([0, 1]), Index([np.nan, 2.0, 
3.0, 0.0]))]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_missing(self, index2, expected, sort): # GH 13514 change: {nan} - {nan} == {} # (GH 6444, sorting of nans, is no longer an issue) index1 = Index([1, np.nan, 2, 3]) result = index1.symmetric_difference(index2, sort=sort) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_symmetric_difference_non_index(self, sort): index1 = Index([1, 2, 3, 4], name='index1') index2 = np.array([2, 3, 4, 5]) @@ -1093,7 +1198,7 @@ def test_symmetric_difference_non_index(self, sort): assert tm.equalContents(result, expected) assert result.name == 'new_name' - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_type(self, sort): # GH 20040 # If taking difference of a set and itself, it @@ -1104,7 +1209,7 @@ def test_difference_type(self, sort): expected = index.drop(index) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection_difference(self, sort): # GH 20040 # Test that the intersection of an index with an @@ -1607,11 +1712,11 @@ def test_drop_tuple(self, values, to_drop): ('intersection', np.array([(1, 'A'), (1, 'B'), (2, 'A'), (2, 'B')], dtype=[('num', int), ('let', 'a1')]), - True), + None), ('union', np.array([(1, 'A'), (1, 'B'), (1, 'C'), (2, 'A'), (2, 'B'), (2, 'C')], dtype=[('num', int), ('let', 'a1')]), - True) + None) ]) def test_tuple_union_bug(self, method, expected, sort): index1 = Index(np.array([(1, 'A'), (2, 'A'), (1, 'B'), (2, 'B')], @@ -2259,20 +2364,20 @@ def test_unique_na(self): result = idx.unique() tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("sort", [True, False]) + 
@pytest.mark.parametrize("sort", [None, False]) def test_intersection_base(self, sort): # (same results for py2 and py3 but sortedness not tested elsewhere) index = self.create_index() first = index[:5] second = index[:3] - expected = Index([0, 1, 'a']) if sort else Index([0, 'a', 1]) + expected = Index([0, 1, 'a']) if sort is None else Index([0, 'a', 1]) result = first.intersection(second, sort=sort) tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [ np.array, Series, list]) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection_different_type_base(self, klass, sort): # GH 10149 index = self.create_index() @@ -2282,7 +2387,7 @@ def test_intersection_different_type_base(self, klass, sort): result = first.intersection(klass(second.values), sort=sort) assert tm.equalContents(result, second) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_base(self, sort): # (same results for py2 and py3 but sortedness not tested elsewhere) index = self.create_index() @@ -2291,7 +2396,7 @@ def test_difference_base(self, sort): result = first.difference(second, sort) expected = Index([0, 'a', 1]) - if sort: + if sort is None: expected = Index(safe_sort(expected)) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index bbd1e0ccc19b1..96cf83d477376 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -503,7 +503,7 @@ def test_join_self(self): joined = self.index.join(self.index, how=kind) assert self.index is joined - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_intersection(self, sort): # intersect with Int64Index other = Index(np.arange(1, 6)) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py 
b/pandas/tests/indexes/timedeltas/test_timedelta.py index 547366ec79094..79210705103ab 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -51,7 +51,7 @@ def test_fillna_timedelta(self): [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) tm.assert_index_equal(idx.fillna('x'), exp) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_freq(self, sort): # GH14323: Difference of TimedeltaIndex should not preserve frequency @@ -69,7 +69,7 @@ def test_difference_freq(self, sort): tm.assert_index_equal(idx_diff, expected) tm.assert_attr_equal('freq', idx_diff, expected) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, False]) def test_difference_sort(self, sort): index = pd.TimedeltaIndex(["5 days", "3 days", "2 days", "4 days", @@ -80,7 +80,7 @@ def test_difference_sort(self, sort): expected = TimedeltaIndex(["5 days", "0 days"], freq=None) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(idx_diff, expected) @@ -90,7 +90,7 @@ def test_difference_sort(self, sort): idx_diff = index.difference(other, sort) expected = TimedeltaIndex(["1 days", "0 days"], freq=None) - if sort: + if sort is None: expected = expected.sort_values() tm.assert_index_equal(idx_diff, expected) From 627b17acbb96f77bfc50e1d9336e642b961152a8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 1 Feb 2019 16:06:26 -0600 Subject: [PATCH 22/30] trigger azure From bc405ce339fb3c761bb20d26f5ac15b44834f20d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 1 Feb 2019 23:15:04 -0800 Subject: [PATCH 23/30] Backport PR #25084: DOC: Cleanup 0.24.1 whatsnew (#25086) --- doc/source/whatsnew/v0.24.1.rst | 48 ++------------------------------- 1 file changed, 2 insertions(+), 46 deletions(-) diff --git 
a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 1ee4397960b0b..bc45a14666ba2 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -13,7 +13,7 @@ Whats New in 0.24.1 (February XX, 2019) {{ header }} These are the changes in pandas 0.24.1. See :ref:`release` for a full changelog -including other versions of pandas. +including other versions of pandas. See :ref:`whatsnew_0240` for the 0.24.0 changelog. .. _whatsnew_0241.api: @@ -57,52 +57,9 @@ Fixed Regressions - Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`). - Fixed regression in :meth:`Series.rename_axis` and :meth:`DataFrame.rename_axis` where passing ``None`` failed to remove the axis name (:issue:`25034`) -.. _whatsnew_0241.enhancements: - -Enhancements -~~~~~~~~~~~~ - - -.. _whatsnew_0241.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -**Conversion** - -- -- -- - -**Indexing** - -- -- -- - -**I/O** - -- -- -- - -**Categorical** - -- -- -- - -**Timezones** - -- -- -- - **Timedelta** + - Bug in :func:`to_timedelta` with `box=False` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`) -- -- -- **Reshaping** @@ -116,7 +73,6 @@ Bug Fixes **Other** - Fixed AttributeError when printing a DataFrame's HTML repr after accessing the IPython config object (:issue:`25036`) -- .. 
_whatsnew_0.241.contributors: From 02db6ec5085f125f698f641fef17f5478bd96645 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 1 Feb 2019 23:15:34 -0800 Subject: [PATCH 24/30] Backport PR #25026: DOC: Start 0.24.2.rst (#25073) --- doc/source/whatsnew/v0.24.2.rst | 99 +++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 doc/source/whatsnew/v0.24.2.rst diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst new file mode 100644 index 0000000000000..cba21ce7ee1e6 --- /dev/null +++ b/doc/source/whatsnew/v0.24.2.rst @@ -0,0 +1,99 @@ +:orphan: + +.. _whatsnew_0242: + +Whats New in 0.24.2 (February XX, 2019) +--------------------------------------- + +.. warning:: + + The 0.24.x series of releases will be the last to support Python 2. Future feature + releases will support Python 3 only. See :ref:`install.dropping-27` for more. + +{{ header }} + +These are the changes in pandas 0.24.2. See :ref:`release` for a full changelog +including other versions of pandas. + +.. _whatsnew_0242.regressions: + +Fixed Regressions +^^^^^^^^^^^^^^^^^ + +- +- +- + +.. _whatsnew_0242.enhancements: + +Enhancements +^^^^^^^^^^^^ + +- +- + +.. _whatsnew_0242.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Conversion** + +- +- +- + +**Indexing** + +- +- +- + +**I/O** + +- +- +- + +**Categorical** + +- +- +- + +**Timezones** + +- +- +- + +**Timedelta** + +- +- +- + +**Reshaping** + +- +- +- + +**Visualization** + +- +- +- + +**Other** + +- +- +- + +.. _whatsnew_0.242.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.24.1..v0.24.2 \ No newline at end of file From ff34d2e379c327ac8e3072dca0a81b4c69eb2984 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 2 Feb 2019 06:09:47 -0600 Subject: [PATCH 25/30] trigger azure From 330b343edbd2695be15c188efad3c7e7f8350887 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 3 Feb 2019 14:15:59 +0100 Subject: [PATCH 26/30] Keep all tests from #24984; xfail where necessary --- pandas/tests/frame/conftest.py | 7 +- pandas/tests/frame/test_alter_axes.py | 179 +++++++++++++++++++++++--- 2 files changed, 164 insertions(+), 22 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 1838339ae0ed8..377e737a53158 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -209,12 +209,13 @@ def frame_of_index_cols(): """ Fixture for DataFrame of columns that can be used for indexing - Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but - are jointly unique), the rest are unique. + Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')]; + 'A' & 'B' contain duplicates (but are jointly unique), the rest are unique. """ df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], 'B': ['one', 'two', 'three', 'one', 'two'], 'C': ['a', 'b', 'c', 'd', 'e'], 'D': np.random.randn(5), - 'E': np.random.randn(5)}) + 'E': np.random.randn(5), + ('tuple', 'as', 'label'): np.random.randn(5)}) return df diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 2fa505f8eb389..0393c5ad54b60 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -47,7 +47,8 @@ def test_set_index_cast(self): tm.assert_frame_equal(df, df2) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], + ('tuple', 'as', 'label')]) @pytest.mark.parametrize('inplace', [True, False]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_drop_inplace(self, frame_of_index_cols, @@ -70,7 +71,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) + 
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], + ('tuple', 'as', 'label')]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_append(self, frame_of_index_cols, drop, keys): df = frame_of_index_cols @@ -86,7 +88,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys): tm.assert_frame_equal(result, expected) # A has duplicate values, C does not - @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']]) + @pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'], + ('tuple', 'as', 'label')]) @pytest.mark.parametrize('drop', [True, False]) def test_set_index_append_to_multiindex(self, frame_of_index_cols, drop, keys): @@ -128,7 +131,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, key = box(df['B']) if box == list: # list of strings gets interpreted as list of keys - msg = "'one'" + msg = "['one', 'two', 'three', 'one', 'two']" with pytest.raises(KeyError, match=msg): df.set_index(key, drop=drop, append=append) else: @@ -159,8 +162,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, df.index.name = index_name keys = ['A', box(df['B'])] - # np.array and list "forget" the name of B - names = ['A', None if box in [np.array, list] else 'B'] + # np.array/list "forget" the name of B + names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] result = df.set_index(keys, drop=drop, append=append) @@ -193,18 +196,22 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # == gives ambiguous Boolean for Series if drop and keys[0] is 'A' and keys[1] is 'A': - with pytest.raises(KeyError, match='.*'): - df.set_index(keys, drop=drop, append=append) - else: - result = df.set_index(keys, drop=drop, append=append) + print('hello') + pytest.xfail(reason='broken due to reversion, see GH 25085') - # to test against already-tested behavior, we add sequentially, - # hence second append always True; must wrap in list, otherwise - # list-box will be illegal - expected = 
df.set_index([keys[0]], drop=drop, append=append) - expected = expected.set_index([keys[1]], drop=drop, append=True) + result = df.set_index(keys, drop=drop, append=append) - tm.assert_frame_equal(result, expected) + # need to adapt first drop for case that both keys are 'A' -- + # cannot drop the same column twice; + # use "is" because == would give ambiguous Boolean error for containers + first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop + + # to test against already-tested behaviour, we add sequentially, + # hence second append always True; must wrap keys in list, otherwise + # box = list would be interpreted as keys + expected = df.set_index([keys[0]], drop=first_drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) @@ -231,16 +238,150 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) - def test_set_index_raise(self, frame_of_index_cols, drop, append): + def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): df = frame_of_index_cols - with pytest.raises(KeyError, match='.*'): # column names are A-E + with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"): + # column names are A-E, as well as one tuple df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) # non-existent key in list with arrays - with pytest.raises(KeyError, match='.*'): + with pytest.raises(KeyError, match='X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) + msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]" + # tuples always raise KeyError + with pytest.raises(KeyError, match=msg): + df.set_index(tuple(df['A']), drop=drop, append=append) + + # also within a list + with pytest.raises(KeyError, match=msg): + df.set_index(['A', df['A'], tuple(df['A'])], 
+ drop=drop, append=append) + + @pytest.mark.xfail(reason='broken due to revert, see GH 25085') + @pytest.mark.parametrize('append', [True, False]) + @pytest.mark.parametrize('drop', [True, False]) + @pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)], + ids=['set', 'iter', 'generator']) + def test_set_index_raise_on_type(self, frame_of_index_cols, box, + drop, append): + df = frame_of_index_cols + + msg = 'The parameter "keys" may be a column key, .*' + # forbidden type, e.g. set/iter/generator + with pytest.raises(TypeError, match=msg): + df.set_index(box(df['A']), drop=drop, append=append) + + # forbidden type in list, e.g. set/iter/generator + with pytest.raises(TypeError, match=msg): + df.set_index(['A', df['A'], box(df['A'])], + drop=drop, append=append) + + def test_set_index_custom_label_type(self): + # GH 24969 + + class Thing(object): + def __init__(self, name, color): + self.name = name + self.color = color + + def __str__(self): + return "" % (self.name,) + + # necessary for pretty KeyError + __repr__ = __str__ + + thing1 = Thing('One', 'red') + thing2 = Thing('Two', 'blue') + df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) + expected = DataFrame({thing1: [0, 1]}, + index=Index([2, 3], name=thing2)) + + # use custom label directly + result = df.set_index(thing2) + tm.assert_frame_equal(result, expected) + + # custom label wrapped in list + result = df.set_index([thing2]) + tm.assert_frame_equal(result, expected) + + # missing key + thing3 = Thing('Three', 'pink') + msg = "" + with pytest.raises(KeyError, match=msg): + # missing label directly + df.set_index(thing3) + + with pytest.raises(KeyError, match=msg): + # missing label in list + df.set_index([thing3]) + + def test_set_index_custom_label_hashable_iterable(self): + # GH 24969 + + # actual example discussed in GH 24984 was e.g. for shapely.geometry + # objects (e.g. 
a collection of Points) that can be both hashable and + # iterable; using frozenset as a stand-in for testing here + + class Thing(frozenset): + # need to stabilize repr for KeyError (due to random order in sets) + def __repr__(self): + tmp = sorted(list(self)) + # double curly brace prints one brace in format string + return "frozenset({{{}}})".format(', '.join(map(repr, tmp))) + + thing1 = Thing(['One', 'red']) + thing2 = Thing(['Two', 'blue']) + df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) + expected = DataFrame({thing1: [0, 1]}, + index=Index([2, 3], name=thing2)) + + # use custom label directly + result = df.set_index(thing2) + tm.assert_frame_equal(result, expected) + + # custom label wrapped in list + result = df.set_index([thing2]) + tm.assert_frame_equal(result, expected) + + # missing key + thing3 = Thing(['Three', 'pink']) + msg = '.*' # due to revert, see GH 25085 + with pytest.raises(KeyError, match=msg): + # missing label directly + df.set_index(thing3) + + with pytest.raises(KeyError, match=msg): + # missing label in list + df.set_index([thing3]) + + def test_set_index_custom_label_type_raises(self): + # GH 24969 + + # purposefully inherit from something unhashable + class Thing(set): + def __init__(self, name, color): + self.name = name + self.color = color + + def __str__(self): + return "" % (self.name,) + + thing1 = Thing('One', 'red') + thing2 = Thing('Two', 'blue') + df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) + + msg = 'unhashable type.*' + + with pytest.raises(TypeError, match=msg): + # use custom label directly + df.set_index(thing2) + + with pytest.raises(TypeError, match=msg): + # custom label wrapped in list + df.set_index([thing2]) + def test_construction_with_categorical_index(self): ci = tm.makeCategoricalIndex(10) ci.name = 'B' From 963a813e700e35c40aed38633ea69dc69aa852ae Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 3 Feb 2019 14:22:37 +0100 Subject: [PATCH 27/30] Remove stray debugging line --- pandas/tests/frame/test_alter_axes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 0393c5ad54b60..ec297e21c7291 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -196,7 +196,6 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # == gives ambiguous Boolean for Series if drop and keys[0] is 'A' and keys[1] is 'A': - print('hello') pytest.xfail(reason='broken due to reversion, see GH 25085') result = df.set_index(keys, drop=drop, append=append) From 4db484919fcc604e0430f52e80340a7560745b29 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 3 Feb 2019 17:42:33 +0100 Subject: [PATCH 28/30] Add whatsnew --- doc/source/whatsnew/v0.24.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index bc45a14666ba2..57c707b5ffbe5 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -64,6 +64,7 @@ Fixed Regressions **Reshaping** - Bug in :meth:`DataFrame.groupby` with :class:`Grouper` when there is a time change (DST) and grouping frequency is ``'1d'`` (:issue:`24972`) +- Fixed regression where custom hashable types could not be used as column keys in :meth:`DataFrame.set_index` (:issue:`24969`) **Visualization** From ff62753e99bbb4f6df2cbdc70c90e0ce73d840ab Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 3 Feb 2019 18:53:49 +0100 Subject: [PATCH 29/30] Re-add reverted 0.24.0 whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 711b35f854035..a49ea2cf493a6 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1195,6 +1195,8 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types, + and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). From 65c788083d7fedfee838740a603285d3d7a08098 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Sun, 3 Feb 2019 19:38:38 +0100 Subject: [PATCH 30/30] Re-add handling for duplicate drops --- pandas/core/frame.py | 3 ++- pandas/tests/frame/test_alter_axes.py | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 07d5efecae444..b6d1b5157c6c1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4189,7 +4189,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, raise ValueError('Index has duplicate keys: {dup}'.format( dup=duplicates)) - for c in to_remove: + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): del frame[c] # clear up memory usage diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index ec297e21c7291..cc3687f856b4e 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -193,11 +193,6 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, df.index.name = index_name keys = [box1(df['A']), box2(df['A'])] - - # == gives ambiguous Boolean for Series - if drop and keys[0] is 'A' and keys[1] is 'A': - pytest.xfail(reason='broken due to reversion, see GH 25085') - result = df.set_index(keys, drop=drop, append=append) # need to adapt first drop for case that both keys are 'A' --