From 1940cfd6b387ca427c46b02c0e547250cbc6ac10 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 02:37:55 +0200 Subject: [PATCH 01/14] fix #24972 --- pandas/core/resample.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6822225273906..0b575c1082d14 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1413,6 +1413,16 @@ def _get_time_bins(self, ax): ambiguous='infer', nonexistent='shift_forward') + # GH #24972 + # In edge case of tz-aware grouping binner last index can be + # less than the ax.max() variable in data object, this happens + # because of normalization + if len(binner) > 1 and binner[-1] < ax.max(): + extra_date_range = pd.date_range(binner[-1], ax.max() + self.freq, + freq=self.freq, tz=binner[-1].tz, + name=ax.name) + binner = labels = binner.append(extra_date_range[1:]) + ax_values = ax.asi8 binner, bin_edges = self._adjust_bin_edges(binner, ax_values) From dd4220069283646fa5363d91e13e06492fe95b55 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 02:39:04 +0200 Subject: [PATCH 02/14] improve comment --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0b575c1082d14..6a7cfa3cc14ad 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1416,7 +1416,7 @@ def _get_time_bins(self, ax): # GH #24972 # In edge case of tz-aware grouping binner last index can be # less than the ax.max() variable in data object, this happens - # because of normalization + # because of normalization and DST time change if len(binner) > 1 and binner[-1] < ax.max(): extra_date_range = pd.date_range(binner[-1], ax.max() + self.freq, freq=self.freq, tz=binner[-1].tz, From f8cb89ad8a3b633be5df8b1dd6f8c668d4fa5a46 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 02:41:40 +0200 Subject: [PATCH 03/14] add entry to 
what's new --- doc/source/whatsnew/v0.24.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 3ac2ed73ea53f..6d2176dbe9bfc 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -66,7 +66,7 @@ Bug Fixes **Reshaping** - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - +Bug in ``DataFrame.groupby(pd.Grouper(freq='1d'))`` when there is a time change (DST) and grouping frequecy is 1d (:issue:`24972`) **Other** - From eccca8ecbda9a93adcd5a2701b04130977ddeff5 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 02:58:03 +0200 Subject: [PATCH 04/14] add test for issue 24972 --- pandas/tests/groupby/test_groupby.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 98c917a6eca3c..3859503cbd403 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1744,3 +1744,19 @@ def test_groupby_agg_ohlc_non_first(): result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) tm.assert_frame_equal(result, expected) + + +def test_groupby_with_dst_time_change(): + # GH 24972 + import pandas as pd + index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], + tz='UTC').tz_convert('America/Chicago') + + df = pd.DataFrame([1, 2], index=index) + result = df.groupby(pd.Grouper(freq='1d')).last() + expected_index_values = pd.date_range('2016-11-02', '2016-11-24', + freq='d', tz='America/Chicago') + + index = pd.DatetimeIndex(expected_index_values) + expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) + assert_frame_equal(result, expected) From 0454dc345b9eae1f2226369719659203b329dbea Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 03:10:42 +0200 Subject: [PATCH 05/14] remove import 
--- pandas/tests/groupby/test_groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3859503cbd403..88ed77ca62d67 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1748,7 +1748,6 @@ def test_groupby_agg_ohlc_non_first(): def test_groupby_with_dst_time_change(): # GH 24972 - import pandas as pd index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], tz='UTC').tz_convert('America/Chicago') From b009f176f56e390b5b3dc459aac21eec91905bb2 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 09:38:43 +0200 Subject: [PATCH 06/14] change fix for values fall after last bin --- pandas/core/resample.py | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6a7cfa3cc14ad..27c718baf9d1d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1413,16 +1413,6 @@ def _get_time_bins(self, ax): ambiguous='infer', nonexistent='shift_forward') - # GH #24972 - # In edge case of tz-aware grouping binner last index can be - # less than the ax.max() variable in data object, this happens - # because of normalization and DST time change - if len(binner) > 1 and binner[-1] < ax.max(): - extra_date_range = pd.date_range(binner[-1], ax.max() + self.freq, - freq=self.freq, tz=binner[-1].tz, - name=ax.name) - binner = labels = binner.append(extra_date_range[1:]) - ax_values = ax.asi8 binner, bin_edges = self._adjust_bin_edges(binner, ax_values) @@ -1623,20 +1613,20 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0): A tuple of length 2, containing the adjusted pd.Timestamp objects. 
""" if isinstance(offset, Tick): - is_day = isinstance(offset, Day) - day_nanos = delta_to_nanoseconds(timedelta(1)) - - # #1165 and #24127 - if (is_day and not offset.nanos % day_nanos) or not is_day: - first, last = _adjust_dates_anchored(first, last, offset, - closed=closed, base=base) - if is_day and first.tz is not None: - # _adjust_dates_anchored assumes 'D' means 24H, but first/last - # might contain a DST transition (23H, 24H, or 25H). - # Ensure first/last snap to midnight. - first = first.normalize() - last = last.normalize() - return first, last + if isinstance(offset, Day): + # _adjust_dates_anchored assumes 'D' means 24H, but first/last + # might contain a DST transition (23H, 24H, or 25H). + # So "pretend" the dates are naive when adjusting the endpoints + tz = first.tz + first = first.tz_localize(None) + last = last.tz_localize(None) + + first, last = _adjust_dates_anchored(first, last, offset, + closed=closed, base=base) + if isinstance(offset, Day): + first = first.tz_localize(tz) + last = last.tz_localize(tz) + return first, last else: first = first.normalize() From 66861aba8d9281120906304f5a6b7fd0698252cf Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 13:36:40 +0200 Subject: [PATCH 07/14] fix imports --- pandas/core/resample.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 27c718baf9d1d..55840ec6a65f2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -30,8 +30,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.tseries.frequencies import to_offset -from pandas.tseries.offsets import ( - DateOffset, Day, Nano, Tick, delta_to_nanoseconds) +from pandas.tseries.offsets import (DateOffset, Day, Nano, Tick) _shared_docs_kwargs = dict() From dda1c5cabbf45a25a4a6fe8a9c6322cbace483ea Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 16:49:08 +0200 Subject: [PATCH 
08/14] fix imports --- pandas/core/resample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 55840ec6a65f2..7723827ff478a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -30,7 +30,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.tseries.frequencies import to_offset -from pandas.tseries.offsets import (DateOffset, Day, Nano, Tick) +from pandas.tseries.offsets import DateOffset, Day, Nano, Tick _shared_docs_kwargs = dict() From 3edb267885e715aaf0577e1499bba2f66fb15a9d Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 21:37:10 +0200 Subject: [PATCH 09/14] change the description --- doc/source/whatsnew/v0.24.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 6d2176dbe9bfc..e92a585abb839 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -66,7 +66,7 @@ Bug Fixes **Reshaping** - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) -Bug in ``DataFrame.groupby(pd.Grouper(freq='1d'))`` when there is a time change (DST) and grouping frequecy is 1d (:issue:`24972`) +Bug in :meth:`DataFrame.groupby`(:class:`pd.Grouper`(freq='1d')) when there is a time change (DST) and grouping frequecy is 1d (:issue:`24972`) **Other** - From c0bc3a56a6aeed411c656e031dfafe728dc450c7 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 21:39:29 +0200 Subject: [PATCH 10/14] move test from groupby to timegrouper --- pandas/tests/groupby/test_groupby.py | 15 --------------- pandas/tests/groupby/test_timegrouper.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 88ed77ca62d67..98c917a6eca3c 100644 --- 
a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1744,18 +1744,3 @@ def test_groupby_agg_ohlc_non_first(): result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) tm.assert_frame_equal(result, expected) - - -def test_groupby_with_dst_time_change(): - # GH 24972 - index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], - tz='UTC').tz_convert('America/Chicago') - - df = pd.DataFrame([1, 2], index=index) - result = df.groupby(pd.Grouper(freq='1d')).last() - expected_index_values = pd.date_range('2016-11-02', '2016-11-24', - freq='d', tz='America/Chicago') - - index = pd.DatetimeIndex(expected_index_values) - expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) - assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a2f2c1392b251..6756de594ac8d 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -650,3 +650,17 @@ def test_scalar_call_versus_list_call(self): expected = grouped.count() assert_frame_equal(result, expected) + + def test_groupby_with_dst_time_change(self): + # GH 24972 + index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], + tz='UTC').tz_convert('America/Chicago') + + df = pd.DataFrame([1, 2], index=index) + result = df.groupby(pd.Grouper(freq='1d')).last() + expected_index_values = pd.date_range('2016-11-02', '2016-11-24', + freq='d', tz='America/Chicago') + + index = pd.DatetimeIndex(expected_index_values) + expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) + assert_frame_equal(result, expected) From e631821d3ec339bb2430711454eda9a59c51585e Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Mon, 28 Jan 2019 23:20:59 +0200 Subject: [PATCH 11/14] fix typo --- doc/source/whatsnew/v0.24.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst 
b/doc/source/whatsnew/v0.24.1.rst index e92a585abb839..e944ffba71ce3 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -66,7 +66,7 @@ Bug Fixes **Reshaping** - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) -Bug in :meth:`DataFrame.groupby`(:class:`pd.Grouper`(freq='1d')) when there is a time change (DST) and grouping frequecy is 1d (:issue:`24972`) +Bug in :meth:`DataFrame.groupby`(:class:`pd.Grouper`(freq='1d')) when there is a time change (DST) and grouping frequency is 1d (:issue:`24972`) **Other** - From 5c07d25f6ffb0d766b9e5f21c5f8977a2898b0f9 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Tue, 29 Jan 2019 01:12:58 +0200 Subject: [PATCH 12/14] move tests --- pandas/tests/groupby/test_timegrouper.py | 14 -------------- pandas/tests/resample/test_datetime_index.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 6756de594ac8d..a2f2c1392b251 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -650,17 +650,3 @@ def test_scalar_call_versus_list_call(self): expected = grouped.count() assert_frame_equal(result, expected) - - def test_groupby_with_dst_time_change(self): - # GH 24972 - index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], - tz='UTC').tz_convert('America/Chicago') - - df = pd.DataFrame([1, 2], index=index) - result = df.groupby(pd.Grouper(freq='1d')).last() - expected_index_values = pd.date_range('2016-11-02', '2016-11-24', - freq='d', tz='America/Chicago') - - index = pd.DatetimeIndex(expected_index_values) - expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) - assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 
73995cbe79ecd..b743aeecdc756 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1276,6 +1276,21 @@ def test_resample_across_dst(): assert_frame_equal(result, expected) +def test_groupby_with_dst_time_change(): + # GH 24972 + index = pd.DatetimeIndex([1478064900001000000, 1480037118776792000], + tz='UTC').tz_convert('America/Chicago') + + df = pd.DataFrame([1, 2], index=index) + result = df.groupby(pd.Grouper(freq='1d')).last() + expected_index_values = pd.date_range('2016-11-02', '2016-11-24', + freq='d', tz='America/Chicago') + + index = pd.DatetimeIndex(expected_index_values) + expected = pd.DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index) + assert_frame_equal(result, expected) + + def test_resample_dst_anchor(): # 5172 dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern') From df7d6508ef210c753cc2cf1f8a2de6078b0a33a3 Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Tue, 29 Jan 2019 02:53:23 +0200 Subject: [PATCH 13/14] change the bug fix description --- doc/source/whatsnew/v0.24.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index e944ffba71ce3..7cfa61e391112 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -66,7 +66,7 @@ Bug Fixes **Reshaping** - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) -Bug in :meth:`DataFrame.groupby`(:class:`pd.Grouper`(freq='1d')) when there is a time change (DST) and grouping frequency is 1d (:issue:`24972`) +- Bug in :meth:`DataFrame.groupby` with :class:`Grouper` when there is a time change (DST) and grouping frequency is ``'1d'`` (:issue:`24972`) **Other** - From 47eb40b284bf6b3ffc97df01fca3f0f73d6170fc Mon Sep 17 00:00:00 2001 From: Alexander Buchkovsky Date: Tue, 29 Jan 2019 15:27:16 +0200 Subject: [PATCH 14/14] remove the record that is not 
on master anymore --- doc/source/whatsnew/v0.24.1.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index c27ed3606b78a..8f4c3982c745f 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -72,7 +72,6 @@ Bug Fixes **Reshaping** -- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - Bug in :meth:`DataFrame.groupby` with :class:`Grouper` when there is a time change (DST) and grouping frequency is ``'1d'`` (:issue:`24972`) **Visualization**