From ec2020735d72ff73e0a6a607689281aad173c702 Mon Sep 17 00:00:00 2001 From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com> Date: Wed, 20 Jun 2018 16:03:07 +0530 Subject: [PATCH] REGR: Fixes first_valid_index when DataFrame or Series has duplicate row index (GH21441) (#21497) --- doc/source/whatsnew/v0.23.2.txt | 3 ++- pandas/core/generic.py | 23 +++++++++++------------ pandas/tests/frame/test_timeseries.py | 15 ++++++++++++++- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 5454dc9eca360..5b3e607956f7a 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -17,7 +17,8 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) -- +- Fixed regression in :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index`, which raised when the row index had duplicate values (:issue:`21441`) +- .. 
_whatsnew_0232.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 555108a5d9349..1780e359164e2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8968,18 +8968,17 @@ def _find_valid_index(self, how): is_valid = is_valid.any(1) # reduce axis 1 if how == 'first': - # First valid value case - i = is_valid.idxmax() - if not is_valid[i]: - return None - return i - - elif how == 'last': - # Last valid value case - i = is_valid.values[::-1].argmax() - if not is_valid.iat[len(self) - i - 1]: - return None - return self.index[len(self) - i - 1] + idxpos = is_valid.values[::].argmax() + + if how == 'last': + idxpos = len(self) - 1 - is_valid.values[::-1].argmax() + + chk_notna = is_valid.iat[idxpos] + idx = self.index[idxpos] + + if not chk_notna: + return None + return idx @Appender(_shared_docs['valid_index'] % {'position': 'first', 'klass': 'NDFrame'}) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 90fbc6e628369..fb9bd74d9876d 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -506,7 +506,15 @@ def test_asfreq_fillvalue(self): actual_series = ts.asfreq(freq='1S', fill_value=9.0) assert_series_equal(expected_series, actual_series) - def test_first_last_valid(self): + @pytest.mark.parametrize("data,idx,expected_first,expected_last", [ + ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2), + ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2), + ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'), + ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)]) + def test_first_last_valid(self, data, idx, + expected_first, expected_last): N = len(self.frame.index) mat = randn(N) mat[:5] = nan @@ -539,6 +547,11 @@ def test_first_last_valid(self): assert frame.first_valid_index().freq == frame.index.freq assert frame.last_valid_index().freq == frame.index.freq + # GH 21441 + df = 
DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + def test_first_subset(self): ts = tm.makeTimeDataFrame(freq='12h') result = ts.first('10d')