REGR: Fixes first_valid_index when DataFrame or Series has duplicate row index (GH21441) (#21497) (cherry picked from commit ec20207)
pandas-dev · Jul 2, 2018 · d44fddb · d44fddb
1 parent 030a058
commit d44fddb
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 14 deletions.
diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
@@ -17,7 +17,8 @@ Fixed Regressions
 ~~~~~~~~~~~~~~~~~
 
 - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
--
+- Fixed regression in :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raising for a row index having duplicate values (:issue:`21441`)
+- 
 
 .. _whatsnew_0232.performance:
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -8969,18 +8969,17 @@ def _find_valid_index(self, how):
             is_valid = is_valid.any(1)  # reduce axis 1
 
         if how == 'first':
-            # First valid value case
-            i = is_valid.idxmax()
-            if not is_valid[i]:
-                return None
-            return i
-
-        elif how == 'last':
-            # Last valid value case
-            i = is_valid.values[::-1].argmax()
-            if not is_valid.iat[len(self) - i - 1]:
-                return None
-            return self.index[len(self) - i - 1]
+            idxpos = is_valid.values[::].argmax()
+
+        if how == 'last':
+            idxpos = len(self) - 1 - is_valid.values[::-1].argmax()
+
+        chk_notna = is_valid.iat[idxpos]
+        idx = self.index[idxpos]
+
+        if not chk_notna:
+            return None
+        return idx
 
     @Appender(_shared_docs['valid_index'] % {'position': 'first',
                                              'klass': 'NDFrame'})

diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
@@ -506,7 +506,15 @@ def test_asfreq_fillvalue(self):
         actual_series = ts.asfreq(freq='1S', fill_value=9.0)
         assert_series_equal(expected_series, actual_series)
 
-    def test_first_last_valid(self):
+    @pytest.mark.parametrize("data,idx,expected_first,expected_last", [
+        ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2),
+        ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2),
+        ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'),
+        ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2),
+        ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
+        ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)])
+    def test_first_last_valid(self, data, idx,
+                              expected_first, expected_last):
         N = len(self.frame.index)
         mat = randn(N)
         mat[:5] = nan
@@ -539,6 +547,11 @@ def test_first_last_valid(self):
         assert frame.first_valid_index().freq == frame.index.freq
         assert frame.last_valid_index().freq == frame.index.freq
 
+        # GH 21441
+        df = DataFrame(data, index=idx)
+        assert expected_first == df.first_valid_index()
+        assert expected_last == df.last_valid_index()
+
     def test_first_subset(self):
         ts = tm.makeTimeDataFrame(freq='12h')
         result = ts.first('10d')