BUG: Patch missing data handling with usecols (pandas-dev#15066)

Closes gh-6710. Closes gh-8985.
AnkurDedania · Mar 21, 2017 · c014209 · c014209
1 parent 6b049ba
commit c014209
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 3 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1215,6 +1215,19 @@ You can elect to skip bad lines:
     0  1  2   3
     1  8  9  10
 
+You can also use the ``usecols`` parameter to eliminate extraneous column
+data that appear in some lines but not others:
+
+.. code-block:: ipython
+
+    In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
+
+    Out[30]:
+       a  b   c
+    0  1  2   3
+    1  4  5   6
+    2  8  9  10
+
 .. _io.quoting:
 
 Quoting and Escape Characters

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -306,6 +306,7 @@ Bug Fixes
 - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`)
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
+- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`, :issue:`8985`)
 
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
 

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2295,11 +2295,12 @@ def _infer_columns(self):
                     columns = [lrange(ncols)]
                 columns = self._handle_usecols(columns, columns[0])
             else:
-                if self.usecols is None or len(names) == num_original_columns:
+                if self.usecols is None or len(names) >= num_original_columns:
                     columns = self._handle_usecols([names], names)
                     num_original_columns = len(names)
                 else:
-                    if self.usecols and len(names) != len(self.usecols):
+                    if (not callable(self.usecols) and
+                            len(names) != len(self.usecols)):
                         raise ValueError(
                             'Number of passed names did not match number of '
                             'header fields in the file'

diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py
@@ -440,3 +440,28 @@ def test_callable_usecols(self):
         expected = DataFrame()
         df = self.read_csv(StringIO(s), usecols=lambda x: False)
         tm.assert_frame_equal(df, expected)
+
+    def test_incomplete_first_row(self):
+        # see gh-6710
+        data = '1,2\n1,2,3'
+        names = ['a', 'b', 'c']
+        expected = DataFrame({'a': [1, 1],
+                              'c': [np.nan, 3]})
+
+        usecols = ['a', 'c']
+        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+        usecols = lambda x: x in ['a', 'c']
+        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
+        tm.assert_frame_equal(df, expected)
+
+    def test_uneven_length_cols(self):
+        # see gh-8985
+        usecols = [0, 1, 2]
+        data = '19,29,39\n' * 2 + '10,20,30,40'
+        expected = DataFrame([[19, 29, 39],
+                              [19, 29, 39],
+                              [10, 20, 30]])
+        df = self.read_csv(StringIO(data), header=None, usecols=usecols)
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -1317,7 +1317,8 @@ cdef class TextReader:
 
     cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
         if self.has_usecols and self.names is not None:
-            if len(self.names) == len(self.usecols):
+            if (not callable(self.usecols) and
+                    len(self.names) == len(self.usecols)):
                 return self.names[nused]
             else:
                 return self.names[i - self.leading_cols]