Skip to content

Commit

Permalink
BUG: Patch missing data handling with usecols (pandas-dev#15066)
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung authored and AnkurDedania committed Mar 21, 2017
1 parent 6b049ba commit c014209
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 3 deletions.
13 changes: 13 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1215,6 +1215,19 @@ You can elect to skip bad lines:
0 1 2 3
1 8 9 10
You can also use the ``usecols`` parameter to eliminate extraneous column
data that appear in some lines but not others:

.. code-block:: ipython

   In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])

   Out[30]:
      a  b   c
   0  1  2   3
   1  4  5   6
   2  8  9  10

.. _io.quoting:

Quoting and Escape Characters
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ Bug Fixes
- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly cast (:issue:`14941`, :issue:`15005`)
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
- Bug in ``pd.read_fwf()`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)

- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

Expand Down
5 changes: 3 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2295,11 +2295,12 @@ def _infer_columns(self):
columns = [lrange(ncols)]
columns = self._handle_usecols(columns, columns[0])
else:
if self.usecols is None or len(names) == num_original_columns:
if self.usecols is None or len(names) >= num_original_columns:
columns = self._handle_usecols([names], names)
num_original_columns = len(names)
else:
if self.usecols and len(names) != len(self.usecols):
if (not callable(self.usecols) and
len(names) != len(self.usecols)):
raise ValueError(
'Number of passed names did not match number of '
'header fields in the file'
Expand Down
25 changes: 25 additions & 0 deletions pandas/io/tests/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,3 +440,28 @@ def test_callable_usecols(self):
expected = DataFrame()
df = self.read_csv(StringIO(s), usecols=lambda x: False)
tm.assert_frame_equal(df, expected)

def test_incomplete_first_row(self):
# The first data row has only two fields while three `names` are passed.
# Reading columns 'a' and 'c' is expected to yield NaN for 'c' in that
# short row and the real value (3) in the complete second row.
# see gh-6710
data = '1,2\n1,2,3'
names = ['a', 'b', 'c']
expected = DataFrame({'a': [1, 1],
'c': [np.nan, 3]})

# Case 1: `usecols` given as a list of column names.
usecols = ['a', 'c']
df = self.read_csv(StringIO(data), names=names, usecols=usecols)
tm.assert_frame_equal(df, expected)

# Case 2: `usecols` given as a callable selecting the same two names;
# the result must match the list-based case.
usecols = lambda x: x in ['a', 'c']
df = self.read_csv(StringIO(data), names=names, usecols=usecols)
tm.assert_frame_equal(df, expected)

def test_uneven_length_cols(self):
# Input has two 3-field rows followed by one 4-field row. With
# header=None and usecols=[0, 1, 2], the expected frame holds only the
# first three fields of every row, so the trailing 40 is not included.
# see gh-8985
usecols = [0, 1, 2]
data = '19,29,39\n' * 2 + '10,20,30,40'
expected = DataFrame([[19, 29, 39],
[19, 29, 39],
[10, 20, 30]])
df = self.read_csv(StringIO(data), header=None, usecols=usecols)
tm.assert_frame_equal(df, expected)
3 changes: 2 additions & 1 deletion pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1317,7 +1317,8 @@ cdef class TextReader:

cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
if self.has_usecols and self.names is not None:
if len(self.names) == len(self.usecols):
if (not callable(self.usecols) and
len(self.names) == len(self.usecols)):
return self.names[nused]
else:
return self.names[i - self.leading_cols]
Expand Down

0 comments on commit c014209

Please sign in to comment.