Skip to content

Commit

Permalink
BUG: Patch read_csv NA values behaviour
Browse files Browse the repository at this point in the history
Patches the following behaviour when `na_values` is passed in as a
dictionary:    1. Prevent aliasing in case `na_values` was defined in
a broader scope.  2. Respect column indices as keys when doing NA
conversions.    Closes #14203.

Author: gfyoung <gfyoung17@gmail.com>

Closes #14751 from gfyoung/csv-na-values-patching and squashes the following commits:

cac422c [gfyoung] BUG: Respect column indices for dict-like na_values
1439c27 [gfyoung] BUG: Prevent aliasing of dict na_values
  • Loading branch information
gfyoung authored and jreback committed Dec 16, 2016
1 parent 6f4e36a commit dd8cba2
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 14 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.19.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ Bug Fixes

- Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`)
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
- Bug in ``pd.read_csv`` in which aliasing was being done for ``na_values`` when passed in as a dictionary (:issue:`14203`)
- Bug in ``pd.read_csv`` in which column indices for a dict-like ``na_values`` were not being respected (:issue:`14203`)
- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally.
Expand Down
25 changes: 22 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2055,9 +2055,27 @@ def _clean_mapping(mapping):
else:
clean_dtypes = _clean_mapping(self.dtype)

return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
self.verbose, clean_conv,
clean_dtypes)
# Apply NA values.
clean_na_values = {}
clean_na_fvalues = {}

if isinstance(self.na_values, dict):
for col in self.na_values:
na_value = self.na_values[col]
na_fvalue = self.na_fvalues[col]

if isinstance(col, int) and col not in self.orig_names:
col = self.orig_names[col]

clean_na_values[col] = na_value
clean_na_fvalues[col] = na_fvalue
else:
clean_na_values = self.na_values
clean_na_fvalues = self.na_fvalues

return self._convert_to_ndarrays(data, clean_na_values,
clean_na_fvalues, self.verbose,
clean_conv, clean_dtypes)

def _to_recarray(self, data, columns):
dtypes = []
Expand Down Expand Up @@ -2767,6 +2785,7 @@ def _clean_na_values(na_values, keep_default_na=True):
na_values = []
na_fvalues = set()
elif isinstance(na_values, dict):
na_values = na_values.copy() # Prevent aliasing.
if keep_default_na:
for k, v in compat.iteritems(na_values):
if not is_list_like(v):
Expand Down
23 changes: 23 additions & 0 deletions pandas/io/tests/parser/na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,26 @@ def test_na_values_scalar(self):
out = self.read_csv(StringIO(data), names=names,
na_values={'a': 2, 'b': 1})
tm.assert_frame_equal(out, expected)

def test_na_values_dict_aliasing(self):
na_values = {'a': 2, 'b': 1}
na_values_copy = na_values.copy()

names = ['a', 'b']
data = '1,2\n2,1'

expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
out = self.read_csv(StringIO(data), names=names, na_values=na_values)

tm.assert_frame_equal(out, expected)
tm.assert_dict_equal(na_values, na_values_copy)

def test_na_values_dict_col_index(self):
# see gh-14203

data = 'a\nfoo\n1'
na_values = {0: 'foo'}

out = self.read_csv(StringIO(data), na_values=na_values)
expected = DataFrame({'a': [np.nan, 1]})
tm.assert_frame_equal(out, expected)
26 changes: 15 additions & 11 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1262,19 +1262,23 @@ cdef class TextReader:
return None, set()

if isinstance(self.na_values, dict):
key = None
values = None

if name is not None and name in self.na_values:
values = self.na_values[name]
if values is not None and not isinstance(values, list):
values = list(values)
fvalues = self.na_fvalues[name]
if fvalues is not None and not isinstance(fvalues, set):
fvalues = set(fvalues)
else:
if i in self.na_values:
return self.na_values[i], self.na_fvalues[i]
else:
return _NA_VALUES, set()
key = name
elif i in self.na_values:
key = i
else: # No na_values provided for this column.
return _NA_VALUES, set()

values = self.na_values[key]
if values is not None and not isinstance(values, list):
values = list(values)

fvalues = self.na_fvalues[key]
if fvalues is not None and not isinstance(fvalues, set):
fvalues = set(fvalues)

return _ensure_encoded(values), fvalues
else:
Expand Down

0 comments on commit dd8cba2

Please sign in to comment.