Skip to content

Commit

Permalink
ENH: can pass dict of column-specific NA sentinels, close #754
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Apr 10, 2012
1 parent 1ac953b commit d196363
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 20 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pandas 0.7.3
result of apply (GH #938)
- scatter_matrix method in pandas/tools/plotting.py (PR #935)
- Add ``kurt`` methods to Series and DataFrame (PR #964)
- Can pass dict of column -> list/set NA values for text parsers (GH #754)

**API Changes**

Expand Down
56 changes: 36 additions & 20 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@
given, a MultiIndex is used.
names : array-like
List of column names
na_values : list-like, default None
List of additional strings to recognize as NA/NaN
na_values : list-like or dict, default None
Additional strings to recognize as NA/NaN. If a dict is passed, it maps
each column to the NA values specific to that column
parse_dates : boolean, default False
Attempt to parse dates in the index column(s)
date_parser : function
Expand Down Expand Up @@ -293,6 +294,15 @@ def __init__(self, fh, delimiter=','):
class BufferedCSVReader(BufferedReader):
pass


# Default strings that the text parsers recognize as NA/NaN.
# Representations of infinity ('1.#INF', '-1.#INF', '1.#INF000000') are
# deliberately not listed here.
_NA_VALUES = set([
    '', 'NA', '#NA', '#N/A N/A', 'NULL', 'NaN', 'nan',
    '1.#IND', '-1.#IND', '1.#QNAN', '-1.#QNAN',
])


class TextParser(object):
"""
Converts lists of lists/tuples into DataFrames with proper type inference
Expand All @@ -309,7 +319,7 @@ class TextParser(object):
rows will be discarded
index_col : int or list, default None
Column or columns to use as the (possibly hierarchical) index
na_values : iterable, defualt None
na_values : iterable, default None
Custom NA values
parse_dates : boolean, default False
date_parser : function, default None
Expand All @@ -321,13 +331,6 @@ class TextParser(object):
Encoding to use for UTF when reading/writing (ex. 'utf-8')
"""

# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
'#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
'nan', ''])

def __init__(self, f, delimiter=None, names=None, header=0,
index_col=None, na_values=None, parse_dates=False,
date_parser=None, chunksize=None, skiprows=None,
Expand All @@ -350,10 +353,10 @@ def __init__(self, f, delimiter=None, names=None, header=0,
self.passed_names = names is not None
self.encoding = encoding


if com.is_integer(skiprows):
skiprows = range(skiprows)
self.skiprows = set() if skiprows is None else set(skiprows)

self.skip_footer = skip_footer
self.delimiter = delimiter
self.verbose = verbose
Expand All @@ -367,9 +370,11 @@ def __init__(self, f, delimiter=None, names=None, header=0,
assert(self.skip_footer >= 0)

if na_values is None:
self.na_values = self.NA_VALUES
self.na_values = _NA_VALUES
elif isinstance(na_values, dict):
self.na_values = na_values
else:
self.na_values = set(list(na_values)) | self.NA_VALUES
self.na_values = set(list(na_values)) | _NA_VALUES

if hasattr(f, 'readline'):
self._make_reader(f)
Expand Down Expand Up @@ -489,6 +494,8 @@ def __iter__(self):
except StopIteration:
pass

_implicit_index = False

def _get_index_name(self):
columns = self.columns

Expand All @@ -515,10 +522,8 @@ def _get_index_name(self):
self.buf = self.buf[1:]
return line

self.implicit_idx = False

if implicit_first_cols > 0:
self.implicit_idx = True
self._implicit_index = True
if self.index_col is None:
if implicit_first_cols == 1:
self.index_col = 0
Expand Down Expand Up @@ -576,7 +581,8 @@ def get_chunk(self, rows=None):
index = []
for idx in self.index_col:
index.append(zipped_content[idx])
# remove index items from content and columns, don't pop in loop
# remove index items from content and columns, don't pop in
# loop
for i in reversed(sorted(self.index_col)):
zipped_content.pop(i)

Expand All @@ -600,7 +606,7 @@ def get_chunk(self, rows=None):

if not index._verify_integrity():
dups = index.get_duplicates()
idx_str = 'Index' if not self.implicit_idx else 'Implicit index'
idx_str = 'Index' if not self._implicit_index else 'Implicit index'
err_msg = ('%s (columns %s) have duplicate values %s'
% (idx_str, self.index_col, str(dups)))
raise Exception(err_msg)
Expand Down Expand Up @@ -658,9 +664,19 @@ def _get_lines(self, rows=None):
return lines

def _convert_to_ndarrays(dct, na_values, verbose=False):
def _get_na_values(col):
    # Resolve the NA sentinels that apply to column `col`.
    if not isinstance(na_values, dict):
        # A single shared collection was given: every column uses it.
        return na_values
    if col not in na_values:
        # Dict given, but no entry for this column: use the defaults.
        return _NA_VALUES
    # Dict entry present: use exactly the values listed for this column.
    return set(na_values[col])

result = {}
for c, values in dct.iteritems():
cvals, na_count = _convert_types(values, na_values)
col_na_values = _get_na_values(c)
cvals, na_count = _convert_types(values, col_na_values)
result[c] = cvals
if verbose and na_count:
print 'Filled %d NA values in column %s' % (na_count, str(c))
Expand Down Expand Up @@ -784,7 +800,7 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
-------
parsed : DataFrame
"""
choose = {True:self._parse_xlsx,
choose = {True:self._parse_xlsx,
False:self._parse_xls}
return choose[self.use_xlsx](sheetname, header=header,
skiprows=skiprows, index_col=index_col,
Expand Down
14 changes: 14 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,20 @@ def test_fwf(self):
df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
assert_frame_equal(df, expected)

# Per-column NA sentinels passed as a dict: 'foo' is expected to become NaN
# only in column A, and 'bar' only in column B.
def test_na_value_dict(self):
data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""

# Column C has no entry in the dict; its 'NA' strings are still expected
# to be parsed as NaN, while its 'foo' strings are kept.
df = read_csv(StringIO(data),
na_values={'A': ['foo'], 'B': ['bar']})
expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
'B': [np.nan, 'foo', np.nan, 'foo'],
'C': [np.nan, 'foo', np.nan, 'foo']})
assert_frame_equal(df, expected)

@slow
def test_url(self):
# HTTP(S)
Expand Down

0 comments on commit d196363

Please sign in to comment.