diff --git a/RELEASE.rst b/RELEASE.rst
index 933851404c109..7ca861a5c8d54 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -33,6 +33,7 @@ pandas 0.7.3
     result of apply (GH #938)
   - scatter_matrix method in pandas/tools/plotting.py (PR #935)
   - Add ``kurt`` methods to Series and DataFrame (PR #964)
+  - Can pass dict of column -> list/set NA values for text parsers (GH #754)
 
 **API Changes**
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index fce10d7bb3635..e15f38db5e153 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -36,8 +36,9 @@
     given, a MultiIndex is used.
 names : array-like
     List of column names
-na_values : list-like, default None
-    List of additional strings to recognize as NA/NaN
+na_values : list-like or dict, default None
+    Additional strings to recognize as NA/NaN. If dict passed, specific
+    per-column NA values
 parse_dates : boolean, default False
     Attempt to parse dates in the index column(s)
 date_parser : function
@@ -293,6 +294,15 @@ def __init__(self, fh, delimiter=','):
 class BufferedCSVReader(BufferedReader):
     pass
 
+
+# common NA values
+# no longer excluding inf representations
+# '1.#INF','-1.#INF', '1.#INF000000',
+_NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
+                  '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
+                  'nan', ''])
+
+
 class TextParser(object):
     """
     Converts lists of lists/tuples into DataFrames with proper type inference
@@ -309,7 +319,7 @@ class TextParser(object):
         rows will be discarded
     index_col : int or list, default None
         Column or columns to use as the (possibly hierarchical) index
-    na_values : iterable, defualt None
+    na_values : iterable, default None
         Custom NA values
     parse_dates : boolean, default False
     date_parser : function, default None
@@ -321,13 +331,6 @@ class TextParser(object):
         Encoding to use for UTF when reading/writing (ex. 'utf-8')
     """
 
-    # common NA values
-    # no longer excluding inf representations
-    # '1.#INF','-1.#INF', '1.#INF000000',
-    NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
-                     '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
-                     'nan', ''])
-
     def __init__(self, f, delimiter=None, names=None, header=0,
                  index_col=None, na_values=None, parse_dates=False,
                  date_parser=None, chunksize=None, skiprows=None,
@@ -350,10 +353,10 @@ def __init__(self, f, delimiter=None, names=None, header=0,
         self.passed_names = names is not None
         self.encoding = encoding
 
-
         if com.is_integer(skiprows):
             skiprows = range(skiprows)
         self.skiprows = set() if skiprows is None else set(skiprows)
+
         self.skip_footer = skip_footer
         self.delimiter = delimiter
         self.verbose = verbose
@@ -367,9 +370,11 @@ def __init__(self, f, delimiter=None, names=None, header=0,
             assert(self.skip_footer >= 0)
 
         if na_values is None:
-            self.na_values = self.NA_VALUES
+            self.na_values = _NA_VALUES
+        elif isinstance(na_values, dict):
+            self.na_values = na_values
         else:
-            self.na_values = set(list(na_values)) | self.NA_VALUES
+            self.na_values = set(list(na_values)) | _NA_VALUES
 
         if hasattr(f, 'readline'):
             self._make_reader(f)
@@ -489,6 +494,8 @@ def __iter__(self):
         except StopIteration:
             pass
 
+    _implicit_index = False
+
     def _get_index_name(self):
         columns = self.columns
 
@@ -515,10 +522,8 @@ def _get_index_name(self):
                 self.buf = self.buf[1:]
             return line
 
-        self.implicit_idx = False
-
         if implicit_first_cols > 0:
-            self.implicit_idx = True
+            self._implicit_index = True
             if self.index_col is None:
                 if implicit_first_cols == 1:
                     self.index_col = 0
@@ -576,7 +581,8 @@ def get_chunk(self, rows=None):
             index = []
             for idx in self.index_col:
                 index.append(zipped_content[idx])
-            # remove index items from content and columns, don't pop in loop
+            # remove index items from content and columns, don't pop in
+            # loop
             for i in reversed(sorted(self.index_col)):
                 zipped_content.pop(i)
 
@@ -600,7 +606,7 @@
 
             if not index._verify_integrity():
                 dups = index.get_duplicates()
-                idx_str = 'Index' if not self.implicit_idx else 'Implicit index'
+                idx_str = 'Index' if not self._implicit_index else 'Implicit index'
                 err_msg = ('%s (columns %s) have duplicate values %s'
                            % (idx_str, self.index_col, str(dups)))
                 raise Exception(err_msg)
@@ -658,9 +664,19 @@ def _get_lines(self, rows=None):
         return lines
 
 def _convert_to_ndarrays(dct, na_values, verbose=False):
+    def _get_na_values(col):
+        if isinstance(na_values, dict):
+            if col in na_values:
+                return set(list(na_values[col]))
+            else:
+                return _NA_VALUES
+        else:
+            return na_values
+
     result = {}
     for c, values in dct.iteritems():
-        cvals, na_count = _convert_types(values, na_values)
+        col_na_values = _get_na_values(c)
+        cvals, na_count = _convert_types(values, col_na_values)
         result[c] = cvals
         if verbose and na_count:
             print 'Filled %d NA values in column %s' % (na_count, str(c))
@@ -784,7 +800,7 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
         -------
         parsed : DataFrame
         """
-        choose = {True:self._parse_xlsx, 
+        choose = {True:self._parse_xlsx,
                   False:self._parse_xls}
         return choose[self.use_xlsx](sheetname, header=header,
                                      skiprows=skiprows, index_col=index_col,
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 0209268796963..ebf9598c6ec06 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -781,6 +781,20 @@ def test_fwf(self):
         df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
         assert_frame_equal(df, expected)
 
+    def test_na_value_dict(self):
+        data = """A,B,C
+foo,bar,NA
+bar,foo,foo
+foo,bar,NA
+bar,foo,foo"""
+
+        df = read_csv(StringIO(data),
+                      na_values={'A': ['foo'], 'B': ['bar']})
+        expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
+                              'B': [np.nan, 'foo', np.nan, 'foo'],
+                              'C': [np.nan, 'foo', np.nan, 'foo']})
+        assert_frame_equal(df, expected)
+
     @slow
     def test_url(self):
         # HTTP(S)
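
A minimal usage sketch of the per-column NA handling introduced above, mirroring test_na_value_dict (the inline CSV data is illustrative only; assumes the Python 2 era API shown in this diff):

from StringIO import StringIO

from pandas import read_csv

data = """A,B,C
foo,bar,NA
bar,foo,foo"""

# With a dict, each key names a column and its values are the strings
# treated as NA for that column only; columns not listed in the dict
# fall back to the default NA set (_NA_VALUES), so the literal 'NA'
# in column C still parses as NaN.
df = read_csv(StringIO(data), na_values={'A': ['foo'], 'B': ['bar']})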