ENH: can pass dict of column-specific NA sentinels, close #754

pandas-dev · Apr 10, 2012 · d196363 · d196363
1 parent 1ac953b
commit d196363
Show file tree

Hide file tree

Showing 3 changed files with 51 additions and 20 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -33,6 +33,7 @@ pandas 0.7.3
     result of apply (GH #938)
   - scatter_matrix method in pandas/tools/plotting.py (PR #935)
   - Add ``kurt`` methods to Series and DataFrame (PR #964)
+  - Can pass dict of column -> list/set NA values for text parsers (GH #754)
 
 **API Changes**
 

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -36,8 +36,9 @@
     given, a MultiIndex is used.
 names : array-like
     List of column names
-na_values : list-like, default None
-    List of additional strings to recognize as NA/NaN
+na_values : list-like or dict, default None
+    Additional strings to recognize as NA/NaN. If dict passed, specific
+    per-column NA values
 parse_dates : boolean, default False
     Attempt to parse dates in the index column(s)
 date_parser : function
@@ -293,6 +294,15 @@ def __init__(self, fh, delimiter=','):
 class BufferedCSVReader(BufferedReader):
     pass
 
+
+# common NA values
+# no longer excluding inf representations
+# '1.#INF','-1.#INF', '1.#INF000000',
+_NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
+                 '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
+                 'nan', ''])
+
+
 class TextParser(object):
     """
     Converts lists of lists/tuples into DataFrames with proper type inference
@@ -309,7 +319,7 @@ class TextParser(object):
         rows will be discarded
     index_col : int or list, default None
         Column or columns to use as the (possibly hierarchical) index
-    na_values : iterable, defualt None
+    na_values : iterable, default None
         Custom NA values
     parse_dates : boolean, default False
     date_parser : function, default None
@@ -321,13 +331,6 @@ class TextParser(object):
         Encoding to use for UTF when reading/writing (ex. 'utf-8')
     """
 
-    # common NA values
-    # no longer excluding inf representations
-    # '1.#INF','-1.#INF', '1.#INF000000',
-    NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
-                     '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
-                     'nan', ''])
-
     def __init__(self, f, delimiter=None, names=None, header=0,
                  index_col=None, na_values=None, parse_dates=False,
                  date_parser=None, chunksize=None, skiprows=None,
@@ -350,10 +353,10 @@ def __init__(self, f, delimiter=None, names=None, header=0,
         self.passed_names = names is not None
         self.encoding = encoding
 
-
         if com.is_integer(skiprows):
             skiprows = range(skiprows)
         self.skiprows = set() if skiprows is None else set(skiprows)
+
         self.skip_footer = skip_footer
         self.delimiter = delimiter
         self.verbose = verbose
@@ -367,9 +370,11 @@ def __init__(self, f, delimiter=None, names=None, header=0,
         assert(self.skip_footer >= 0)
 
         if na_values is None:
-            self.na_values = self.NA_VALUES
+            self.na_values = _NA_VALUES
+        elif isinstance(na_values, dict):
+            self.na_values = na_values
         else:
-            self.na_values = set(list(na_values)) | self.NA_VALUES
+            self.na_values = set(list(na_values)) | _NA_VALUES
 
         if hasattr(f, 'readline'):
             self._make_reader(f)
@@ -489,6 +494,8 @@ def __iter__(self):
         except StopIteration:
             pass
 
+    _implicit_index = False
+
     def _get_index_name(self):
         columns = self.columns
 
@@ -515,10 +522,8 @@ def _get_index_name(self):
                     self.buf = self.buf[1:]
                     return line
 
-        self.implicit_idx = False
-
         if implicit_first_cols > 0:
-            self.implicit_idx = True
+            self._implicit_index = True
             if self.index_col is None:
                 if implicit_first_cols == 1:
                     self.index_col = 0
@@ -576,7 +581,8 @@ def get_chunk(self, rows=None):
                 index = []
                 for idx in self.index_col:
                     index.append(zipped_content[idx])
-                # remove index items from content and columns, don't pop in loop
+                # remove index items from content and columns, don't pop in
+                # loop
                 for i in reversed(sorted(self.index_col)):
                     zipped_content.pop(i)
 
@@ -600,7 +606,7 @@ def get_chunk(self, rows=None):
 
         if not index._verify_integrity():
             dups = index.get_duplicates()
-            idx_str = 'Index' if not self.implicit_idx else 'Implicit index'
+            idx_str = 'Index' if not self._implicit_index else 'Implicit index'
             err_msg = ('%s (columns %s) have duplicate values %s'
                        % (idx_str, self.index_col, str(dups)))
             raise Exception(err_msg)
@@ -658,9 +664,19 @@ def _get_lines(self, rows=None):
         return lines
 
 def _convert_to_ndarrays(dct, na_values, verbose=False):
+    def _get_na_values(col):
+        if isinstance(na_values, dict):
+            if col in na_values:
+                return set(list(na_values[col]))
+            else:
+                return _NA_VALUES
+        else:
+            return na_values
+
     result = {}
     for c, values in dct.iteritems():
-        cvals, na_count = _convert_types(values, na_values)
+        col_na_values = _get_na_values(c)
+        cvals, na_count = _convert_types(values, col_na_values)
         result[c] = cvals
         if verbose and na_count:
             print 'Filled %d NA values in column %s' % (na_count, str(c))
@@ -784,7 +800,7 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
         -------
         parsed : DataFrame
         """
-        choose = {True:self._parse_xlsx, 
+        choose = {True:self._parse_xlsx,
                   False:self._parse_xls}
         return choose[self.use_xlsx](sheetname, header=header,
                                      skiprows=skiprows, index_col=index_col,

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -781,6 +781,20 @@ def test_fwf(self):
         df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None)
         assert_frame_equal(df, expected)
 
+    def test_na_value_dict(self):
+        data = """A,B,C
+foo,bar,NA
+bar,foo,foo
+foo,bar,NA
+bar,foo,foo"""
+
+        df = read_csv(StringIO(data),
+                      na_values={'A': ['foo'], 'B': ['bar']})
+        expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
+                              'B': [np.nan, 'foo', np.nan, 'foo'],
+                              'C': [np.nan, 'foo', np.nan, 'foo']})
+        assert_frame_equal(df, expected)
+
     @slow
     def test_url(self):
         # HTTP(S)