
Commit

Merge pull request #3575 from jreback/mi_csv
ENH: allow to_csv to write multi-index columns, read_csv to read with header=list arg
jreback committed May 19, 2013
2 parents 8eaf19a + faf4d53 commit 860b05d
Showing 12 changed files with 609 additions and 151 deletions.
15 changes: 15 additions & 0 deletions RELEASE.rst
@@ -34,6 +34,15 @@ pandas 0.11.1
courtesy of @cpcloud. (GH3477_)
- Support for reading Amazon S3 files. (GH3504_)
- Added module for reading and writing Stata files: pandas.io.stata (GH1512_)
- Added support for writing multi-index columns in ``to_csv``, and reading
them back in ``read_csv``. The ``header`` option in ``read_csv`` now accepts
a list of the rows from which to read the column index. Added the option
``tupleize_cols`` to provide compatibility with the pre-0.11.1 behavior of
writing and reading multi-index columns via a list of tuples. The default in
0.11.1 is to write lists of tuples and *not* interpret a list of tuples as a
multi-index column.
Note: The default value will change in 0.12 so that multi-index columns are
written and read in the new format by default. (GH3571_, GH1651_, GH3141_)

**Improvements to existing features**

@@ -180,13 +189,19 @@ pandas 0.11.1
.. _GH3596: https://github.com/pydata/pandas/issues/3596
.. _GH3617: https://github.com/pydata/pandas/issues/3617
.. _GH3435: https://github.com/pydata/pandas/issues/3435
<<<<<<< HEAD

cpcloud (Member) commented on May 19, 2013:

    @jreback u have some conflict markers in ur last commit

jreback (Author, Contributor) replied on May 19, 2013:

    thanks....

.. _GH3611: https://github.com/pydata/pandas/issues/3611
.. _GH3062: https://github.com/pydata/pandas/issues/3062
.. _GH3624: https://github.com/pydata/pandas/issues/3624
.. _GH3626: https://github.com/pydata/pandas/issues/3626
.. _GH3601: https://github.com/pydata/pandas/issues/3601
.. _GH3631: https://github.com/pydata/pandas/issues/3631
.. _GH1512: https://github.com/pydata/pandas/issues/1512
=======
.. _GH3571: https://github.com/pydata/pandas/issues/3571
.. _GH1651: https://github.com/pydata/pandas/issues/1651
.. _GH3141: https://github.com/pydata/pandas/issues/3141
>>>>>>> DOC: updated releasenotes, v0.11.1 whatsnew, io.rst


pandas 0.11.0
44 changes: 43 additions & 1 deletion doc/source/io.rst
@@ -57,7 +57,10 @@ They can take a number of arguments:
specified, data types will be inferred.
- ``header``: row number to use as the column names, and the start of the
data. Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly
pass ``header=0`` to be able to replace existing names. The header can be
a list of integers that specify row locations for a multi-index on the columns,
e.g. ``[0,1,3]``. Intervening rows that are not specified will be skipped
(e.g. row 2 in this example is skipped).
- ``skiprows``: A collection of numbers for rows in the file to skip. Can
also be an integer to skip the first ``n`` rows
- ``index_col``: column number, column name, or list of column numbers/names,
@@ -112,6 +115,10 @@ They can take a number of arguments:
- ``error_bad_lines``: if False then any lines causing an error will be skipped :ref:`bad lines <io.bad_lines>`
- ``usecols``: a subset of columns to return, results in much faster parsing
time and lower memory usage.
- ``mangle_dupe_cols``: boolean, default True. Duplicate columns will be renamed
'X.0'...'X.N' rather than 'X'...'X'
- ``tupleize_cols``: boolean, default True. If False, convert a list of tuples
in the header to a multi-index on the columns; otherwise, leave the column
index as a list of tuples
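As a sketch of the ``header``-as-list behavior described above (using ``io.StringIO`` in place of a file on disk; the data and names are illustrative, not from the commit):

```python
from io import StringIO

import pandas as pd

# Two header rows produce a two-level MultiIndex on the columns.
data = """A,A,B
x,y,z
1,2,3
4,5,6"""

df = pd.read_csv(StringIO(data), header=[0, 1])
print(df.columns.tolist())  # [('A', 'x'), ('A', 'y'), ('B', 'z')]
```

Each entry in the ``header`` list contributes one level to the resulting column ``MultiIndex``, in the order given.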

.. ipython:: python
   :suppress:
@@ -762,6 +769,36 @@ column numbers to turn multiple columns into a ``MultiIndex``:
df
df.ix[1978]
.. _io.multi_index_columns:

Specifying multi-index columns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

By specifying a list of row locations for the ``header`` argument, you
can read in a multi-index for the columns. Specifying non-consecutive
rows will skip the intervening rows.

.. ipython:: python

   from pandas.util.testing import makeCustomDataframe as mkdf
   df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
   df.to_csv('mi.csv', tupleize_cols=False)
   print open('mi.csv').read()
   pd.read_csv('mi.csv', header=[0,1,2,3], index_col=[0,1], tupleize_cols=False)
Note: The default behavior in 0.11.1 remains unchanged (``tupleize_cols=True``),
but starting with 0.12, the default for both writing and reading multi-index
columns will be the new format (``tupleize_cols=False``).

Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will be *lost*.

.. ipython:: python
   :suppress:

   import os
   os.remove('mi.csv')
.. _io.sniff:

Automatically "sniffing" the delimiter
@@ -845,6 +882,8 @@ function takes a number of arguments. Only the first is required.
- ``sep`` : Field delimiter for the output file (default ",")
- ``encoding``: a string representing the encoding to use if the contents are
non-ascii, for python versions prior to 3
- ``tupleize_cols``: boolean, default True. If True, write multi-index columns
as a list of tuples; if False, write in an expanded line format suitable for
``read_csv``
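A round-trip sketch of the expanded line format, written with a pandas version where the expanded layout is the default for multi-index columns (in later releases ``tupleize_cols`` was removed entirely; the frame below is illustrative):

```python
from io import StringIO

import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]],
                  index=["r1", "r2"],
                  columns=pd.MultiIndex.from_tuples([("A", "x"), ("A", "y")]))

# to_csv emits one header line per column level, with blanks padding
# the index column.
text = df.to_csv()
print(text)

# Read it back: one header row per level, the index via index_col.
roundtrip = pd.read_csv(StringIO(text), header=[0, 1], index_col=0)
print(roundtrip.columns.tolist())  # [('A', 'x'), ('A', 'y')]
```

The ``header=[0, 1]``/``index_col=0`` pair on the read side mirrors the two column levels and single index column written out.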

Writing a formatted string
~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -876,6 +915,9 @@ The Series object also has a ``to_string`` method, but with only the ``buf``,
which, if set to ``True``, will additionally output the length of the Series.


HTML
----

Reading HTML format
~~~~~~~~~~~~~~~~~~~~~~

37 changes: 37 additions & 0 deletions doc/source/v0.11.1.txt
@@ -73,13 +73,47 @@ Enhancements
an index with a different frequency than the existing, or attempting
to append an index with a different name than the existing
- support datelike columns with a timezone as data_columns (GH2852_)

- ``fillna`` methods now raise a ``TypeError`` if the ``value`` parameter is
a list or tuple.
- Added module for reading and writing Stata files: pandas.io.stata (GH1512_)
- ``DataFrame.replace()`` now allows regular expressions on contained
``Series`` with object dtype. See the examples section in the regular docs
:ref:`Replacing via String Expression <missing_data.replace_expression>`

- Multi-index column support for reading and writing csvs

- The ``header`` option in ``read_csv`` now accepts a
list of the rows from which to read the column index.

- The option ``tupleize_cols`` can now be specified in both ``to_csv`` and
``read_csv``, to provide compatibility with the pre-0.11.1 behavior of
writing and reading multi-index columns via a list of tuples. The default in
0.11.1 is to write lists of tuples and *not* interpret a list of tuples as a
multi-index column.

Note: The default behavior in 0.11.1 remains unchanged, but starting with 0.12,
the default for both writing and reading multi-index columns will be the new
format. (GH3571_, GH1651_, GH3141_)

- If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will
be *lost*.

.. ipython:: python

   from pandas.util.testing import makeCustomDataframe as mkdf
   df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
   df.to_csv('mi.csv', tupleize_cols=False)
   print open('mi.csv').read()
   pd.read_csv('mi.csv', header=[0,1,2,3], index_col=[0,1], tupleize_cols=False)

.. ipython:: python
   :suppress:

   import os
   os.remove('mi.csv')

See the `full release notes
<https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker
on GitHub for a complete list.
@@ -96,3 +130,6 @@ on GitHub for a complete list.
.. _GH1512: https://github.com/pydata/pandas/issues/1512
.. _GH2285: https://github.com/pydata/pandas/issues/2285
.. _GH3631: https://github.com/pydata/pandas/issues/3631
.. _GH3571: https://github.com/pydata/pandas/issues/3571
.. _GH1651: https://github.com/pydata/pandas/issues/1651
.. _GH3141: https://github.com/pydata/pandas/issues/3141
118 changes: 81 additions & 37 deletions pandas/core/format.py
@@ -772,9 +772,10 @@ def grouper(x):
class CSVFormatter(object):

def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
cols=None, header=True, index=True, index_label=None,
mode='w', nanRep=None, encoding=None, quoting=None,
line_terminator='\n', chunksize=None, engine=None,
tupleize_cols=True):

self.engine = engine # remove for 0.12

@@ -803,6 +804,15 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None,
msg= "columns.is_unique == False not supported with engine='python'"
raise NotImplementedError(msg)

self.tupleize_cols = tupleize_cols
self.has_mi_columns = (isinstance(obj.columns, MultiIndex)
                       and not self.tupleize_cols)

# validate mi options
if self.has_mi_columns:
if cols is not None:
raise Exception("cannot specify cols with a multi_index on the columns")

if cols is not None:
if isinstance(cols,Index):
cols = cols.to_native_types(na_rep=na_rep,float_format=float_format)
@@ -958,48 +968,82 @@ def _save_header(self):
obj = self.obj
index_label = self.index_label
cols = self.cols
has_mi_columns = self.has_mi_columns
header = self.header
encoded_labels = []

has_aliases = isinstance(header, (tuple, list, np.ndarray))
if not (has_aliases or self.header):
    return

if self.index:
    # should write something for index label
    if index_label is not False:
        if index_label is None:
            if isinstance(obj.index, MultiIndex):
                index_label = []
                for i, name in enumerate(obj.index.names):
                    if name is None:
                        name = ''
                    index_label.append(name)
            else:
                index_label = obj.index.name
                if index_label is None:
                    index_label = ['']
                else:
                    index_label = [index_label]
        elif not isinstance(index_label, (list, tuple, np.ndarray)):
            # given a string for a DF with Index
            index_label = [index_label]

        encoded_labels = list(index_label)
else:
    encoded_labels = []

if has_aliases:
    if len(header) != len(cols):
        raise ValueError('Writing %d cols but got %d aliases'
                         % (len(cols), len(header)))
    else:
        write_cols = header
else:
    write_cols = cols

if not has_mi_columns:
    encoded_labels += list(write_cols)

# write out the multi-index columns
if has_mi_columns:
    columns = obj.columns

    # write out the names for each level, then ALL of the
    # values for each level
    for i in range(columns.nlevels):

        # we need at least 1 index column to write our col names
        col_line = []
        if self.index:

            # name is the first column
            col_line.append(columns.names[i])

            if isinstance(index_label, list) and len(index_label) > 1:
                col_line.extend([''] * (len(index_label) - 1))

        col_line.extend(columns.get_level_values(i))

        writer.writerow(col_line)

    # add blanks for the index columns, so that we
    # have consistent separators on every line
    encoded_labels.extend([''] * len(columns))

# write out the index label line
writer.writerow(encoded_labels)

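The header-writing logic in `_save_header` emits one CSV line per column level, padded over the index column(s), followed by an index-label line. A standalone sketch of that layout using only the stdlib `csv` module (the level names `lvl0`/`lvl1` and index name `idx` are illustrative, not from the commit):

```python
import csv
import io

# A 2-level column MultiIndex over three data columns, plus one
# named index column.
col_tuples = [("A", "x"), ("A", "y"), ("B", "z")]
level_names = ["lvl0", "lvl1"]
index_name = "idx"

buf = io.StringIO()
writer = csv.writer(buf, lineterminator="\n")

# One row per column level: the level name sits over the index
# column, followed by that level's value for every data column.
for i, name in enumerate(level_names):
    writer.writerow([name] + [t[i] for t in col_tuples])

# The index-label line: the index name, then blanks under the data
# columns so every line has the same number of separators.
writer.writerow([index_name] + [""] * len(col_tuples))

print(buf.getvalue())
# lvl0,A,A,B
# lvl1,x,y,z
# idx,,,
```

The trailing blanks on the label line are what the `encoded_labels.extend([''] * len(columns))` step above produces, keeping the separator count uniform for `read_csv`.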
def _save(self):

16 changes: 12 additions & 4 deletions pandas/core/frame.py
@@ -1250,7 +1250,7 @@ def _from_arrays(cls, arrays, columns, index, dtype=None):

@classmethod
def from_csv(cls, path, header=0, sep=',', index_col=0,
parse_dates=True, encoding=None):
parse_dates=True, encoding=None, tupleize_cols=False):
"""
Read delimited file into DataFrame
@@ -1266,6 +1266,9 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
is used. Different default from read_table
parse_dates : boolean, default True
Parse dates. Different default from read_table
tupleize_cols : boolean, default False
    If True, leave a multi-index on the columns as a list of tuples;
    if False, read it in the new (expanded) format
Notes
-----
@@ -1280,7 +1283,7 @@ def from_csv(cls, path, header=0, sep=',', index_col=0,
from pandas.io.parsers import read_table
return read_table(path, header=header, sep=sep,
                  parse_dates=parse_dates, index_col=index_col,
                  encoding=encoding, tupleize_cols=tupleize_cols)

@classmethod
def from_dta(dta, path, parse_dates=True, convert_categoricals=True, encoding=None, index_col=None):
@@ -1391,7 +1394,8 @@ def to_panel(self):
def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
           cols=None, header=True, index=True, index_label=None,
           mode='w', nanRep=None, encoding=None, quoting=None,
           line_terminator='\n', chunksize=None,
           tupleize_cols=True, **kwds):
"""
Write DataFrame to a comma-separated values (csv) file
@@ -1429,6 +1433,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
quoting : optional constant from csv module
defaults to csv.QUOTE_MINIMAL
chunksize : rows to write at a time
tupleize_cols : boolean, default True
    If True, write multi-index columns as a list of tuples;
    if False, write in the new (expanded) format
"""
if nanRep is not None: # pragma: no cover
import warnings
@@ -1445,7 +1452,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
float_format=float_format, cols=cols,
header=header, index=index,
index_label=index_label,mode=mode,
chunksize=chunksize, engine=kwds.get("engine"),
tupleize_cols=tupleize_cols)
formatter.save()

def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
