ENH: Enable ExcelWriter to construct in-memory sheets

Add support for StringIO/BytesIO to ExcelWriter. Add vbench support for writing excel files. Add support for serializing lists/dicts to strings. Fix bug when reading blank excel sheets. Added xlwt to Python 3.4 builds. closes #8188, closes #7074, closes #6403, closes #7171, closes #6947
pandas-dev · Jun 20, 2015 · 9220309 · 9220309
1 parent 0b74c72
commit 9220309
Show file tree

Hide file tree

Showing 13 changed files with 179 additions and 14 deletions.
diff --git a/ci/requirements-3.4.txt b/ci/requirements-3.4.txt
@@ -3,6 +3,7 @@ pytz
 openpyxl
 xlsxwriter
 xlrd
+xlwt
 html5lib
 patsy
 beautiful-soup

diff --git a/ci/requirements-3.4_SLOW.txt b/ci/requirements-3.4_SLOW.txt
@@ -3,6 +3,7 @@ pytz
 openpyxl
 xlsxwriter
 xlrd
+xlwt
 html5lib
 patsy
 beautiful-soup

diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -249,10 +249,9 @@ Optional Dependencies
 * `statsmodels <http://statsmodels.sourceforge.net/>`__
    * Needed for parts of :mod:`pandas.stats`
 * `openpyxl <http://packages.python.org/openpyxl/>`__, `xlrd/xlwt <http://www.python-excel.org/>`__
-   * openpyxl version 1.6.1 or higher, but lower than 2.0.0
    * Needed for Excel I/O
 * `XlsxWriter <https://pypi.python.org/pypi/XlsxWriter>`__
-   * Alternative Excel writer.
+   * Alternative Excel writer
 * `boto <https://pypi.python.org/pypi/boto>`__: necessary for Amazon S3
   access.
 * `blosc <https://pypi.python.org/pypi/blosc>`__: for msgpack compression using ``blosc``

diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -2130,7 +2130,9 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
        df1.to_excel(writer, sheet_name='Sheet1')
        df2.to_excel(writer, sheet_name='Sheet2')
 
-.. note:: Wringing a little more performance out of ``read_excel``
+.. note::
+
+    Wringing a little more performance out of ``read_excel``
     Internally, Excel stores all numeric data as floats. Because this can
     produce unexpected behavior when reading in data, pandas defaults to trying
     to convert floats to integers if it doesn't lose information (``1.0 -->
@@ -2182,6 +2184,45 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:
 
    df.to_excel('path_to_file.xlsx', sheet_name='Sheet1')
 
+Writing Excel Files to Memory
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.17
+
+.. _io.excel_writing_buffer:
+
+Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or
+``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`.
+
+.. code-block:: python
+
+   # Safe import for either Python 2.x or 3.x
+   try:
+       from io import BytesIO
+   except ImportError:
+       from cStringIO import StringIO as BytesIO
+
+   bio = BytesIO()
+
+   # By setting the 'engine' in the ExcelWriter constructor.
+   writer = ExcelWriter(bio, engine='xlsxwriter')
+   df.to_excel(writer, sheet_name='Sheet1')
+
+   # Save the workbook
+   writer.save()
+
+   # Seek to the beginning and read to copy the workbook to a variable in memory
+   bio.seek(0)
+   workbook = bio.read()
+
+.. note::
+
+    ``engine`` is optional but recommended.  Setting the engine determines
+    the version of workbook produced. Setting ``engine='xlwt'`` will produce an
+    Excel 2003-format workbook (xls).  Using either ``'openpyxl'`` or
+    ``'xlsxwriter'`` will produce an Excel 2007-format workbook (xlsx). If
+    omitted, an Excel 2007-formatted workbook is produced.
+
 .. _io.clipboard:
 
 Clipboard

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -38,6 +38,8 @@ Backwards incompatible API changes
 
 Other API Changes
 ^^^^^^^^^^^^^^^^^
+- Enable writing Excel files in :ref:`memory <io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
+- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
 
 .. _whatsnew_0170.deprecations:
 
@@ -53,11 +55,15 @@ Removal of prior version deprecations/changes
 
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
+- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
 
 .. _whatsnew_0170.bug_fixes:
 
 Bug Fixes
 ~~~~~~~~~
 - Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
+
+
 - Bug in ``pd.Series`` when setting a value on an empty ``Series`` whose index has a frequency. (:issue:`10193`)
 - Bug in ``DataFrame.reset_index`` when index contains `NaT`. (:issue:`10388`)
+- Bug in ``read_excel`` when worksheet is empty (:issue:`6403`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1246,6 +1246,9 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
         >>> df1.to_excel(writer,'Sheet1')
         >>> df2.to_excel(writer,'Sheet2')
         >>> writer.save()
+
+        For compatibility with to_csv, to_excel serializes lists and dicts to
+        strings before writing.
         """
         from pandas.io.excel import ExcelWriter
         if self.columns.nlevels > 1:

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -9,11 +9,13 @@
 import abc
 import numpy as np
 
+from pandas.core.frame import DataFrame
 from pandas.io.parsers import TextParser
 from pandas.io.common import _is_url, _urlopen
 from pandas.tseries.period import Period
 from pandas import json
-from pandas.compat import map, zip, reduce, range, lrange, u, add_metaclass
+from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
+                           BytesIO, string_types)
 from pandas.core import config
 from pandas.core.common import pprint_thing
 import pandas.compat as compat
@@ -417,10 +419,13 @@ def _parse_cell(cell_contents,cell_typ):
                     if parse_cols is None or should_parse[j]:
                         row.append(_parse_cell(value,typ))
                 data.append(row)
-
+
+            if sheet.nrows == 0:
+                return DataFrame()
+
             if header is not None:
                 data[header] = _trim_excel_header(data[header])
-    
+
             parser = TextParser(data, header=header, index_col=index_col,
                                 has_index_names=has_index_names,
                                 na_values=na_values,
@@ -474,6 +479,8 @@ def _conv_value(val):
         val = bool(val)
     elif isinstance(val, Period):
         val = "%s" % val
+    elif com.is_list_like(val):
+        val = str(val)
 
     return val
 
@@ -497,6 +504,11 @@ class ExcelWriter(object):
     datetime_format : string, default None
         Format string for datetime objects written into Excel files
         (e.g. 'YYYY-MM-DD HH:MM:SS')
+
+    Notes
+    -----
+    For compatibility with CSV writers, ExcelWriter serializes lists
+    and dicts to strings before writing.
     """
     # Defining an ExcelWriter implementation (see abstract methods for more...)
 
@@ -521,9 +533,13 @@ class ExcelWriter(object):
     # ExcelWriter.
     def __new__(cls, path, engine=None, **kwargs):
         # only switch class if generic(ExcelWriter)
-        if cls == ExcelWriter:
+        if issubclass(cls, ExcelWriter):
             if engine is None:
-                ext = os.path.splitext(path)[-1][1:]
+                if isinstance(path, string_types):
+                    ext = os.path.splitext(path)[-1][1:]
+                else:
+                    ext = 'xlsx'
+
                 try:
                     engine = config.get_option('io.excel.%s.writer' % ext)
                 except KeyError:
@@ -574,7 +590,11 @@ def save(self):
     def __init__(self, path, engine=None,
                  date_format=None, datetime_format=None, **engine_kwargs):
         # validate that this engine can handle the extension
-        ext = os.path.splitext(path)[-1]
+        if isinstance(path, string_types):
+            ext = os.path.splitext(path)[-1]
+        else:
+            ext = 'xls' if engine == 'xlwt' else 'xlsx'
+
         self.check_extension(ext)
 
         self.path = path
@@ -1159,7 +1179,7 @@ class _XlwtWriter(ExcelWriter):
     def __init__(self, path, engine=None, encoding=None, **engine_kwargs):
         # Use the xlwt module as the Excel writer.
         import xlwt
-
+        engine_kwargs['engine'] = engine
         super(_XlwtWriter, self).__init__(path, **engine_kwargs)
 
         if encoding is None:
@@ -1311,6 +1331,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
         style_dict = {}
 
         for cell in cells:
+            val = _conv_value(cell.val)
+
             num_format_str = None
             if isinstance(cell.val, datetime.datetime):
                 num_format_str = self.datetime_format
@@ -1336,7 +1358,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0):
             else:
                 wks.write(startrow + cell.row,
                           startcol + cell.col,
-                          cell.val, style)
+                          val, style)
 
     def _convert_to_style(self, style_dict, num_format_str=None):
         """

diff --git a/pandas/io/tests/data/blank.xls b/pandas/io/tests/data/blank.xls
diff --git a/pandas/io/tests/data/blank.xlsx b/pandas/io/tests/data/blank.xlsx
diff --git a/pandas/io/tests/data/blank_with_header.xls b/pandas/io/tests/data/blank_with_header.xls
diff --git a/pandas/io/tests/data/blank_with_header.xlsx b/pandas/io/tests/data/blank_with_header.xlsx
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -1,6 +1,6 @@
 # pylint: disable=E1101
 
-from pandas.compat import u, range, map, openpyxl_compat
+from pandas.compat import u, range, map, openpyxl_compat, BytesIO, iteritems
 from datetime import datetime, date, time
 import sys
 import os
@@ -455,7 +455,7 @@ def test_reading_multiple_specific_sheets(self):
     def test_creating_and_reading_multiple_sheets(self):
         # Test reading multiple sheets, from a runtime created excel file
         # with multiple sheets.
-         # See PR #9450       
+        # See PR #9450
 
         _skip_if_no_xlrd()
         _skip_if_no_xlwt()
@@ -471,7 +471,7 @@ def tdf(sheetname):
 
         with ensure_clean('.xlsx') as pth:
             with ExcelWriter(pth) as ew:
-                for sheetname, df in dfs.iteritems():
+                for sheetname, df in iteritems(dfs):
                     df.to_excel(ew,sheetname)
             dfs_returned = pd.read_excel(pth,sheetname=sheets)
             for s in sheets:
@@ -520,6 +520,29 @@ def test_reader_seconds(self):
         actual = read_excel(epoch_1904, 'Sheet1')
         tm.assert_frame_equal(actual, expected)
 
+    # GH6403
+    def test_read_excel_blank(self):
+        _skip_if_no_xlrd()
+
+        blank = os.path.join(self.dirpath, 'blank.xls')
+        actual = read_excel(blank, 'Sheet1')
+        tm.assert_frame_equal(actual, DataFrame())
+
+        blank = os.path.join(self.dirpath, 'blank.xlsx')
+        actual = read_excel(blank, 'Sheet1')
+        tm.assert_frame_equal(actual, DataFrame())
+
+    def test_read_excel_blank_with_header(self):
+        _skip_if_no_xlrd()
+
+        expected = DataFrame(columns=['col_1', 'col_2'])
+        blank = os.path.join(self.dirpath, 'blank_with_header.xls')
+        actual = read_excel(blank, 'Sheet1')
+        tm.assert_frame_equal(actual, expected)
+
+        blank = os.path.join(self.dirpath, 'blank_with_header.xlsx')
+        actual = read_excel(blank, 'Sheet1')
+        tm.assert_frame_equal(actual, expected)
 
 class ExcelWriterBase(SharedItems):
     # Base class for test cases to run with different Excel writers.
@@ -1218,6 +1241,30 @@ def test_datetimes(self):
 
             tm.assert_series_equal(write_frame['A'], read_frame['A'])
 
+    # GH7074
+    def test_bytes_io(self):
+        bio = BytesIO()
+        df = DataFrame(np.random.randn(10, 2))
+        writer = ExcelWriter(bio)
+        df.to_excel(writer)
+        writer.save()
+        bio.seek(0)
+        reread_df = pd.read_excel(bio)
+        tm.assert_frame_equal(df, reread_df)
+
+    # GH8188
+    def test_write_lists_dict(self):
+        df = pd.DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}],
+                           'numeric': [1, 2, 3.0],
+                           'str': ['apple', 'banana', 'cherry']})
+        expected = df.copy()
+        expected.mixed = expected.mixed.apply(str)
+        expected.numeric = expected.numeric.astype('int64')
+        with ensure_clean(self.ext) as path:
+            df.to_excel(path, 'Sheet1')
+            read = read_excel(path, 'Sheet1', header=0)
+            tm.assert_frame_equal(read, expected)
+
 def raise_wrapper(major_ver):
     def versioned_raise_wrapper(orig_method):
         @functools.wraps(orig_method)
@@ -1512,6 +1559,7 @@ class XlsxWriterTests_NoMerge(ExcelWriterBase, tm.TestCase):
 
 
 class ExcelWriterEngineTests(tm.TestCase):
+
     def test_ExcelWriter_dispatch(self):
         with tm.assertRaisesRegexp(ValueError, 'No engine'):
             ExcelWriter('nothing')

diff --git a/vb_suite/packers.py b/vb_suite/packers.py
@@ -7,6 +7,7 @@
 import os
 import pandas as pd
 from pandas.core import common as com
+from pandas.compat import BytesIO
 from random import randrange
 
 f = '__test__.msg'
@@ -206,3 +207,46 @@ def remove(f):
 packers_read_stata_with_validation = Benchmark("pd.read_stata(f)", setup, start_date=start_date)
 
 packers_write_stata_with_validation = Benchmark("df.to_stata(f, {'index': 'tc'})", setup, cleanup="remove(f)", start_date=start_date)
+
+#----------------------------------------------------------------------
+# Excel - alternative writers
+setup = common_setup + """
+bio = BytesIO()
+"""
+
+excel_writer_bench = """
+bio.seek(0)
+writer = pd.io.excel.ExcelWriter(bio, engine='{engine}')
+df[:2000].to_excel(writer)
+writer.save()
+"""
+
+benchmark_xlsxwriter = excel_writer_bench.format(engine='xlsxwriter')
+
+packers_write_excel_xlsxwriter = Benchmark(benchmark_xlsxwriter, setup)
+
+benchmark_openpyxl = excel_writer_bench.format(engine='openpyxl')
+
+packers_write_excel_openpyxl = Benchmark(benchmark_openpyxl, setup)
+
+benchmark_xlwt = excel_writer_bench.format(engine='xlwt')
+
+packers_write_excel_xlwt = Benchmark(benchmark_xlwt, setup)
+
+
+#----------------------------------------------------------------------
+# Excel - reader
+
+setup = common_setup + """
+bio = BytesIO()
+writer = pd.io.excel.ExcelWriter(bio, engine='xlsxwriter')
+df[:2000].to_excel(writer)
+writer.save()
+"""
+
+benchmark_read_excel="""
+bio.seek(0)
+pd.read_excel(bio)
+"""
+
+packers_read_excel = Benchmark(benchmark_read_excel, setup)