Merge pull request pandas-dev#4308 from cpcloud/read-excel-page-number-sheet-name

ENH: add integer sheetname support in read_excel
guyrt · Jul 25, 2013 · 5d2b85f · 5d2b85f
2 parents 7b47d31 + 121c4dc
commit 5d2b85f
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 28 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1532,6 +1532,26 @@ advanced strategies
 
         read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA'])
 
+.. versionadded:: 0.13
+
+There are now two ways to select a sheet from an Excel file: by its name or by
+its index. If the value provided is a string, it is taken to be the name of a
+sheet in the file; otherwise it is assumed to be an integer giving the index
+of the sheet. As the examples below show, sheet indices start at 0, so ``0``
+refers to the first sheet.
+
+Using the sheet name:
+
+.. code-block:: python
+
+   read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA'])
+
+Using the sheet index:
+
+.. code-block:: python
+
+   read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA'])
+
 It is often the case that users will insert columns to do temporary computations
 in Excel and you may not want to read in those columns. `read_excel` takes
 a `parse_cols` keyword to allow you to specify a subset of columns to parse.

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -37,6 +37,8 @@ pandas 0.13
 
   - ``read_html`` now raises a ``URLError`` instead of catching and raising a
     ``ValueError`` (:issue:`4303`, :issue:`4305`)
+  - ``read_excel`` now supports an integer in its ``sheetname`` argument giving
+    the index of the sheet to read in (:issue:`4301`).
 
 **API Changes**
 

diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -9,6 +9,9 @@ enhancements along with a large number of bug fixes.
 API changes
 ~~~~~~~~~~~
 
+  - ``read_excel`` now supports an integer in its ``sheetname`` argument giving
+    the index of the sheet to read in (:issue:`4301`).
+
 Enhancements
 ~~~~~~~~~~~~
 

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -48,8 +48,9 @@ def read_excel(path_or_buf, sheetname, kind=None, **kwds):
     parsed : DataFrame
         DataFrame from the passed in Excel file
     """
-    return ExcelFile(path_or_buf,kind=kind).parse(sheetname=sheetname,
-                                                  kind=kind, **kwds)
+    return ExcelFile(path_or_buf, kind=kind).parse(sheetname=sheetname,
+                                                   kind=kind, **kwds)
+
 
 class ExcelFile(object):
     """
@@ -86,8 +87,8 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
 
         Parameters
         ----------
-        sheetname : string
-            Name of Excel sheet
+        sheetname : string or integer
+            Name of the Excel sheet, or the zero-based index of the sheet
         header : int, default 0
             Row to use for the column labels of the parsed DataFrame
         skiprows : list-like
@@ -117,27 +118,20 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
         parsed : DataFrame
             DataFrame parsed from the Excel file
         """
-
-        # has_index_names: boolean, default False
-        #     True if the cols defined in index_col have an index name and are
-        #     not in the header
         has_index_names = False  # removed as new argument of API function
 
         skipfooter = kwds.pop('skipfooter', None)
         if skipfooter is not None:
             skip_footer = skipfooter
 
-        return  self._parse_excel(sheetname, header=header,
-                                     skiprows=skiprows, index_col=index_col,
-                                     has_index_names=has_index_names,
-                                     parse_cols=parse_cols,
-                                     parse_dates=parse_dates,
-                                     date_parser=date_parser,
-                                     na_values=na_values,
-                                     thousands=thousands,
-                                     chunksize=chunksize,
-                                     skip_footer=skip_footer,
-                                     **kwds)
+        return  self._parse_excel(sheetname, header=header, skiprows=skiprows,
+                                  index_col=index_col,
+                                  has_index_names=has_index_names,
+                                  parse_cols=parse_cols,
+                                  parse_dates=parse_dates,
+                                  date_parser=date_parser, na_values=na_values,
+                                  thousands=thousands, chunksize=chunksize,
+                                  skip_footer=skip_footer, **kwds)
 
     def _should_parse(self, i, parse_cols):
 
@@ -171,20 +165,22 @@ def _excel2num(x):
         else:
             return i in parse_cols
 
-    def _parse_excel(self, sheetname, header=0, skiprows=None,
-                   skip_footer=0, index_col=None, has_index_names=None,
-                   parse_cols=None, parse_dates=False, date_parser=None,
-                   na_values=None, thousands=None, chunksize=None,
-                   **kwds):
+    def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
+                     index_col=None, has_index_names=None, parse_cols=None,
+                     parse_dates=False, date_parser=None, na_values=None,
+                     thousands=None, chunksize=None, **kwds):
         from xlrd import (xldate_as_tuple, XL_CELL_DATE,
                           XL_CELL_ERROR, XL_CELL_BOOLEAN)
 
         datemode = self.book.datemode
-        sheet = self.book.sheet_by_name(sheetname)
+        if isinstance(sheetname, basestring):
+            sheet = self.book.sheet_by_name(sheetname)
+        else:  # assume an integer if not a string
+            sheet = self.book.sheet_by_index(sheetname)
 
         data = []
         should_parse = {}
-        for i in range(sheet.nrows):
+        for i in xrange(sheet.nrows):
             row = []
             for j, (value, typ) in enumerate(izip(sheet.row_values(i),
                                                   sheet.row_types(i))):
@@ -225,7 +221,7 @@ def _parse_excel(self, sheetname, header=0, skiprows=None,
 
     @property
     def sheet_names(self):
-            return self.book.sheet_names()
+        return self.book.sheet_names()
 
 
 def _trim_excel_header(row):

diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -82,6 +82,7 @@ def setUp(self):
         self.csv1 = os.path.join(self.dirpath, 'test1.csv')
         self.csv2 = os.path.join(self.dirpath, 'test2.csv')
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
+        self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx')
         self.frame = _frame.copy()
         self.frame2 = _frame2.copy()
         self.tsframe = _tsframe.copy()
@@ -198,6 +199,49 @@ def test_excel_passes_na(self):
                              columns=['Test'])
         tm.assert_frame_equal(parsed, expected)
 
+    def check_excel_table_sheet_by_index(self, filename, csvfile):
+        import xlrd
+
+        pth = os.path.join(self.dirpath, filename)
+        xls = ExcelFile(pth)
+        df = xls.parse(0, index_col=0, parse_dates=True)
+        df2 = self.read_csv(csvfile, index_col=0, parse_dates=True)
+        df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True)
+        tm.assert_frame_equal(df, df2, check_names=False)
+        tm.assert_frame_equal(df3, df2, check_names=False)
+
+        df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1)
+        df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1)
+        tm.assert_frame_equal(df4, df.ix[:-1])
+        tm.assert_frame_equal(df4, df5)
+
+        self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf')
+
+    def test_excel_table_sheet_by_index(self):
+        _skip_if_no_xlrd()
+        for filename, csvfile in [(self.xls1, self.csv1),
+                                  (self.xlsx1, self.csv1)]:
+            self.check_excel_table_sheet_by_index(filename, csvfile)
+
+    def check_excel_sheet_by_name_raise(self, ext):
+        import xlrd
+        pth = os.path.join(self.dirpath, 'testit.{0}'.format(ext))
+
+        with ensure_clean(pth) as pth:
+            gt = DataFrame(np.random.randn(10, 2))
+            gt.to_excel(pth)
+            xl = ExcelFile(pth)
+            df = xl.parse(0)
+            tm.assert_frame_equal(gt, df)
+
+            self.assertRaises(xlrd.XLRDError, xl.parse, '0')
+
+    def test_excel_sheet_by_name_raise(self):
+        _skip_if_no_xlrd()
+        _skip_if_no_xlwt()
+        for ext in ('xls', 'xlsx'):
+            self.check_excel_sheet_by_name_raise(ext)
+
     def test_excel_table(self):
         _skip_if_no_xlrd()
 
@@ -438,7 +482,6 @@ def _check_extension_sheets(self, ext):
             np.testing.assert_equal('test1', reader.sheet_names[0])
             np.testing.assert_equal('test2', reader.sheet_names[1])
 
-
     def test_excel_roundtrip_xls_colaliases(self):
         _skip_if_no_excelsuite()
         self._check_extension_colaliases('xls')
@@ -892,6 +935,7 @@ def test_deprecated_from_parsers(self):
                 from pandas.io.parsers import ExcelWriter as xw
                 xw(path)
 
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)