Skip to content

Commit

Permalink
Merge pull request pandas-dev#4308 from cpcloud/read-excel-page-numbe…
Browse files Browse the repository at this point in the history
…r-sheet-name

ENH: add integer sheetname support in read_excel
  • Loading branch information
cpcloud committed Jul 25, 2013
2 parents 7b47d31 + 121c4dc commit 5d2b85f
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 28 deletions.
20 changes: 20 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1532,6 +1532,26 @@ advanced strategies
read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA'])
.. versionadded:: 0.13

There are now two ways to read in sheets from an Excel file. You can provide
either the index of a sheet or its name. If the value provided is an integer,
it is interpreted as the zero-based index of the sheet (``0`` is the first
sheet); if it is a string, it is interpreted as the name of a particular sheet
in the file.

Using the sheet name:

.. code-block:: python
read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA'])
Using the sheet index:

.. code-block:: python
read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA'])
You will often insert columns to do temporary computations in Excel, and you
may not want to read in those columns. ``read_excel`` takes a ``parse_cols``
keyword to allow you to specify a subset of columns to parse.
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ pandas 0.13

- ``read_html`` now raises a ``URLError`` instead of catching and raising a
``ValueError`` (:issue:`4303`, :issue:`4305`)
- ``read_excel`` now accepts an integer for its ``sheetname`` argument, which
is interpreted as the zero-based index of the sheet to read in (:issue:`4301`).

**API Changes**

Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ enhancements along with a large number of bug fixes.
API changes
~~~~~~~~~~~

- ``read_excel`` now accepts an integer for its ``sheetname`` argument, which
is interpreted as the zero-based index of the sheet to read in (:issue:`4301`).

Enhancements
~~~~~~~~~~~~

Expand Down
50 changes: 23 additions & 27 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ def read_excel(path_or_buf, sheetname, kind=None, **kwds):
parsed : DataFrame
DataFrame from the passed in Excel file
"""
return ExcelFile(path_or_buf,kind=kind).parse(sheetname=sheetname,
kind=kind, **kwds)
return ExcelFile(path_or_buf, kind=kind).parse(sheetname=sheetname,
kind=kind, **kwds)


class ExcelFile(object):
"""
Expand Down Expand Up @@ -86,8 +87,8 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
Parameters
----------
sheetname : string
Name of Excel sheet
sheetname : string or integer
Name of Excel sheet or the page number of the sheet
header : int, default 0
Row to use for the column labels of the parsed DataFrame
skiprows : list-like
Expand Down Expand Up @@ -117,27 +118,20 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
parsed : DataFrame
DataFrame parsed from the Excel file
"""

# has_index_names: boolean, default False
# True if the cols defined in index_col have an index name and are
# not in the header
has_index_names = False # removed as new argument of API function

skipfooter = kwds.pop('skipfooter', None)
if skipfooter is not None:
skip_footer = skipfooter

return self._parse_excel(sheetname, header=header,
skiprows=skiprows, index_col=index_col,
has_index_names=has_index_names,
parse_cols=parse_cols,
parse_dates=parse_dates,
date_parser=date_parser,
na_values=na_values,
thousands=thousands,
chunksize=chunksize,
skip_footer=skip_footer,
**kwds)
return self._parse_excel(sheetname, header=header, skiprows=skiprows,
index_col=index_col,
has_index_names=has_index_names,
parse_cols=parse_cols,
parse_dates=parse_dates,
date_parser=date_parser, na_values=na_values,
thousands=thousands, chunksize=chunksize,
skip_footer=skip_footer, **kwds)

def _should_parse(self, i, parse_cols):

Expand Down Expand Up @@ -171,20 +165,22 @@ def _excel2num(x):
else:
return i in parse_cols

def _parse_excel(self, sheetname, header=0, skiprows=None,
skip_footer=0, index_col=None, has_index_names=None,
parse_cols=None, parse_dates=False, date_parser=None,
na_values=None, thousands=None, chunksize=None,
**kwds):
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
index_col=None, has_index_names=None, parse_cols=None,
parse_dates=False, date_parser=None, na_values=None,
thousands=None, chunksize=None, **kwds):
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN)

datemode = self.book.datemode
sheet = self.book.sheet_by_name(sheetname)
if isinstance(sheetname, basestring):
sheet = self.book.sheet_by_name(sheetname)
else: # assume an integer if not a string
sheet = self.book.sheet_by_index(sheetname)

data = []
should_parse = {}
for i in range(sheet.nrows):
for i in xrange(sheet.nrows):
row = []
for j, (value, typ) in enumerate(izip(sheet.row_values(i),
sheet.row_types(i))):
Expand Down Expand Up @@ -225,7 +221,7 @@ def _parse_excel(self, sheetname, header=0, skiprows=None,

@property
def sheet_names(self):
return self.book.sheet_names()
return self.book.sheet_names()


def _trim_excel_header(row):
Expand Down
46 changes: 45 additions & 1 deletion pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def setUp(self):
self.csv1 = os.path.join(self.dirpath, 'test1.csv')
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
self.xls1 = os.path.join(self.dirpath, 'test.xls')
self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx')
self.frame = _frame.copy()
self.frame2 = _frame2.copy()
self.tsframe = _tsframe.copy()
Expand Down Expand Up @@ -198,6 +199,49 @@ def test_excel_passes_na(self):
columns=['Test'])
tm.assert_frame_equal(parsed, expected)

def check_excel_table_sheet_by_index(self, filename, csvfile):
    """Check that sheets selected by integer position parse as expected.

    Parameters
    ----------
    filename : string
        Excel workbook to open; it is joined onto ``self.dirpath``.
    csvfile : string
        CSV file holding the expected contents of the parsed sheets.
    """
    import xlrd

    # Keyword arguments shared by every parse/read call below.
    common = dict(index_col=0, parse_dates=True)

    workbook = ExcelFile(os.path.join(self.dirpath, filename))

    # Sheets at positions 0 and 1 must both match the reference CSV; the
    # second sheet is read with ``skiprows=[1]``.
    first_sheet = workbook.parse(0, **common)
    expected = self.read_csv(csvfile, **common)
    second_sheet = workbook.parse(1, skiprows=[1], **common)
    tm.assert_frame_equal(first_sheet, expected, check_names=False)
    tm.assert_frame_equal(second_sheet, expected, check_names=False)

    # Both spellings of the footer keyword must drop the last row.
    via_skipfooter = workbook.parse(0, skipfooter=1, **common)
    via_skip_footer = workbook.parse(0, skip_footer=1, **common)
    tm.assert_frame_equal(via_skipfooter, first_sheet.ix[:-1])
    tm.assert_frame_equal(via_skipfooter, via_skip_footer)

    # A string is looked up as a sheet name; this one is expected to fail.
    self.assertRaises(xlrd.XLRDError, workbook.parse, 'asdf')

def test_excel_table_sheet_by_index(self):
    """Run the sheet-by-index check against the .xls and .xlsx fixtures."""
    _skip_if_no_xlrd()
    # Both workbooks are compared against the same reference CSV.
    reference_csv = self.csv1
    for workbook_path in (self.xls1, self.xlsx1):
        self.check_excel_table_sheet_by_index(workbook_path, reference_csv)

def check_excel_sheet_by_name_raise(self, ext):
    """Round-trip a frame and check that int and str sheet keys differ.

    Parameters
    ----------
    ext : string
        File extension (without the dot) of the temporary workbook.
    """
    import xlrd

    target = os.path.join(self.dirpath, 'testit.{0}'.format(ext))

    with ensure_clean(target) as pth:
        written = DataFrame(np.random.randn(10, 2))
        written.to_excel(pth)

        reader = ExcelFile(pth)
        # Integer 0 selects the sheet by position.
        read_back = reader.parse(0)
        tm.assert_frame_equal(written, read_back)

        # The string '0' is treated as a sheet *name*, so the lookup is
        # expected to fail rather than fall back to position 0.
        self.assertRaises(xlrd.XLRDError, reader.parse, '0')

def test_excel_sheet_by_name_raise(self):
    """Run the sheet-name-vs-index check for each supported extension."""
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()
    # NOTE(review): the 'xlsx' case writes a workbook too; presumably that
    # needs a separate writer library than xlwt -- confirm whether an
    # additional skip guard is required.
    extensions = ['xls', 'xlsx']
    for extension in extensions:
        self.check_excel_sheet_by_name_raise(extension)

def test_excel_table(self):
_skip_if_no_xlrd()

Expand Down Expand Up @@ -438,7 +482,6 @@ def _check_extension_sheets(self, ext):
np.testing.assert_equal('test1', reader.sheet_names[0])
np.testing.assert_equal('test2', reader.sheet_names[1])


def test_excel_roundtrip_xls_colaliases(self):
_skip_if_no_excelsuite()
self._check_extension_colaliases('xls')
Expand Down Expand Up @@ -892,6 +935,7 @@ def test_deprecated_from_parsers(self):
from pandas.io.parsers import ExcelWriter as xw
xw(path)


if __name__ == '__main__':
    # Command-line style arguments handed to nose when this module is run
    # directly as a script.
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)

0 comments on commit 5d2b85f

Please sign in to comment.