Skip to content

Commit

Permalink
DEPR: Deprecate using xlrd engine for read_excel (#35029)
Browse files Browse the repository at this point in the history
  • Loading branch information
roberthdevries authored Dec 1, 2020
1 parent 1829a61 commit b3a3932
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 8 deletions.
9 changes: 9 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,15 @@ including other versions of pandas.

{{ header }}

.. warning::

Previously, the default argument ``engine=None`` to ``pd.read_excel``
would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in
many cases. The engine ``xlrd`` is no longer maintained, and is not supported with
python >= 3.9. If `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
many of these cases will now default to using the ``openpyxl`` engine. See the
:func:`read_excel` documentation for more details.

.. ---------------------------------------------------------------------------
Enhancements
Expand Down
95 changes: 91 additions & 4 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import abc
import datetime
import inspect
from io import BufferedIOBase, BytesIO, RawIOBase
import os
from textwrap import fill
from typing import Any, Dict, Mapping, Union, cast
import warnings

from pandas._config import config

from pandas._libs.parsers import STR_NA_VALUES
from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions
from pandas.compat._optional import import_optional_dependency
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments

Expand Down Expand Up @@ -99,12 +102,32 @@
of dtype conversion.
engine : str, default None
If io is not a buffer or path, this must be set to identify io.
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd".
Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
Engine compatibility :
- "xlrd" supports most old/new Excel file formats.
- "openpyxl" supports newer Excel file formats.
- "odf" supports OpenDocument file formats (.odf, .ods, .odt).
- "pyxlsb" supports Binary Excel files.
.. versionchanged:: 1.2.0
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
is no longer maintained, and is not supported with
python >= 3.9. When ``engine=None``, the following logic will be
used to determine the engine.
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
be used.
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
then ``openpyxl`` will be used.
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
Specifying ``engine="xlrd"`` will continue to be allowed for the
indefinite future.
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
Expand Down Expand Up @@ -880,13 +903,32 @@ class ExcelFile:
.xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
engine : str, default None
If io is not a buffer or path, this must be set to identify io.
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
default ``xlrd``.
Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
Engine compatibility :
- ``xlrd`` supports most old/new Excel file formats.
- ``openpyxl`` supports newer Excel file formats.
- ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
- ``pyxlsb`` supports Binary Excel files.
.. versionchanged:: 1.2.0
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
is no longer maintained, and is not supported with
python >= 3.9. When ``engine=None``, the following logic will be
used to determine the engine.
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
will be used.
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
then ``openpyxl`` will be used.
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
Specifying ``engine="xlrd"`` will continue to be allowed for the
indefinite future.
"""

from pandas.io.excel._odfreader import ODFReader
Expand All @@ -905,14 +947,59 @@ def __init__(
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
if engine is None:
engine = "xlrd"
# Determine ext and use odf for ods stream/file
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
ext = None
if _is_ods_stream(path_or_buffer):
engine = "odf"
else:
ext = os.path.splitext(str(path_or_buffer))[-1]
if ext == ".ods":
engine = "odf"

if (
import_optional_dependency(
"xlrd", raise_on_missing=False, on_version="ignore"
)
is not None
):
from xlrd import Book

if isinstance(path_or_buffer, Book):
engine = "xlrd"

# GH 35029 - Prefer openpyxl except for xls files
if engine is None:
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
engine = "xlrd"
elif (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
is not None
):
engine = "openpyxl"
else:
caller = inspect.stack()[1]
if (
caller.filename.endswith("pandas/io/excel/_base.py")
and caller.function == "read_excel"
):
stacklevel = 4
else:
stacklevel = 2
warnings.warn(
"The xlrd engine is no longer maintained and is not "
"supported when using pandas with python >= 3.9. However, "
"the engine xlrd will continue to be allowed for the "
"indefinite future. Beginning with pandas 1.2.0, the "
"openpyxl engine will be used if it is installed and the "
"engine argument is not specified. Either install openpyxl "
"or specify engine='xlrd' to silence this warning.",
FutureWarning,
stacklevel=stacklevel,
)
engine = "xlrd"
if engine not in self._engines:
raise ValueError(f"Unknown engine: {engine}")

Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,6 +577,10 @@ def test_date_conversion_overflow(self, read_ext):
if pd.read_excel.keywords["engine"] == "openpyxl":
pytest.xfail("Maybe not supported by openpyxl")

if pd.read_excel.keywords["engine"] is None:
# GH 35029
pytest.xfail("Defaults to openpyxl, maybe not supported")

result = pd.read_excel("testdateoverflow" + read_ext)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1159,7 +1163,7 @@ def test_excel_high_surrogate(self, engine):
expected = DataFrame(["\udc88"], columns=["Column1"])

# should not produce a segmentation violation
actual = pd.read_excel("high_surrogate.xlsx")
actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
tm.assert_frame_equal(expected, actual)

@pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/io/excel/test_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,12 +351,15 @@ def test_excel_sheet_by_name_raise(self, path, engine):
msg = "sheet 0 not found"
with pytest.raises(ValueError, match=msg):
pd.read_excel(xl, "0")
else:
elif engine == "xlwt":
import xlrd

msg = "No sheet named <'0'>"
with pytest.raises(xlrd.XLRDError, match=msg):
pd.read_excel(xl, sheet_name="0")
else:
with pytest.raises(KeyError, match="Worksheet 0 does not exist."):
pd.read_excel(xl, sheet_name="0")

def test_excel_writer_context_manager(self, frame, path):
with ExcelWriter(path) as writer:
Expand Down Expand Up @@ -1192,7 +1195,9 @@ def test_datetimes(self, path):

write_frame = DataFrame({"A": datetimes})
write_frame.to_excel(path, "Sheet1")
read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
# GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
engine = "odf" if path.endswith("ods") else "xlrd"
read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)

tm.assert_series_equal(write_frame["A"], read_frame["A"])

Expand Down
46 changes: 45 additions & 1 deletion pandas/tests/io/excel/test_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest

from pandas.compat._optional import import_optional_dependency

import pandas as pd
import pandas._testing as tm

Expand Down Expand Up @@ -38,6 +40,48 @@ def test_read_xlrd_book(read_ext, frame):
# TODO: test for openpyxl as well
def test_excel_table_sheet_by_index(datapath, read_ext):
path = datapath("io", "data", "excel", f"test1{read_ext}")
with ExcelFile(path) as excel:
with ExcelFile(path, engine="xlrd") as excel:
with pytest.raises(xlrd.XLRDError):
pd.read_excel(excel, sheet_name="asdf")


def test_excel_file_warning_with_xlsx_file(datapath):
# GH 29375
path = datapath("io", "data", "excel", "test1.xlsx")
has_openpyxl = (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
is not None
)
if not has_openpyxl:
with tm.assert_produces_warning(
FutureWarning,
raise_on_extra_warnings=False,
match="The xlrd engine is no longer maintained",
):
ExcelFile(path, engine=None)
else:
with tm.assert_produces_warning(None):
pd.read_excel(path, "Sheet1", engine=None)


def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
# GH 29375
path = datapath("io", "data", "excel", "test1.xlsx")
has_openpyxl = (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
is not None
)
if not has_openpyxl:
with tm.assert_produces_warning(
FutureWarning,
raise_on_extra_warnings=False,
match="The xlrd engine is no longer maintained",
):
pd.read_excel(path, "Sheet1", engine=None)
else:
with tm.assert_produces_warning(None):
pd.read_excel(path, "Sheet1", engine=None)

0 comments on commit b3a3932

Please sign in to comment.