diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6aff4f4bd41e2..b1257fe893804 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -8,6 +8,15 @@ including other versions of pandas. {{ header }} +.. warning:: + + Previously, the default argument ``engine=None`` to ``pd.read_excel`` + would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in + many cases. The engine ``xlrd`` is no longer maintained, and is not supported with + python >= 3.9. If `openpyxl <https://openpyxl.readthedocs.io/en/stable/>`_ is installed, + many of these cases will now default to using the ``openpyxl`` engine. See the + :func:`read_excel` documentation for more details. + .. --------------------------------------------------------------------------- Enhancements diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index c519baa4c21da..0235d6a3f6384 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,14 +1,17 @@ import abc import datetime +import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill from typing import Any, Dict, Mapping, Union, cast +import warnings from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions +from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -99,12 +102,32 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". Engine compatibility : + - "xlrd" supports most old/new Excel file formats. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + + .. 
versionchanged:: 1.2.0 + The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ + is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf <https://pypi.org/project/odfpy/>`_ will be used. + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will + be used. + - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + then ``openpyxl`` will be used. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future. + converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -877,13 +900,32 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``xlrd``. + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` Engine compatibility : + - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + + .. versionchanged:: 1.2.0 + + The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ + is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf <https://pypi.org/project/odfpy/>`_ will be used. + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` + will be used. + - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + then ``openpyxl`` will be used. 
+ - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future. """ from pandas.io.excel._odfreader import ODFReader @@ -902,14 +944,59 @@ def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: - engine = "xlrd" + # Determine ext and use odf for ods stream/file if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + ext = None if _is_ods_stream(path_or_buffer): engine = "odf" else: ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" + + if ( + import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is not None + ): + from xlrd import Book + + if isinstance(path_or_buffer, Book): + engine = "xlrd" + + # GH 35029 - Prefer openpyxl except for xls files + if engine is None: + if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": + engine = "xlrd" + elif ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ): + engine = "openpyxl" + else: + caller = inspect.stack()[1] + if ( + caller.filename.endswith("pandas/io/excel/_base.py") + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + "The xlrd engine is no longer maintained and is not " + "supported when using pandas with python >= 3.9. However, " + "the engine xlrd will continue to be allowed for the " + "indefinite future. Beginning with pandas 1.2.0, the " + "openpyxl engine will be used if it is installed and the " + "engine argument is not specified. 
Either install openpyxl " + "or specify engine='xlrd' to silence this warning.", + FutureWarning, + stacklevel=stacklevel, + ) + engine = "xlrd" if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index c582a0fa23577..98a55ae39bd77 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -577,6 +577,10 @@ def test_date_conversion_overflow(self, read_ext): if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") + if pd.read_excel.keywords["engine"] is None: + # GH 35029 + pytest.xfail("Defaults to openpyxl, maybe not supported") + result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) @@ -1159,7 +1163,7 @@ def test_excel_high_surrogate(self, engine): expected = DataFrame(["\udc88"], columns=["Column1"]) # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx") + actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd") tm.assert_frame_equal(expected, actual) @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8da9c79160e91..0aaa8be616342 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,12 +351,15 @@ def test_excel_sheet_by_name_raise(self, path, engine): msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - else: + elif engine == "xlwt": import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") + else: + with pytest.raises(KeyError, match="Worksheet 0 does not exist."): + pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1193,7 +1196,9 @@ def 
test_datetimes(self, path): write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) + # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd + engine = "odf" if path.endswith("ods") else "xlrd" + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine) tm.assert_series_equal(write_frame["A"], read_frame["A"]) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 26190edaa4960..f2fbcbc2e2f04 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,5 +1,7 @@ import pytest +from pandas.compat._optional import import_optional_dependency + import pandas as pd import pandas._testing as tm @@ -38,6 +40,48 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", f"test1{read_ext}") - with ExcelFile(path) as excel: + with ExcelFile(path, engine="xlrd") as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, sheet_name="asdf") + + +def test_excel_file_warning_with_xlsx_file(datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + has_openpyxl = ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ) + if not has_openpyxl: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + ExcelFile(path, engine=None) + else: + with tm.assert_produces_warning(None): + pd.read_excel(path, "Sheet1", engine=None) + + +def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + has_openpyxl = ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ) + if not has_openpyxl: + with 
tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + pd.read_excel(path, "Sheet1", engine=None) + else: + with tm.assert_produces_warning(None): + pd.read_excel(path, "Sheet1", engine=None)