Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): add openpyxl as a new/optional engine for read_excel #6183

Merged
merged 23 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
25c5ff6
feat: Add first version of openpyxl both import and exporter for excel.
bvanelli Jan 11, 2023
f580076
fix: Add openpyxl to dev dependencies and format with black.
bvanelli Jan 12, 2023
90eec07
fix: Satisfy mypy and make all excel tests run.
bvanelli Jan 13, 2023
5e415f9
fix: Fix incorrectly placed ignore call overload.
bvanelli Jan 13, 2023
7819b78
fix: Fix and test bytes version and get rid of failing format_path on…
bvanelli Jan 14, 2023
dc568bd
Merge remote-tracking branch 'origin/master' into 5568-excel-writer
bvanelli Jan 14, 2023
162218d
fix: Standarize apis.
bvanelli Jan 21, 2023
819c894
Merge remote-tracking branch 'origin/master' into 5568-excel-writer
bvanelli Jan 21, 2023
f12aff9
fix: Remove extra arg.
bvanelli Jan 21, 2023
e5a5108
refactor: Replace use_openpyxl by engine.
bvanelli Jan 21, 2023
c9176e1
Merge remote-tracking branch 'origin/master' into 5568-excel-writer
bvanelli Mar 13, 2023
f6a5b03
fix: Exclude previous write excel and rework file position of io.excel.
bvanelli Mar 13, 2023
2e4e5f2
Extra fixes from merge request.
bvanelli Mar 13, 2023
2f74aac
Merge remote-tracking branch 'origin/master' into 5568-excel-writer
bvanelli Mar 19, 2023
e62a7be
fix: Fix merge request tests and merge origin (except perhaps mypy).
bvanelli Mar 19, 2023
da4d048
refactor: Reformat with black.
bvanelli Mar 19, 2023
a02a774
refactor: Rerun all formatting
bvanelli Mar 19, 2023
eea78b2
Merge remote-tracking branch 'origin/master' into 5568-excel-writer
bvanelli Mar 20, 2023
1d9a699
Merge remote-tracking branch 'origin/main' into 5568-excel-writer
bvanelli Jun 27, 2023
83cf521
fix: Fix mypy warnings
bvanelli Jun 27, 2023
e77ffd9
fix: Use safe loading of workbook.
bvanelli Sep 3, 2023
bc602ea
Merge remote-tracking branch 'origin/main' into 5568-excel-writer
bvanelli Sep 3, 2023
b4bf1ca
docs: Update documentation to reflect changes.
bvanelli Sep 3, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ You can also install the dependencies directly.
| fsspec | Support for reading from remote file systems |
| connectorx | Support for reading from SQL databases |
| xlsx2csv | Support for reading from Excel files |
| openpyxl | Support for reading from Excel files with native types |
| deltalake | Support for reading from Delta Lake Tables |
| timezone | Timezone support, only needed if are on Python<3.9 or you are on Windows |

Expand Down
105 changes: 88 additions & 17 deletions py-polars/polars/io/excel/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def read_excel(
sheet_name: str,
xlsx2csv_options: dict[str, Any] | None = ...,
read_csv_options: dict[str, Any] | None = ...,
engine: Literal["xlsx2csv", "openpyxl"] | None = ...,
raise_if_empty: bool = ...,
) -> pl.DataFrame:
...
Expand All @@ -35,6 +36,7 @@ def read_excel(
sheet_name: None = ...,
xlsx2csv_options: dict[str, Any] | None = ...,
read_csv_options: dict[str, Any] | None = ...,
engine: Literal["xlsx2csv", "openpyxl"] | None = ...,
raise_if_empty: bool = ...,
) -> pl.DataFrame:
...
Expand All @@ -48,6 +50,7 @@ def read_excel(
sheet_name: str,
xlsx2csv_options: dict[str, Any] | None = ...,
read_csv_options: dict[str, Any] | None = ...,
engine: Literal["xlsx2csv", "openpyxl"] | None = ...,
raise_if_empty: bool = ...,
) -> NoReturn:
...
Expand All @@ -63,6 +66,7 @@ def read_excel(
sheet_name: None = ...,
xlsx2csv_options: dict[str, Any] | None = ...,
read_csv_options: dict[str, Any] | None = ...,
engine: Literal["xlsx2csv", "openpyxl"] | None = ...,
raise_if_empty: bool = ...,
) -> dict[str, pl.DataFrame]:
...
Expand All @@ -76,6 +80,7 @@ def read_excel(
sheet_name: None = ...,
xlsx2csv_options: dict[str, Any] | None = ...,
read_csv_options: dict[str, Any] | None = ...,
engine: Literal["xlsx2csv", "openpyxl"] | None = ...,
raise_if_empty: bool = ...,
) -> pl.DataFrame:
...
Expand All @@ -88,13 +93,18 @@ def read_excel(
sheet_name: str | None = None,
xlsx2csv_options: dict[str, Any] | None = None,
read_csv_options: dict[str, Any] | None = None,
engine: Literal["xlsx2csv", "openpyxl"] | None = None,
raise_if_empty: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
"""
Read Excel (XLSX) sheet into a DataFrame.

Converts an Excel sheet with ``xlsx2csv.Xlsx2csv().convert()`` to CSV and parses the
CSV output with :func:`read_csv`.
If using the ``xlsx2csv`` engine, converts an Excel sheet with
``xlsx2csv.Xlsx2csv().convert()`` to CSV and parses the CSV output with
:func:`read_csv`.

When using the ``openpyxl`` engine, reads an Excel sheet with
``openpyxl.load_workbook(source)``.

Parameters
----------
Expand All @@ -116,6 +126,11 @@ def read_excel(
``xlsx2csv.Xlsx2csv().convert()``
e.g.: ``{"has_header": False, "new_columns": ["a", "b", "c"],
"infer_schema_length": None}``
engine
Library used to parse Excel, either openpyxl or xlsx2csv (default is xlsx2csv).
Please note that xlsx2csv converts first to csv, making type inference worse
than openpyxl. To remedy that, you can use the extra options defined on
`xlsx2csv_options` and `read_csv_options`
raise_if_empty
When there is no data in the sheet,``NoDataError`` is raised. If this parameter
is set to False, an empty DataFrame (with no columns) is returned instead.
Expand Down Expand Up @@ -155,20 +170,22 @@ def read_excel(
... read_csv_options={"infer_schema_length": None},
... ) # doctest: +SKIP

The ``openpyxl`` engine can also be used to provide automatic type inference.
To do so, specify the right engine (`xlsx2csv_options` and `read_csv_options`
will be ignored):

>>> pl.read_excel(
... source="test.xlsx",
... engine="openpyxl",
... ) # doctest: +SKIP

If :func:`read_excel` does not work or you need to read other types of
spreadsheet files, you can try pandas ``pd.read_excel()``
(supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt`).

>>> pl.from_pandas(pd.read_excel("test.xlsx")) # doctest: +SKIP

"""
try:
import xlsx2csv
except ImportError:
raise ModuleNotFoundError(
"xlsx2csv is not installed\n\nPlease run: `pip install xlsx2csv`"
) from None

if sheet_id is not None and sheet_name is not None:
raise ValueError(
f"cannot specify both `sheet_name` ({sheet_name!r}) and `sheet_id` ({sheet_id!r})"
Expand All @@ -182,27 +199,50 @@ def read_excel(
if read_csv_options is None:
read_csv_options = {}

# convert sheets to csv
parser = xlsx2csv.Xlsx2csv(source, **xlsx2csv_options)
reader_fn: Any # make mypy happy
# do conditional imports
if engine == "openpyxl":
try:
import openpyxl
except ImportError:
raise ImportError(
"openpyxl is not installed\n\nPlease run `pip install openpyxl`"
) from None
parser = openpyxl.load_workbook(source)
sheets = [
{"index": i + 1, "name": sheet.title} for i, sheet in enumerate(parser)
]
reader_fn = _read_excel_sheet_openpyxl
elif engine == "xlsx2csv" or engine is None: # default
try:
import xlsx2csv
except ImportError:
raise ModuleNotFoundError(
"xlsx2csv is not installed\n\nPlease run: `pip install xlsx2csv`"
) from None
# convert sheets to csv
parser = xlsx2csv.Xlsx2csv(source, **xlsx2csv_options)
sheets = parser.workbook.sheets
reader_fn = _read_excel_sheet_xlsx2csv
else:
raise NotImplementedError(f"Cannot find the engine `{engine}`")

if sheet_id == 0:
# read ALL sheets
return {
sheet["name"]: _read_excel_sheet(
sheet["name"]: reader_fn(
parser=parser,
sheet_id=sheet["index"],
sheet_name=None,
read_csv_options=read_csv_options,
raise_if_empty=raise_if_empty,
)
for sheet in parser.workbook.sheets
for sheet in sheets
}
else:
# read a specific sheet by id or name
if sheet_name is None:
sheet_id = sheet_id or 1

return _read_excel_sheet(
return reader_fn(
parser=parser,
sheet_id=sheet_id,
sheet_name=sheet_name,
Expand All @@ -211,7 +251,38 @@ def read_excel(
)


def _read_excel_sheet(
def _read_excel_sheet_openpyxl(
    parser: Any,
    sheet_id: int | None,
    sheet_name: str | None,
    read_csv_options: dict[str, Any] | None,
    raise_if_empty: bool,
) -> pl.DataFrame:
    """
    Read one worksheet of an openpyxl workbook into a DataFrame.

    The first row of the worksheet is used as the column names (each header
    value is converted with ``str``); every following row becomes one record.

    Parameters
    ----------
    parser
        Workbook object; it is indexed by sheet title and its ``worksheets``
        and ``active`` attributes are read.
    sheet_id
        1-based position of the sheet; only consulted when ``sheet_name`` is None.
    sheet_name
        Title of the sheet to read; takes precedence over ``sheet_id``.
    read_csv_options
        Not used by this reader; accepted because ``read_excel`` calls every
        engine-specific reader with the same keyword arguments.
    raise_if_empty
        When True, raise ``NoDataError`` if the resulting frame has no rows.

    Raises
    ------
    NoDataError
        If the sheet yields no data rows and ``raise_if_empty`` is True.

    """
    # read requested sheet if provided on kwargs, otherwise read active sheet
    if sheet_name is not None:
        ws = parser[sheet_name]
    elif sheet_id is not None:
        ws = parser.worksheets[sheet_id - 1]
    else:
        ws = parser.active

    rows_iter = iter(ws.rows)

    # the first row supplies the column names; a sheet without any rows has no
    # header at all, so use a default here rather than letting a bare
    # StopIteration escape and bypass the `raise_if_empty` handling below
    header_row = next(rows_iter, None)
    if header_row is None:
        df = pl.DataFrame()
    else:
        header = [str(cell.value) for cell in header_row]
        df = pl.DataFrame(
            {key: cell.value for key, cell in zip(header, row)} for row in rows_iter
        )

    if raise_if_empty and len(df) == 0:
        raise NoDataError(
            "Empty Excel sheet; if you want to read this as "
            "an empty DataFrame, set `raise_if_empty=False`"
        )
    return df


def _read_excel_sheet_xlsx2csv(
parser: Any,
sheet_id: int | None,
sheet_name: str | None,
Expand Down
2 changes: 2 additions & 0 deletions py-polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ numpy = ["numpy >= 1.16.0"]
fsspec = ["fsspec"]
connectorx = ["connectorx"]
xlsx2csv = ["xlsx2csv >= 0.8.0"]
openpyxl = ["openpyxl >= 3.0.0"]
deltalake = ["deltalake >= 0.10.0"]
timezone = ["backports.zoneinfo; python_version < '3.9'", "tzdata; platform_system == 'Windows'"]
matplotlib = ["matplotlib"]
Expand Down Expand Up @@ -84,6 +85,7 @@ module = [
"sqlalchemy.*",
"xlsx2csv",
"xlsxwriter.*",
"openpyxl",
"zoneinfo",
]
ignore_missing_imports = true
Expand Down
1 change: 1 addition & 0 deletions py-polars/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ backports.zoneinfo; python_version < '3.9'
tzdata; platform_system == 'Windows'
SQLAlchemy
xlsx2csv
openpyxl
XlsxWriter
adbc_driver_sqlite; python_version >= '3.9' and platform_system != 'Windows'
connectorx
Expand Down
52 changes: 51 additions & 1 deletion py-polars/tests/unit/io/test_excel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from datetime import date
from datetime import date, datetime
from io import BytesIO
from typing import TYPE_CHECKING, Any

Expand Down Expand Up @@ -42,6 +42,56 @@ def test_read_excel_all_sheets(excel_file_path: Path) -> None:
assert_frame_equal(df["Sheet2"], expected2)


def test_read_excel_all_sheets_openpyxl(excel_file_path: Path) -> None:
    """With ``sheet_id=0`` the openpyxl engine returns every sheet, keyed by name."""
    frames_by_sheet = pl.read_excel(excel_file_path, sheet_id=0, engine="openpyxl")

    # expected content of each sheet in the fixture workbook
    expected_by_sheet = {
        "Sheet1": pl.DataFrame({"hello": ["Row 1", "Row 2"]}),
        "Sheet2": pl.DataFrame({"world": ["Row 3", "Row 4"]}),
    }

    for sheet_title, expected in expected_by_sheet.items():
        assert_frame_equal(frames_by_sheet[sheet_title], expected)


def test_basic_datatypes_openpyxl_read_excel() -> None:
    """Frame written with ``write_excel`` reads back unchanged through openpyxl."""
    columns = {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "floats": [1.1, 1.2, 1.3, 1.4, 1.5],
        "datetime": [datetime(2023, 1, x) for x in range(1, 6)],
        "nulls": [1, None, None, None, 1],
    }
    written = pl.DataFrame(columns)

    buffer = BytesIO()
    written.write_excel(buffer)

    # check if can be read as it was written
    # we use openpyxl because type inference is better
    via_default = pl.read_excel(buffer, engine="openpyxl")
    via_sheet_id = pl.read_excel(buffer, sheet_id=1, engine="openpyxl")
    via_sheet_name = pl.read_excel(buffer, sheet_name="Sheet1", engine="openpyxl")

    for read_back in (via_default, via_sheet_id, via_sheet_name):
        assert_frame_equal(written, read_back)


def test_write_excel_bytes() -> None:
    """A frame written to an in-memory buffer reads back equal with the default engine."""
    written = pl.DataFrame({"A": [1, 2, 3, 4, 5]})

    buffer = BytesIO()
    written.write_excel(buffer)

    assert_frame_equal(written, pl.read_excel(buffer))


def test_unsupported_engine() -> None:
    """An engine name other than the supported ones raises ``NotImplementedError``."""
    with pytest.raises(NotImplementedError):
        pl.read_excel(source=None, engine="foo")  # type: ignore[call-overload]


def test_read_excel_all_sheets_with_sheet_name(excel_file_path: Path) -> None:
with pytest.raises(
ValueError,
Expand Down