From c20e149b1d80c66204e9e45ed2088fcc6d057d7c Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Mon, 22 Jan 2024 01:09:38 +0400 Subject: [PATCH 1/2] feat(python): add "calamine" support to `read_excel`, using `fastexcel` --- .../reference/lazyframe/descriptive.rst | 1 + py-polars/polars/dataframe/frame.py | 41 +-- py-polars/polars/dependencies.py | 44 ++++ py-polars/polars/io/spreadsheet/functions.py | 241 ++++++++++-------- py-polars/polars/lazyframe/frame.py | 40 ++- py-polars/polars/type_aliases.py | 6 +- py-polars/requirements-dev.txt | 7 +- py-polars/tests/unit/io/test_spreadsheet.py | 175 +++++++++++-- 8 files changed, 406 insertions(+), 149 deletions(-) diff --git a/py-polars/docs/source/reference/lazyframe/descriptive.rst b/py-polars/docs/source/reference/lazyframe/descriptive.rst index 6de20f675f4b..0f05afae8960 100644 --- a/py-polars/docs/source/reference/lazyframe/descriptive.rst +++ b/py-polars/docs/source/reference/lazyframe/descriptive.rst @@ -6,5 +6,6 @@ Descriptive .. autosummary:: :toctree: api/ + LazyFrame.describe LazyFrame.explain LazyFrame.show_graph diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a7c499017a12..9f6279aa2010 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -52,6 +52,7 @@ _check_for_pyarrow, dataframe_api_compat, hvplot, + import_optional, ) from polars.dependencies import numpy as np from polars.dependencies import pandas as pd @@ -3073,15 +3074,8 @@ def write_excel( ... sheet_zoom=125, ... 
) """ # noqa: W505 - try: - import xlsxwriter - from xlsxwriter.utility import xl_cell_to_rowcol - except ImportError: - msg = ( - "Excel export requires xlsxwriter" - "\n\nPlease run: pip install XlsxWriter" - ) - raise ImportError(msg) from None + xlsxwriter = import_optional("xlsxwriter", err_prefix="Excel export requires") + from xlsxwriter.utility import xl_cell_to_rowcol # setup workbook/worksheet wb, ws, can_close = _xl_setup_workbook(workbook, worksheet) @@ -6751,7 +6745,10 @@ def drop_in_place(self, name: str) -> Series: def cast( self, - dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, + dtypes: ( + Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] + | PolarsDataType + ), *, strict: bool = True, ) -> DataFrame: @@ -6792,12 +6789,19 @@ def cast( │ 3.0 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ - Cast all frame columns to the specified dtype: + Cast all frame columns matching one dtype (or dtype group) to another dtype: - >>> df.cast(pl.String).to_dict(as_series=False) - {'foo': ['1', '2', '3'], - 'bar': ['6.0', '7.0', '8.0'], - 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} + >>> df.cast({pl.Date: pl.Datetime}) + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ Use selectors to define the columns being cast: @@ -6813,6 +6817,13 @@ def cast( │ 2 ┆ 7 ┆ 2021-03-04 │ │ 3 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.String).to_dict(as_series=False) + {'foo': ['1', '2', '3'], + 'bar': ['6.0', '7.0', '8.0'], + 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} """ return self.lazy().cast(dtypes, strict=strict).collect(_eager=True) diff --git a/py-polars/polars/dependencies.py 
b/py-polars/polars/dependencies.py index 1dfbe51deea4..d987b36d6ac4 100644 --- a/py-polars/polars/dependencies.py +++ b/py-polars/polars/dependencies.py @@ -229,6 +229,50 @@ def _check_for_pydantic(obj: Any, *, check_type: bool = True) -> bool: ) +def import_optional( + module_name: str, + err_prefix: str = "Required package", + err_suffix: str = "not installed", + min_version: str | tuple[int, ...] | None = None, +) -> Any: + """ + Import an optional dependency, returning the module. + + Parameters + ---------- + module_name : str + Name of the dependency to import. + err_prefix : str, optional + Error prefix to use in the raised exception (appears before the module name). + err_suffix: str, optional + Error suffix to use in the raised exception (follows the module name). + min_version : {str, tuple[int]}, optional + If a minimum module version is required, specify it here. + """ + from polars.exceptions import ModuleUpgradeRequired + from polars.utils.various import parse_version + + try: + module = import_module(module_name) + except ImportError: + prefix = f"{err_prefix.strip(' ')} " if err_prefix else "" + suffix = f" {err_suffix.strip(' ')}" if err_suffix else "" + err_message = ( + f"{prefix}'{module_name}'{suffix}.\n" + f"Please install it using the command `pip install {module_name}`." 
+ ) + raise ImportError(err_message) from None + + if min_version: + min_version = parse_version(min_version) + mod_version = parse_version(module.__version__) + if mod_version < min_version: + msg = f"requires module_name {min_version} or higher, found {mod_version}" + raise ModuleUpgradeRequired(msg) + + return module + + __all__ = [ # lazy-load rarely-used/heavy builtins (for fast startup) "dataclasses", diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index f98ed64afcf4..a7fe63d90d3f 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -1,23 +1,27 @@ from __future__ import annotations import re -from io import StringIO +from contextlib import nullcontext +from datetime import time +from io import BytesIO, StringIO from pathlib import Path +from tempfile import NamedTemporaryFile from typing import TYPE_CHECKING, Any, BinaryIO, Callable, NoReturn, Sequence, overload import polars._reexport as pl from polars import functions as F -from polars.datatypes import Date, Datetime, String +from polars.datatypes import FLOAT_DTYPES, Date, Datetime, Int64, Null, String +from polars.dependencies import import_optional from polars.exceptions import NoDataError, ParameterCollisionError from polars.io._utils import _looks_like_url, _process_file_url from polars.io.csv.functions import read_csv +from polars.utils.deprecation import deprecate_renamed_parameter from polars.utils.various import normalize_filepath if TYPE_CHECKING: - from io import BytesIO from typing import Literal - from polars.type_aliases import SchemaDict + from polars.type_aliases import ExcelSpreadsheetEngine, SchemaDict @overload @@ -26,8 +30,8 @@ def read_excel( *, sheet_id: None = ..., sheet_name: str, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, 
Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -41,8 +45,8 @@ def read_excel( *, sheet_id: None = ..., sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -56,8 +60,8 @@ def read_excel( *, sheet_id: int, sheet_name: str, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -73,8 +77,8 @@ def read_excel( *, sheet_id: Literal[0] | Sequence[int], sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -88,8 +92,8 @@ def read_excel( *, sheet_id: int, sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -103,8 +107,8 @@ def read_excel( *, sheet_id: None, sheet_name: list[str] | tuple[str], - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: 
ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -112,24 +116,27 @@ def read_excel( ... +@deprecate_renamed_parameter("xlsx2csv_options", "engine_options", version="0.20.6") def read_excel( source: str | BytesIO | Path | BinaryIO | bytes, *, sheet_id: int | Sequence[int] | None = None, sheet_name: str | list[str] | tuple[str] | None = None, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = None, - xlsx2csv_options: dict[str, Any] | None = None, + engine: ExcelSpreadsheetEngine | None = None, + engine_options: dict[str, Any] | None = None, read_csv_options: dict[str, Any] | None = None, schema_overrides: SchemaDict | None = None, raise_if_empty: bool = True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: """ - Read Excel (XLSX) spreadsheet data into a DataFrame. + Read Excel spreadsheet data into a DataFrame. + .. versionadded:: 0.20.6 + Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls). .. versionadded:: 0.19.4 - Added support for "pyxlsb" engine for reading Excel Binary Workbooks (.xlsb). + Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb). .. versionadded:: 0.19.3 - Added support for "openpyxl" engine, and added `schema_overrides` parameter. + Added "openpyxl" engine, and added `schema_overrides` parameter. Parameters ---------- @@ -145,11 +152,12 @@ def read_excel( Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more than one is given then a `{sheetname:frame,}` dict is returned. engine - Library used to parse the spreadsheet file; defaults to "xlsx2csv" if not set. + Library used to parse the spreadsheet file; currently defaults to "xlsx2csv" + if not explicitly set. - * "xlsx2csv": the fastest engine; converts the data to an in-memory CSV before - using the native polars `read_csv` method to parse the result. 
You can - pass `xlsx2csv_options` and `read_csv_options` to refine the conversion. + * "xlsx2csv": converts the data to an in-memory CSV before using the native + polars `read_csv` method to parse the result. You can pass `engine_options` + and `read_csv_options` to refine the conversion. * "openpyxl": this engine is significantly slower than `xlsx2csv` but supports additional automatic type inference; potentially useful if you are otherwise unable to parse your sheet with the (default) `xlsx2csv` engine in @@ -157,15 +165,18 @@ def read_excel( * "pyxlsb": this engine is used for Excel Binary Workbooks (`.xlsb` files). Note that you have to use `schema_overrides` to correctly load date/datetime columns (or these will be read as floats representing offset Julian values). + * "calamine": this engine can be used for reading all major types of Excel + Workbook (`.xlsx`, `.xlsb`, `.xls`) and is *dramatically* faster than the + other options, using the `fastexcel` module to bind calamine. - xlsx2csv_options - Extra options passed to `xlsx2csv.Xlsx2csv()`, - e.g. `{"skip_empty_lines": True}` + engine_options + Extra options passed to the underlying engine's Workbook-reading constructor. + For example, if using `xlsx2csv` you could pass `{"skip_empty_lines": True}`. read_csv_options Extra options passed to :func:`read_csv` for parsing the CSV file returned by - `xlsx2csv.Xlsx2csv().convert()` - e.g.: ``{"has_header": False, "new_columns": ["a", "b", "c"], - "infer_schema_length": None}`` + `xlsx2csv.Xlsx2csv().convert()`. This option is *only* applicable when using + the `xlsx2csv` engine. For example, you could pass ``{"has_header": False, + "new_columns": ["a", "b", "c"], "infer_schema_length": None}`` schema_overrides Support type specification or override of one or more columns. raise_if_empty @@ -203,7 +214,7 @@ def read_excel( >>> pl.read_excel( ... source="test.xlsx", ... sheet_id=3, - ... xlsx2csv_options={"skip_empty_lines": True}, + ... 
engine_options={"skip_empty_lines": True}, ... read_csv_options={"has_header": False, "new_columns": ["a", "b", "c"]}, ... ) # doctest: +SKIP @@ -223,7 +234,7 @@ def read_excel( The `openpyxl` package can also be used to parse Excel data; it has slightly better default type detection, but is slower than `xlsx2csv`. If you have a sheet that is better read using this package you can set the engine as "openpyxl" (if you - use this engine then neither `xlsx2csv_options` nor `read_csv_options` can be set). + use this engine then `read_csv_options` cannot be set). >>> pl.read_excel( ... source="test.xlsx", @@ -231,20 +242,16 @@ def read_excel( ... schema_overrides={"dt": pl.Datetime, "value": pl.Int32}, ... ) # doctest: +SKIP """ - if engine and engine != "xlsx2csv": - if xlsx2csv_options: - msg = f"cannot specify `xlsx2csv_options` when engine={engine!r}" - raise ValueError(msg) - if read_csv_options: - msg = f"cannot specify `read_csv_options` when engine={engine!r}" - raise ValueError(msg) + if engine and engine != "xlsx2csv" and read_csv_options: + msg = f"cannot specify `read_csv_options` when engine={engine!r}" + raise ValueError(msg) return _read_spreadsheet( sheet_id, sheet_name, source=source, engine=engine, - engine_options=xlsx2csv_options, + engine_options=engine_options, read_csv_options=read_csv_options, schema_overrides=schema_overrides, raise_if_empty=raise_if_empty, @@ -393,7 +400,7 @@ def _read_spreadsheet( sheet_id: int | Sequence[int] | None, sheet_name: str | list[str] | tuple[str] | None, source: str | BytesIO | Path | BinaryIO | bytes, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"] | None, + engine: ExcelSpreadsheetEngine | Literal["ods"] | None, engine_options: dict[str, Any] | None = None, read_csv_options: dict[str, Any] | None = None, schema_overrides: SchemaDict | None = None, @@ -489,19 +496,13 @@ def _get_sheet_names( def _initialise_spreadsheet_parser( - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"], + engine: str | 
None, source: str | BytesIO | Path | BinaryIO | bytes, engine_options: dict[str, Any], ) -> tuple[Callable[..., pl.DataFrame], Any, list[dict[str, Any]]]: """Instantiate the indicated spreadsheet parser and establish related properties.""" if engine == "xlsx2csv": # default - try: - import xlsx2csv - except ImportError: - msg = ( - "required package not installed" "\n\nPlease run: pip install xlsx2csv" - ) - raise ModuleNotFoundError(msg) from None + xlsx2csv = import_optional("xlsx2csv") # establish sensible defaults for unset options for option, value in { @@ -517,23 +518,34 @@ def _initialise_spreadsheet_parser( return _read_spreadsheet_xlsx2csv, parser, sheets elif engine == "openpyxl": - try: - import openpyxl - except ImportError: - msg = ( - "required package not installed" "\n\nPlease run: pip install openpyxl" - ) - raise ImportError(msg) from None + openpyxl = import_optional("openpyxl") parser = openpyxl.load_workbook(source, data_only=True, **engine_options) sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)] return _read_spreadsheet_openpyxl, parser, sheets + elif engine == "calamine": + # note: can't read directly from bytes (yet) so + if read_bytesio := isinstance(source, BytesIO): + temp_data = NamedTemporaryFile(delete=True) + with nullcontext() if not read_bytesio else temp_data as xldata: # type: ignore[attr-defined] + if read_bytesio: + xldata.write(source.getvalue()) # type: ignore[union-attr] + xldata = xldata.file.name + else: + xldata = source + + if not Path(xldata).exists(): + raise FileNotFoundError(xldata) + + fxl = import_optional("fastexcel", min_version="0.7.0") + parser = fxl.read_excel(xldata, **engine_options) + sheets = [ + {"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names) + ] + return _read_spreadsheet_calamine, parser, sheets + elif engine == "pyxlsb": - try: - import pyxlsb - except ImportError: - msg = "required package not installed" "\n\nPlease run: pip install pyxlsb" - raise 
ImportError(msg) from None + pyxlsb = import_optional("pyxlsb") try: parser = pyxlsb.open_workbook(source, **engine_options) except KeyError as err: @@ -547,14 +559,7 @@ def _initialise_spreadsheet_parser( return _read_spreadsheet_pyxlsb, parser, sheets elif engine == "ods": - try: - import ezodf - except ImportError: - msg = ( - "required package not installed" - "\n\nPlease run: pip install ezodf lxml" - ) - raise ImportError(msg) from None + ezodf = import_optional("ezodf") parser = ezodf.opendoc(source, **engine_options) sheets = [ {"index": i + 1, "name": ws.name} for i, ws in enumerate(parser.sheets) @@ -602,21 +607,33 @@ def _csv_buffer_to_frame( separator=separator, **read_csv_options, ) - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) -def _drop_unnamed_null_columns(df: pl.DataFrame) -> pl.DataFrame: - """If DataFrame contains unnamed columns that contain only nulls, drop them.""" +def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame: + """If DataFrame contains columns/rows that contain only nulls, drop them.""" null_cols = [] for col_name in df.columns: - # note that if multiple unnamed columns are found then all but - # the first one will be ones will be named as "_duplicated_{n}" - if col_name == "" or re.match(r"_duplicated_\d+$", col_name): - if df[col_name].null_count() == len(df): + # note that if multiple unnamed columns are found then all but the first one + # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) + if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name): + col = df[col_name] + if col.dtype == Null or col.null_count() == len(df): null_cols.append(col_name) if null_cols: df = df.drop(*null_cols) - return df + + if len(df) == 0 and len(df.columns) == 0: + if not raise_if_empty: + return df + else: + msg = ( + "empty Excel sheet" + "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." 
+ ) + raise NoDataError(msg) + + return df.filter(~F.all_horizontal(F.all().is_null())) def _read_spreadsheet_ods( @@ -671,13 +688,6 @@ def _read_spreadsheet_ods( schema_overrides=overrides, ) - if raise_if_empty and len(df) == 0 and len(df.columns) == 0: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) - if strptime_cols: df = df.with_columns( ( @@ -689,8 +699,9 @@ def _read_spreadsheet_ods( ) for nm, dtype in strptime_cols.items() ) + df.columns = headers - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) def _read_spreadsheet_openpyxl( @@ -738,13 +749,49 @@ def _read_spreadsheet_openpyxl( {s.name: s for s in series_data}, schema_overrides=schema_overrides, ) - if raise_if_empty and len(df) == 0 and len(df.columns) == 0: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) + + +def _read_spreadsheet_calamine( + parser: Any, + sheet_name: str | None, + read_csv_options: dict[str, Any] | None, + schema_overrides: SchemaDict | None, + *, + raise_if_empty: bool, +) -> pl.DataFrame: + ws = parser.load_sheet_by_name(sheet_name) + df = ws.to_polars() + + if schema_overrides: + df = df.cast(dtypes=schema_overrides) + + df = _drop_null_data(df, raise_if_empty=raise_if_empty) + + # calamine may read integer data as float; cast back to int where possible. + # do a similar downcast check for datetime -> date dtypes. 
+ type_checks = [] + for c, dtype in df.schema.items(): + if dtype in FLOAT_DTYPES: + check_cast = [F.col(c).floor().eq_missing(F.col(c)), F.col(c).cast(Int64)] + type_checks.append(check_cast) + elif dtype == Datetime: + check_cast = [ + F.col(c).drop_nulls().dt.time().eq_missing(time(0, 0, 0)), + F.col(c).cast(Date), + ] + type_checks.append(check_cast) + + if type_checks: + apply_downcast = df.select([d[0] for d in type_checks]).row(0) + + # do a similar check for datetime columns that have only 00:00:00 times. + if downcast := [ + cast for apply, (_, cast) in zip(apply_downcast, type_checks) if apply + ]: + df = df.with_columns(*downcast) + + return df def _read_spreadsheet_pyxlsb( @@ -799,13 +846,7 @@ def _read_spreadsheet_pyxlsb( {s.name: s for s in series_data}, schema_overrides=schema_overrides, ) - if raise_if_empty and len(df) == 0 and len(df.columns) == 0: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) def _read_spreadsheet_xlsx2csv( diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 38baa6637ed1..7957ffd35d23 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -30,6 +30,7 @@ N_INFER_DEFAULT, Boolean, Categorical, + DataTypeGroup, Date, Datetime, Duration, @@ -49,6 +50,7 @@ UInt32, UInt64, Unknown, + is_polars_dtype, py_type_to_dtype, ) from polars.dependencies import dataframe_api_compat, subprocess @@ -58,7 +60,7 @@ from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec from polars.lazyframe.group_by import LazyGroupBy from polars.lazyframe.in_process import InProcessQuery -from polars.selectors import _expand_selectors, expand_selector +from polars.selectors import _expand_selectors, by_dtype, expand_selector from polars.slice import LazyPolarsSlice from 
polars.utils._async import _AioDataFrameResult, _GeventDataFrameResult from polars.utils._parse_expr_input import ( @@ -2600,7 +2602,10 @@ def cache(self) -> Self: def cast( self, - dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, + dtypes: ( + Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] + | PolarsDataType + ), *, strict: bool = True, ) -> Self: @@ -2641,12 +2646,19 @@ def cast( │ 3.0 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ - Cast all frame columns to the specified dtype: + Cast all frame columns matching one dtype (or dtype group) to another dtype: - >>> lf.cast(pl.String).collect().to_dict(as_series=False) - {'foo': ['1', '2', '3'], - 'bar': ['6.0', '7.0', '8.0'], - 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} + >>> lf.cast({pl.Date: pl.Datetime}).collect() + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ Use selectors to define the columns being cast: @@ -2662,17 +2674,29 @@ def cast( │ 2 ┆ 7 ┆ 2021-03-04 │ │ 3 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.String).collect().to_dict(as_series=False) + {'foo': ['1', '2', '3'], + 'bar': ['6.0', '7.0', '8.0'], + 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} """ if not isinstance(dtypes, Mapping): return self._from_pyldf(self._ldf.cast_all(dtypes, strict)) cast_map = {} for c, dtype in dtypes.items(): + if (is_polars_dtype(c) or isinstance(c, DataTypeGroup)) or ( + isinstance(c, Collection) and all(is_polars_dtype(x) for x in c) + ): + c = by_dtype(c) # type: ignore[arg-type] + dtype = py_type_to_dtype(dtype) cast_map.update( {c: dtype} if isinstance(c, str) - else {x: dtype for x in expand_selector(self, c)} + else {x: dtype for x 
in expand_selector(self, c)} # type: ignore[arg-type] ) return self._from_pyldf(self._ldf.cast(cast_map, strict)) diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 26785724f4f6..eee5e670d8a6 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -208,9 +208,13 @@ # typevars for core polars types PolarsType = TypeVar("PolarsType", "DataFrame", "LazyFrame", "Series", "Expr") FrameType = TypeVar("FrameType", "DataFrame", "LazyFrame") - BufferInfo: TypeAlias = Tuple[int, int, int] +# type alias for supported spreadsheet engines +ExcelSpreadsheetEngine: TypeAlias = Literal[ + "xlsx2csv", "openpyxl", "calamine", "pyxlsb" +] + class SeriesBuffers(TypedDict): """Underlying buffers of a Series.""" diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index d97002186a6b..c7a0592bf09b 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -19,7 +19,7 @@ patchelf; platform_system == 'Linux' # Extra dependency for maturin, only for L numpy pandas pyarrow -pydantic >= 2.0.0 +pydantic>=2.0.0 # Datetime / time zones backports.zoneinfo; python_version < '3.9' tzdata; platform_system == 'Windows' @@ -37,14 +37,15 @@ s3fs[boto3] # Spreadsheet ezodf lxml +fastexcel>=0.7.0; platform_system != 'Windows' openpyxl pyxlsb xlsx2csv XlsxWriter deltalake>=0.14.0 # Dataframe interchange protocol -dataframe-api-compat >= 0.1.6 -pyiceberg >= 0.5.0 +dataframe-api-compat>=0.1.6 +pyiceberg>=0.5.0 # Csv zstandard # Plotting diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 6886916fa72a..91ee0d634145 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -1,10 +1,11 @@ from __future__ import annotations +import sys import warnings from collections import OrderedDict from datetime import date, datetime from io import BytesIO -from typing import TYPE_CHECKING, Any, 
Callable, Literal +from typing import TYPE_CHECKING, Any, Callable import pytest @@ -16,7 +17,7 @@ if TYPE_CHECKING: from pathlib import Path - from polars.type_aliases import SchemaDict, SelectorType + from polars.type_aliases import ExcelSpreadsheetEngine, SchemaDict, SelectorType pytestmark = pytest.mark.slow() @@ -69,9 +70,26 @@ def path_ods_mixed(io_files_path: Path) -> Path: @pytest.mark.parametrize( ("read_spreadsheet", "source", "engine_params"), [ + # xlsx file (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -100,9 +118,26 @@ def test_read_spreadsheet( @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ + # xlsx file (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -138,9 +173,26 @@ def test_read_excel_multi_sheets( @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ + # xlsx file (pl.read_excel, 
"path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -179,11 +231,18 @@ def test_read_excel_all_sheets( ("engine", "schema_overrides"), [ ("xlsx2csv", {"datetime": pl.Datetime}), + pytest.param( + *("calamine", None), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), ("openpyxl", None), ], ) def test_read_excel_basic_datatypes( - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], + engine: ExcelSpreadsheetEngine, schema_overrides: SchemaDict | None, ) -> None: df = pl.DataFrame( @@ -213,9 +272,26 @@ def test_read_excel_basic_datatypes( @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ + # xlsx file (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -297,9 +373,22 @@ def test_read_mixed_dtype_columns( ) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def 
test_write_excel_bytes(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> None: - df = pl.DataFrame({"A": [1, 2, 3, 4, 5]}) +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + "calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) +def test_write_excel_bytes(engine: ExcelSpreadsheetEngine) -> None: + df = pl.DataFrame({"A": [1.5, -2, 0, 3.0, -4.5, 5.0]}) excel_bytes = BytesIO() df.write_excel(excel_bytes) @@ -403,7 +492,20 @@ def test_unsupported_binary_workbook(path_xlsx: Path, path_xlsb: Path) -> None: pl.read_excel(path_xlsb, engine="openpyxl") -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + "calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None: with pytest.raises( ValueError, @@ -548,9 +650,22 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None: assert_frame_equal(df, xldf) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + "calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) def test_excel_compound_types( - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], + engine: ExcelSpreadsheetEngine, ) -> None: df = pl.DataFrame( {"x": [[1, 2], [3, 4], [5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]} @@ -567,8 +682,21 @@ def test_excel_compound_types( ] -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> None: +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + 
"calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) +def test_excel_sparklines(engine: ExcelSpreadsheetEngine) -> None: from xlsxwriter import Workbook # note that we don't (quite) expect sparkline export to round-trip as we @@ -581,7 +709,7 @@ def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> "q3": [-50, 0, 40, 80, 80], "q4": [75, 55, 25, -10, -55], } - ) + ).cast(dtypes={pl.Int64: pl.Float64}) # also: confirm that we can use a Workbook directly with "write_excel" xls = BytesIO() @@ -637,10 +765,12 @@ def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> # └─────┴──────┴─────┴─────┴─────┴─────┴───────┴─────┴─────┘ for sparkline_col in ("+/-", "trend"): - assert set(xldf[sparkline_col]) == {None} + assert set(xldf[sparkline_col]) in ({None}, {""}) assert xldf.columns == ["id", "+/-", "q1", "q2", "q3", "q4", "trend", "h1", "h2"] - assert_frame_equal(df, xldf.drop("+/-", "trend", "h1", "h2")) + assert_frame_equal( + df, xldf.drop("+/-", "trend", "h1", "h2").cast(dtypes={pl.Int64: pl.Float64}) + ) def test_excel_write_multiple_tables() -> None: @@ -733,13 +863,20 @@ def test_excel_empty_sheet( [ ("xlsx2csv", ["a"]), ("openpyxl", ["a", "b"]), + pytest.param( + *("calamine", ["a", "b"]), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), ("xlsx2csv", cs.numeric()), ("openpyxl", cs.last()), ], ) def test_excel_hidden_columns( hidden_columns: list[str] | SelectorType, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], + engine: ExcelSpreadsheetEngine, ) -> None: df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]}) @@ -751,16 +888,10 @@ def test_excel_hidden_columns( def test_invalid_engine_options() -> None: + # read_csv_options only applicable with 'xlsx2csv' engine with pytest.raises(ValueError, match="cannot specify `read_csv_options`"): pl.read_excel( "", 
engine="openpyxl", read_csv_options={"sep": "\t"}, ) - - with pytest.raises(ValueError, match="cannot specify `xlsx2csv_options`"): - pl.read_excel( - "", - engine="openpyxl", - xlsx2csv_options={"skip_empty_lines": True}, - ) From ef7da424e746e3b488759744e1f3498577620057 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Fri, 26 Jan 2024 10:05:48 +0400 Subject: [PATCH 2/2] fix issue with py3.8 --- .../source/reference/lazyframe/descriptive.rst | 1 - py-polars/polars/io/spreadsheet/functions.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/py-polars/docs/source/reference/lazyframe/descriptive.rst b/py-polars/docs/source/reference/lazyframe/descriptive.rst index 0f05afae8960..6de20f675f4b 100644 --- a/py-polars/docs/source/reference/lazyframe/descriptive.rst +++ b/py-polars/docs/source/reference/lazyframe/descriptive.rst @@ -6,6 +6,5 @@ Descriptive .. autosummary:: :toctree: api/ - LazyFrame.describe LazyFrame.explain LazyFrame.show_graph diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index a7fe63d90d3f..c731a207a94e 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -527,18 +527,16 @@ def _initialise_spreadsheet_parser( # note: can't read directly from bytes (yet) so if read_bytesio := isinstance(source, BytesIO): temp_data = NamedTemporaryFile(delete=True) - with nullcontext() if not read_bytesio else temp_data as xldata: # type: ignore[attr-defined] + with nullcontext() if not read_bytesio else temp_data as tmp: # type: ignore[attr-defined] if read_bytesio: - xldata.write(source.getvalue()) # type: ignore[union-attr] - xldata = xldata.file.name - else: - xldata = source + tmp.write(source.getvalue()) # type: ignore[union-attr] + source = temp_data.name - if not Path(xldata).exists(): - raise FileNotFoundError(xldata) + if not Path(source).exists(): # type: ignore[arg-type] + raise FileNotFoundError(source) 
fxl = import_optional("fastexcel", min_version="0.7.0") - parser = fxl.read_excel(xldata, **engine_options) + parser = fxl.read_excel(source, **engine_options) sheets = [ {"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names) ]