From c20e149b1d80c66204e9e45ed2088fcc6d057d7c Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Mon, 22 Jan 2024 01:09:38 +0400 Subject: [PATCH 1/2] feat(python): add "calamine" support to `read_excel`, using `fastexcel` --- .../reference/lazyframe/descriptive.rst | 1 + py-polars/polars/dataframe/frame.py | 41 +-- py-polars/polars/dependencies.py | 44 ++++ py-polars/polars/io/spreadsheet/functions.py | 241 ++++++++++-------- py-polars/polars/lazyframe/frame.py | 40 ++- py-polars/polars/type_aliases.py | 6 +- py-polars/requirements-dev.txt | 7 +- py-polars/tests/unit/io/test_spreadsheet.py | 175 +++++++++++-- 8 files changed, 406 insertions(+), 149 deletions(-) diff --git a/py-polars/docs/source/reference/lazyframe/descriptive.rst b/py-polars/docs/source/reference/lazyframe/descriptive.rst index 6de20f675f4b..0f05afae8960 100644 --- a/py-polars/docs/source/reference/lazyframe/descriptive.rst +++ b/py-polars/docs/source/reference/lazyframe/descriptive.rst @@ -6,5 +6,6 @@ Descriptive .. autosummary:: :toctree: api/ + LazyFrame.describe LazyFrame.explain LazyFrame.show_graph diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a7c499017a12..9f6279aa2010 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -52,6 +52,7 @@ _check_for_pyarrow, dataframe_api_compat, hvplot, + import_optional, ) from polars.dependencies import numpy as np from polars.dependencies import pandas as pd @@ -3073,15 +3074,8 @@ def write_excel( ... sheet_zoom=125, ... 
) """ # noqa: W505 - try: - import xlsxwriter - from xlsxwriter.utility import xl_cell_to_rowcol - except ImportError: - msg = ( - "Excel export requires xlsxwriter" - "\n\nPlease run: pip install XlsxWriter" - ) - raise ImportError(msg) from None + xlsxwriter = import_optional("xlsxwriter", err_prefix="Excel export requires") + from xlsxwriter.utility import xl_cell_to_rowcol # setup workbook/worksheet wb, ws, can_close = _xl_setup_workbook(workbook, worksheet) @@ -6751,7 +6745,10 @@ def drop_in_place(self, name: str) -> Series: def cast( self, - dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, + dtypes: ( + Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] + | PolarsDataType + ), *, strict: bool = True, ) -> DataFrame: @@ -6792,12 +6789,19 @@ def cast( │ 3.0 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ - Cast all frame columns to the specified dtype: + Cast all frame columns matching one dtype (or dtype group) to another dtype: - >>> df.cast(pl.String).to_dict(as_series=False) - {'foo': ['1', '2', '3'], - 'bar': ['6.0', '7.0', '8.0'], - 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} + >>> df.cast({pl.Date: pl.Datetime}) + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ Use selectors to define the columns being cast: @@ -6813,6 +6817,13 @@ def cast( │ 2 ┆ 7 ┆ 2021-03-04 │ │ 3 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> df.cast(pl.String).to_dict(as_series=False) + {'foo': ['1', '2', '3'], + 'bar': ['6.0', '7.0', '8.0'], + 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} """ return self.lazy().cast(dtypes, strict=strict).collect(_eager=True) diff --git a/py-polars/polars/dependencies.py 
b/py-polars/polars/dependencies.py index 1dfbe51deea4..d987b36d6ac4 100644 --- a/py-polars/polars/dependencies.py +++ b/py-polars/polars/dependencies.py @@ -229,6 +229,50 @@ def _check_for_pydantic(obj: Any, *, check_type: bool = True) -> bool: ) +def import_optional( + module_name: str, + err_prefix: str = "Required package", + err_suffix: str = "not installed", + min_version: str | tuple[int, ...] | None = None, +) -> Any: + """ + Import an optional dependency, returning the module. + + Parameters + ---------- + module_name : str + Name of the dependency to import. + err_prefix : str, optional + Error prefix to use in the raised exception (appears before the module name). + err_suffix: str, optional + Error suffix to use in the raised exception (follows the module name). + min_version : {str, tuple[int]}, optional + If a minimum module version is required, specify it here. + """ + from polars.exceptions import ModuleUpgradeRequired + from polars.utils.various import parse_version + + try: + module = import_module(module_name) + except ImportError: + prefix = f"{err_prefix.strip(' ')} " if err_prefix else "" + suffix = f" {err_suffix.strip(' ')}" if err_suffix else "" + err_message = ( + f"{prefix}'{module_name}'{suffix}.\n" + f"Please install it using the command `pip install {module_name}`." 
+ ) + raise ImportError(err_message) from None + + if min_version: + min_version = parse_version(min_version) + mod_version = parse_version(module.__version__) + if mod_version < min_version: + msg = f"requires module_name {min_version} or higher, found {mod_version}" + raise ModuleUpgradeRequired(msg) + + return module + + __all__ = [ # lazy-load rarely-used/heavy builtins (for fast startup) "dataclasses", diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index f98ed64afcf4..a7fe63d90d3f 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -1,23 +1,27 @@ from __future__ import annotations import re -from io import StringIO +from contextlib import nullcontext +from datetime import time +from io import BytesIO, StringIO from pathlib import Path +from tempfile import NamedTemporaryFile from typing import TYPE_CHECKING, Any, BinaryIO, Callable, NoReturn, Sequence, overload import polars._reexport as pl from polars import functions as F -from polars.datatypes import Date, Datetime, String +from polars.datatypes import FLOAT_DTYPES, Date, Datetime, Int64, Null, String +from polars.dependencies import import_optional from polars.exceptions import NoDataError, ParameterCollisionError from polars.io._utils import _looks_like_url, _process_file_url from polars.io.csv.functions import read_csv +from polars.utils.deprecation import deprecate_renamed_parameter from polars.utils.various import normalize_filepath if TYPE_CHECKING: - from io import BytesIO from typing import Literal - from polars.type_aliases import SchemaDict + from polars.type_aliases import ExcelSpreadsheetEngine, SchemaDict @overload @@ -26,8 +30,8 @@ def read_excel( *, sheet_id: None = ..., sheet_name: str, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, 
Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -41,8 +45,8 @@ def read_excel( *, sheet_id: None = ..., sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -56,8 +60,8 @@ def read_excel( *, sheet_id: int, sheet_name: str, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -73,8 +77,8 @@ def read_excel( *, sheet_id: Literal[0] | Sequence[int], sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -88,8 +92,8 @@ def read_excel( *, sheet_id: int, sheet_name: None = ..., - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -103,8 +107,8 @@ def read_excel( *, sheet_id: None, sheet_name: list[str] | tuple[str], - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = ..., - xlsx2csv_options: dict[str, Any] | None = ..., + engine: 
ExcelSpreadsheetEngine | None = ..., + engine_options: dict[str, Any] | None = ..., read_csv_options: dict[str, Any] | None = ..., schema_overrides: SchemaDict | None = ..., raise_if_empty: bool = ..., @@ -112,24 +116,27 @@ def read_excel( ... +@deprecate_renamed_parameter("xlsx2csv_options", "engine_options", version="0.20.6") def read_excel( source: str | BytesIO | Path | BinaryIO | bytes, *, sheet_id: int | Sequence[int] | None = None, sheet_name: str | list[str] | tuple[str] | None = None, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"] | None = None, - xlsx2csv_options: dict[str, Any] | None = None, + engine: ExcelSpreadsheetEngine | None = None, + engine_options: dict[str, Any] | None = None, read_csv_options: dict[str, Any] | None = None, schema_overrides: SchemaDict | None = None, raise_if_empty: bool = True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: """ - Read Excel (XLSX) spreadsheet data into a DataFrame. + Read Excel spreadsheet data into a DataFrame. + .. versionadded:: 0.20.6 + Added "calamine" fastexcel engine for Excel Workbooks (.xlsx, .xlsb, .xls). .. versionadded:: 0.19.4 - Added support for "pyxlsb" engine for reading Excel Binary Workbooks (.xlsb). + Added "pyxlsb" engine for Excel Binary Workbooks (.xlsb). .. versionadded:: 0.19.3 - Added support for "openpyxl" engine, and added `schema_overrides` parameter. + Added "openpyxl" engine, and added `schema_overrides` parameter. Parameters ---------- @@ -145,11 +152,12 @@ def read_excel( Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more than one is given then a `{sheetname:frame,}` dict is returned. engine - Library used to parse the spreadsheet file; defaults to "xlsx2csv" if not set. + Library used to parse the spreadsheet file; currently defaults to "xlsx2csv" + if not explicitly set. - * "xlsx2csv": the fastest engine; converts the data to an in-memory CSV before - using the native polars `read_csv` method to parse the result. 
You can - pass `xlsx2csv_options` and `read_csv_options` to refine the conversion. + * "xlsx2csv": converts the data to an in-memory CSV before using the native + polars `read_csv` method to parse the result. You can pass `engine_options` + and `read_csv_options` to refine the conversion. * "openpyxl": this engine is significantly slower than `xlsx2csv` but supports additional automatic type inference; potentially useful if you are otherwise unable to parse your sheet with the (default) `xlsx2csv` engine in @@ -157,15 +165,18 @@ def read_excel( * "pyxlsb": this engine is used for Excel Binary Workbooks (`.xlsb` files). Note that you have to use `schema_overrides` to correctly load date/datetime columns (or these will be read as floats representing offset Julian values). + * "calamine": this engine can be used for reading all major types of Excel + Workbook (`.xlsx`, `.xlsb`, `.xls`) and is *dramatically* faster than the + other options, using the `fastexcel` module to bind calamine. - xlsx2csv_options - Extra options passed to `xlsx2csv.Xlsx2csv()`, - e.g. `{"skip_empty_lines": True}` + engine_options + Extra options passed to the underlying engine's Workbook-reading constructor. + For example, if using `xlsx2csv` you could pass `{"skip_empty_lines": True}`. read_csv_options Extra options passed to :func:`read_csv` for parsing the CSV file returned by - `xlsx2csv.Xlsx2csv().convert()` - e.g.: ``{"has_header": False, "new_columns": ["a", "b", "c"], - "infer_schema_length": None}`` + `xlsx2csv.Xlsx2csv().convert()`. This option is *only* applicable when using + the `xlsx2csv` engine. For example, you could pass ``{"has_header": False, + "new_columns": ["a", "b", "c"], "infer_schema_length": None}`` schema_overrides Support type specification or override of one or more columns. raise_if_empty @@ -203,7 +214,7 @@ def read_excel( >>> pl.read_excel( ... source="test.xlsx", ... sheet_id=3, - ... xlsx2csv_options={"skip_empty_lines": True}, + ... 
engine_options={"skip_empty_lines": True}, ... read_csv_options={"has_header": False, "new_columns": ["a", "b", "c"]}, ... ) # doctest: +SKIP @@ -223,7 +234,7 @@ def read_excel( The `openpyxl` package can also be used to parse Excel data; it has slightly better default type detection, but is slower than `xlsx2csv`. If you have a sheet that is better read using this package you can set the engine as "openpyxl" (if you - use this engine then neither `xlsx2csv_options` nor `read_csv_options` can be set). + use this engine then `read_csv_options` cannot be set). >>> pl.read_excel( ... source="test.xlsx", @@ -231,20 +242,16 @@ def read_excel( ... schema_overrides={"dt": pl.Datetime, "value": pl.Int32}, ... ) # doctest: +SKIP """ - if engine and engine != "xlsx2csv": - if xlsx2csv_options: - msg = f"cannot specify `xlsx2csv_options` when engine={engine!r}" - raise ValueError(msg) - if read_csv_options: - msg = f"cannot specify `read_csv_options` when engine={engine!r}" - raise ValueError(msg) + if engine and engine != "xlsx2csv" and read_csv_options: + msg = f"cannot specify `read_csv_options` when engine={engine!r}" + raise ValueError(msg) return _read_spreadsheet( sheet_id, sheet_name, source=source, engine=engine, - engine_options=xlsx2csv_options, + engine_options=engine_options, read_csv_options=read_csv_options, schema_overrides=schema_overrides, raise_if_empty=raise_if_empty, @@ -393,7 +400,7 @@ def _read_spreadsheet( sheet_id: int | Sequence[int] | None, sheet_name: str | list[str] | tuple[str] | None, source: str | BytesIO | Path | BinaryIO | bytes, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"] | None, + engine: ExcelSpreadsheetEngine | Literal["ods"] | None, engine_options: dict[str, Any] | None = None, read_csv_options: dict[str, Any] | None = None, schema_overrides: SchemaDict | None = None, @@ -489,19 +496,13 @@ def _get_sheet_names( def _initialise_spreadsheet_parser( - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb", "ods"], + engine: str | 
None, source: str | BytesIO | Path | BinaryIO | bytes, engine_options: dict[str, Any], ) -> tuple[Callable[..., pl.DataFrame], Any, list[dict[str, Any]]]: """Instantiate the indicated spreadsheet parser and establish related properties.""" if engine == "xlsx2csv": # default - try: - import xlsx2csv - except ImportError: - msg = ( - "required package not installed" "\n\nPlease run: pip install xlsx2csv" - ) - raise ModuleNotFoundError(msg) from None + xlsx2csv = import_optional("xlsx2csv") # establish sensible defaults for unset options for option, value in { @@ -517,23 +518,34 @@ def _initialise_spreadsheet_parser( return _read_spreadsheet_xlsx2csv, parser, sheets elif engine == "openpyxl": - try: - import openpyxl - except ImportError: - msg = ( - "required package not installed" "\n\nPlease run: pip install openpyxl" - ) - raise ImportError(msg) from None + openpyxl = import_optional("openpyxl") parser = openpyxl.load_workbook(source, data_only=True, **engine_options) sheets = [{"index": i + 1, "name": ws.title} for i, ws in enumerate(parser)] return _read_spreadsheet_openpyxl, parser, sheets + elif engine == "calamine": + # note: can't read directly from bytes (yet) so + if read_bytesio := isinstance(source, BytesIO): + temp_data = NamedTemporaryFile(delete=True) + with nullcontext() if not read_bytesio else temp_data as xldata: # type: ignore[attr-defined] + if read_bytesio: + xldata.write(source.getvalue()) # type: ignore[union-attr] + xldata = xldata.file.name + else: + xldata = source + + if not Path(xldata).exists(): + raise FileNotFoundError(xldata) + + fxl = import_optional("fastexcel", min_version="0.7.0") + parser = fxl.read_excel(xldata, **engine_options) + sheets = [ + {"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names) + ] + return _read_spreadsheet_calamine, parser, sheets + elif engine == "pyxlsb": - try: - import pyxlsb - except ImportError: - msg = "required package not installed" "\n\nPlease run: pip install pyxlsb" - raise 
ImportError(msg) from None + pyxlsb = import_optional("pyxlsb") try: parser = pyxlsb.open_workbook(source, **engine_options) except KeyError as err: @@ -547,14 +559,7 @@ def _initialise_spreadsheet_parser( return _read_spreadsheet_pyxlsb, parser, sheets elif engine == "ods": - try: - import ezodf - except ImportError: - msg = ( - "required package not installed" - "\n\nPlease run: pip install ezodf lxml" - ) - raise ImportError(msg) from None + ezodf = import_optional("ezodf") parser = ezodf.opendoc(source, **engine_options) sheets = [ {"index": i + 1, "name": ws.name} for i, ws in enumerate(parser.sheets) @@ -602,21 +607,33 @@ def _csv_buffer_to_frame( separator=separator, **read_csv_options, ) - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) -def _drop_unnamed_null_columns(df: pl.DataFrame) -> pl.DataFrame: - """If DataFrame contains unnamed columns that contain only nulls, drop them.""" +def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame: + """If DataFrame contains columns/rows that contain only nulls, drop them.""" null_cols = [] for col_name in df.columns: - # note that if multiple unnamed columns are found then all but - # the first one will be ones will be named as "_duplicated_{n}" - if col_name == "" or re.match(r"_duplicated_\d+$", col_name): - if df[col_name].null_count() == len(df): + # note that if multiple unnamed columns are found then all but the first one + # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) + if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name): + col = df[col_name] + if col.dtype == Null or col.null_count() == len(df): null_cols.append(col_name) if null_cols: df = df.drop(*null_cols) - return df + + if len(df) == 0 and len(df.columns) == 0: + if not raise_if_empty: + return df + else: + msg = ( + "empty Excel sheet" + "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." 
+ ) + raise NoDataError(msg) + + return df.filter(~F.all_horizontal(F.all().is_null())) def _read_spreadsheet_ods( @@ -671,13 +688,6 @@ def _read_spreadsheet_ods( schema_overrides=overrides, ) - if raise_if_empty and len(df) == 0 and len(df.columns) == 0: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) - if strptime_cols: df = df.with_columns( ( @@ -689,8 +699,9 @@ def _read_spreadsheet_ods( ) for nm, dtype in strptime_cols.items() ) + df.columns = headers - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) def _read_spreadsheet_openpyxl( @@ -738,13 +749,49 @@ def _read_spreadsheet_openpyxl( {s.name: s for s in series_data}, schema_overrides=schema_overrides, ) - if raise_if_empty and len(df) == 0 and len(df.columns) == 0: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) + + +def _read_spreadsheet_calamine( + parser: Any, + sheet_name: str | None, + read_csv_options: dict[str, Any] | None, + schema_overrides: SchemaDict | None, + *, + raise_if_empty: bool, +) -> pl.DataFrame: + ws = parser.load_sheet_by_name(sheet_name) + df = ws.to_polars() + + if schema_overrides: + df = df.cast(dtypes=schema_overrides) + + df = _drop_null_data(df, raise_if_empty=raise_if_empty) + + # calamine may read integer data as float; cast back to int where possible. + # do a similar downcast check for datetime -> date dtypes. 
+ type_checks = [] + for c, dtype in df.schema.items(): + if dtype in FLOAT_DTYPES: + check_cast = [F.col(c).floor().eq_missing(F.col(c)), F.col(c).cast(Int64)] + type_checks.append(check_cast) + elif dtype == Datetime: + check_cast = [ + F.col(c).drop_nulls().dt.time().eq_missing(time(0, 0, 0)), + F.col(c).cast(Date), + ] + type_checks.append(check_cast) + + if type_checks: + apply_downcast = df.select([d[0] for d in type_checks]).row(0) + + # do a similar check for datetime columns that have only 00:00:00 times. + if downcast := [ + cast for apply, (_, cast) in zip(apply_downcast, type_checks) if apply + ]: + df = df.with_columns(*downcast) + + return df def _read_spreadsheet_pyxlsb( @@ -799,13 +846,7 @@ def _read_spreadsheet_pyxlsb( {s.name: s for s in series_data}, schema_overrides=schema_overrides, ) - if raise_if_empty and len(df) == 0 and len(df.columns) == 0: - msg = ( - "empty Excel sheet" - "\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`." - ) - raise NoDataError(msg) - return _drop_unnamed_null_columns(df) + return _drop_null_data(df, raise_if_empty=raise_if_empty) def _read_spreadsheet_xlsx2csv( diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 38baa6637ed1..7957ffd35d23 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -30,6 +30,7 @@ N_INFER_DEFAULT, Boolean, Categorical, + DataTypeGroup, Date, Datetime, Duration, @@ -49,6 +50,7 @@ UInt32, UInt64, Unknown, + is_polars_dtype, py_type_to_dtype, ) from polars.dependencies import dataframe_api_compat, subprocess @@ -58,7 +60,7 @@ from polars.io.parquet.anonymous_scan import _scan_parquet_fsspec from polars.lazyframe.group_by import LazyGroupBy from polars.lazyframe.in_process import InProcessQuery -from polars.selectors import _expand_selectors, expand_selector +from polars.selectors import _expand_selectors, by_dtype, expand_selector from polars.slice import LazyPolarsSlice from 
polars.utils._async import _AioDataFrameResult, _GeventDataFrameResult from polars.utils._parse_expr_input import ( @@ -2600,7 +2602,10 @@ def cache(self) -> Self: def cast( self, - dtypes: Mapping[ColumnNameOrSelector, PolarsDataType] | PolarsDataType, + dtypes: ( + Mapping[ColumnNameOrSelector | PolarsDataType, PolarsDataType] + | PolarsDataType + ), *, strict: bool = True, ) -> Self: @@ -2641,12 +2646,19 @@ def cast( │ 3.0 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ - Cast all frame columns to the specified dtype: + Cast all frame columns matching one dtype (or dtype group) to another dtype: - >>> lf.cast(pl.String).collect().to_dict(as_series=False) - {'foo': ['1', '2', '3'], - 'bar': ['6.0', '7.0', '8.0'], - 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} + >>> lf.cast({pl.Date: pl.Datetime}).collect() + shape: (3, 3) + ┌─────┬─────┬─────────────────────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ datetime[μs] │ + ╞═════╪═════╪═════════════════════╡ + │ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │ + │ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │ + │ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │ + └─────┴─────┴─────────────────────┘ Use selectors to define the columns being cast: @@ -2662,17 +2674,29 @@ def cast( │ 2 ┆ 7 ┆ 2021-03-04 │ │ 3 ┆ 8 ┆ 2022-05-06 │ └─────┴─────┴────────────┘ + + Cast all frame columns to the specified dtype: + + >>> lf.cast(pl.String).collect().to_dict(as_series=False) + {'foo': ['1', '2', '3'], + 'bar': ['6.0', '7.0', '8.0'], + 'ham': ['2020-01-02', '2021-03-04', '2022-05-06']} """ if not isinstance(dtypes, Mapping): return self._from_pyldf(self._ldf.cast_all(dtypes, strict)) cast_map = {} for c, dtype in dtypes.items(): + if (is_polars_dtype(c) or isinstance(c, DataTypeGroup)) or ( + isinstance(c, Collection) and all(is_polars_dtype(x) for x in c) + ): + c = by_dtype(c) # type: ignore[arg-type] + dtype = py_type_to_dtype(dtype) cast_map.update( {c: dtype} if isinstance(c, str) - else {x: dtype for x in expand_selector(self, c)} + else {x: dtype for x 
in expand_selector(self, c)} # type: ignore[arg-type] ) return self._from_pyldf(self._ldf.cast(cast_map, strict)) diff --git a/py-polars/polars/type_aliases.py b/py-polars/polars/type_aliases.py index 26785724f4f6..eee5e670d8a6 100644 --- a/py-polars/polars/type_aliases.py +++ b/py-polars/polars/type_aliases.py @@ -208,9 +208,13 @@ # typevars for core polars types PolarsType = TypeVar("PolarsType", "DataFrame", "LazyFrame", "Series", "Expr") FrameType = TypeVar("FrameType", "DataFrame", "LazyFrame") - BufferInfo: TypeAlias = Tuple[int, int, int] +# type alias for supported spreadsheet engines +ExcelSpreadsheetEngine: TypeAlias = Literal[ + "xlsx2csv", "openpyxl", "calamine", "pyxlsb" +] + class SeriesBuffers(TypedDict): """Underlying buffers of a Series.""" diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt index d97002186a6b..c7a0592bf09b 100644 --- a/py-polars/requirements-dev.txt +++ b/py-polars/requirements-dev.txt @@ -19,7 +19,7 @@ patchelf; platform_system == 'Linux' # Extra dependency for maturin, only for L numpy pandas pyarrow -pydantic >= 2.0.0 +pydantic>=2.0.0 # Datetime / time zones backports.zoneinfo; python_version < '3.9' tzdata; platform_system == 'Windows' @@ -37,14 +37,15 @@ s3fs[boto3] # Spreadsheet ezodf lxml +fastexcel>=0.7.0; platform_system != 'Windows' openpyxl pyxlsb xlsx2csv XlsxWriter deltalake>=0.14.0 # Dataframe interchange protocol -dataframe-api-compat >= 0.1.6 -pyiceberg >= 0.5.0 +dataframe-api-compat>=0.1.6 +pyiceberg>=0.5.0 # Csv zstandard # Plotting diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 6886916fa72a..91ee0d634145 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -1,10 +1,11 @@ from __future__ import annotations +import sys import warnings from collections import OrderedDict from datetime import date, datetime from io import BytesIO -from typing import TYPE_CHECKING, Any, 
Callable, Literal +from typing import TYPE_CHECKING, Any, Callable import pytest @@ -16,7 +17,7 @@ if TYPE_CHECKING: from pathlib import Path - from polars.type_aliases import SchemaDict, SelectorType + from polars.type_aliases import ExcelSpreadsheetEngine, SchemaDict, SelectorType pytestmark = pytest.mark.slow() @@ -69,9 +70,26 @@ def path_ods_mixed(io_files_path: Path) -> Path: @pytest.mark.parametrize( ("read_spreadsheet", "source", "engine_params"), [ + # xlsx file (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -100,9 +118,26 @@ def test_read_spreadsheet( @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ + # xlsx file (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -138,9 +173,26 @@ def test_read_excel_multi_sheets( @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ + # xlsx file (pl.read_excel, 
"path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -179,11 +231,18 @@ def test_read_excel_all_sheets( ("engine", "schema_overrides"), [ ("xlsx2csv", {"datetime": pl.Datetime}), + pytest.param( + *("calamine", None), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), ("openpyxl", None), ], ) def test_read_excel_basic_datatypes( - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], + engine: ExcelSpreadsheetEngine, schema_overrides: SchemaDict | None, ) -> None: df = pl.DataFrame( @@ -213,9 +272,26 @@ def test_read_excel_basic_datatypes( @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [ + # xlsx file (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + pytest.param( + *(pl.read_excel, "path_xlsx", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + # xlsb file (binary) + pytest.param( + *(pl.read_excel, "path_xlsb", {"engine": "calamine"}), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), (pl.read_excel, "path_xlsb", {"engine": "pyxlsb"}), + # open document (pl.read_ods, "path_ods", {}), ], ) @@ -297,9 +373,22 @@ def test_read_mixed_dtype_columns( ) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def 
test_write_excel_bytes(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> None: - df = pl.DataFrame({"A": [1, 2, 3, 4, 5]}) +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + "calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) +def test_write_excel_bytes(engine: ExcelSpreadsheetEngine) -> None: + df = pl.DataFrame({"A": [1.5, -2, 0, 3.0, -4.5, 5.0]}) excel_bytes = BytesIO() df.write_excel(excel_bytes) @@ -403,7 +492,20 @@ def test_unsupported_binary_workbook(path_xlsx: Path, path_xlsb: Path) -> None: pl.read_excel(path_xlsb, engine="openpyxl") -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + "calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) def test_read_excel_all_sheets_with_sheet_name(path_xlsx: Path, engine: str) -> None: with pytest.raises( ValueError, @@ -548,9 +650,22 @@ def test_excel_round_trip(write_params: dict[str, Any]) -> None: assert_frame_equal(df, xldf) -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + "calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) def test_excel_compound_types( - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], + engine: ExcelSpreadsheetEngine, ) -> None: df = pl.DataFrame( {"x": [[1, 2], [3, 4], [5, 6]], "y": ["a", "b", "c"], "z": [9, 8, 7]} @@ -567,8 +682,21 @@ def test_excel_compound_types( ] -@pytest.mark.parametrize("engine", ["xlsx2csv", "openpyxl"]) -def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> None: +@pytest.mark.parametrize( + "engine", + [ + "xlsx2csv", + "openpyxl", + pytest.param( + 
"calamine", + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), + ], +) +def test_excel_sparklines(engine: ExcelSpreadsheetEngine) -> None: from xlsxwriter import Workbook # note that we don't (quite) expect sparkline export to round-trip as we @@ -581,7 +709,7 @@ def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> "q3": [-50, 0, 40, 80, 80], "q4": [75, 55, 25, -10, -55], } - ) + ).cast(dtypes={pl.Int64: pl.Float64}) # also: confirm that we can use a Workbook directly with "write_excel" xls = BytesIO() @@ -637,10 +765,12 @@ def test_excel_sparklines(engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"]) -> # └─────┴──────┴─────┴─────┴─────┴─────┴───────┴─────┴─────┘ for sparkline_col in ("+/-", "trend"): - assert set(xldf[sparkline_col]) == {None} + assert set(xldf[sparkline_col]) in ({None}, {""}) assert xldf.columns == ["id", "+/-", "q1", "q2", "q3", "q4", "trend", "h1", "h2"] - assert_frame_equal(df, xldf.drop("+/-", "trend", "h1", "h2")) + assert_frame_equal( + df, xldf.drop("+/-", "trend", "h1", "h2").cast(dtypes={pl.Int64: pl.Float64}) + ) def test_excel_write_multiple_tables() -> None: @@ -733,13 +863,20 @@ def test_excel_empty_sheet( [ ("xlsx2csv", ["a"]), ("openpyxl", ["a", "b"]), + pytest.param( + *("calamine", ["a", "b"]), + marks=pytest.mark.skipif( + sys.platform == "win32", + reason="fastexcel not yet available on Windows", + ), + ), ("xlsx2csv", cs.numeric()), ("openpyxl", cs.last()), ], ) def test_excel_hidden_columns( hidden_columns: list[str] | SelectorType, - engine: Literal["xlsx2csv", "openpyxl", "pyxlsb"], + engine: ExcelSpreadsheetEngine, ) -> None: df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]}) @@ -751,16 +888,10 @@ def test_excel_hidden_columns( def test_invalid_engine_options() -> None: + # read_csv_options only applicable with 'xlsx2csv' engine with pytest.raises(ValueError, match="cannot specify `read_csv_options`"): pl.read_excel( "", 
engine="openpyxl", read_csv_options={"sep": "\t"}, ) - - with pytest.raises(ValueError, match="cannot specify `xlsx2csv_options`"): - pl.read_excel( - "", - engine="openpyxl", - xlsx2csv_options={"skip_empty_lines": True}, - ) From ef7da424e746e3b488759744e1f3498577620057 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Fri, 26 Jan 2024 10:05:48 +0400 Subject: [PATCH 2/2] fix issue with py3.8 --- .../source/reference/lazyframe/descriptive.rst | 1 - py-polars/polars/io/spreadsheet/functions.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/py-polars/docs/source/reference/lazyframe/descriptive.rst b/py-polars/docs/source/reference/lazyframe/descriptive.rst index 0f05afae8960..6de20f675f4b 100644 --- a/py-polars/docs/source/reference/lazyframe/descriptive.rst +++ b/py-polars/docs/source/reference/lazyframe/descriptive.rst @@ -6,6 +6,5 @@ Descriptive .. autosummary:: :toctree: api/ - LazyFrame.describe LazyFrame.explain LazyFrame.show_graph diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index a7fe63d90d3f..c731a207a94e 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -527,18 +527,16 @@ def _initialise_spreadsheet_parser( # note: can't read directly from bytes (yet) so if read_bytesio := isinstance(source, BytesIO): temp_data = NamedTemporaryFile(delete=True) - with nullcontext() if not read_bytesio else temp_data as xldata: # type: ignore[attr-defined] + with nullcontext() if not read_bytesio else temp_data as tmp: # type: ignore[attr-defined] if read_bytesio: - xldata.write(source.getvalue()) # type: ignore[union-attr] - xldata = xldata.file.name - else: - xldata = source + tmp.write(source.getvalue()) # type: ignore[union-attr] + source = temp_data.name - if not Path(xldata).exists(): - raise FileNotFoundError(xldata) + if not Path(source).exists(): # type: ignore[arg-type] + raise FileNotFoundError(source) 
fxl = import_optional("fastexcel", min_version="0.7.0") - parser = fxl.read_excel(xldata, **engine_options) + parser = fxl.read_excel(source, **engine_options) sheets = [ {"index": i + 1, "name": nm} for i, nm in enumerate(parser.sheet_names) ]