From 0e198c3e0bc816e262f1c00ab3abfe36d35ed0fb Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Tue, 24 Dec 2024 07:14:04 +0000 Subject: [PATCH] feat(python): Add "drop_empty_cols" parameter for `read_excel` and `read_ods` (#20430) --- py-polars/polars/_utils/various.py | 13 +- py-polars/polars/io/spreadsheet/functions.py | 122 +++++++++++++------ py-polars/tests/unit/io/files/example.xlsx | Bin 17179 -> 17191 bytes py-polars/tests/unit/io/test_spreadsheet.py | 43 +++++++ 4 files changed, 140 insertions(+), 38 deletions(-) diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py index 4f72d8947765..126929d6d627 100644 --- a/py-polars/polars/_utils/various.py +++ b/py-polars/polars/_utils/various.py @@ -5,6 +5,7 @@ import re import sys import warnings +from collections import Counter from collections.abc import ( Collection, Generator, @@ -42,7 +43,7 @@ from polars.dependencies import numpy as np if TYPE_CHECKING: - from collections.abc import Iterator, Reversible + from collections.abc import Iterator, MutableMapping, Reversible from polars import DataFrame, Expr from polars._typing import PolarsDataType, SizeUnit @@ -247,6 +248,16 @@ def ordered_unique(values: Sequence[Any]) -> list[Any]: return [v for v in values if not (v in seen or add_(v))] +def deduplicate_names(names: Iterable[str]) -> list[str]: + """Ensure name uniqueness by appending a counter to subsequent duplicates.""" + seen: MutableMapping[str, int] = Counter() + deduped = [] + for nm in names: + deduped.append(f"{nm}{seen[nm] - 1}" if nm in seen else nm) + seen[nm] += 1 + return deduped + + @overload def scale_bytes(sz: int, unit: SizeUnit) -> int | float: ... diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index 700d3e5cea13..1d32589302bc 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -1,12 +1,12 @@ from __future__ import annotations +import os import re import warnings from collections.abc import Sequence from datetime import time from glob import glob from io import BufferedReader, BytesIO, StringIO, TextIOWrapper -from os import PathLike from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, NoReturn, overload @@ -17,7 +17,7 @@ deprecate_renamed_parameter, issue_deprecation_warning, ) -from polars._utils.various import normalize_filepath, parse_version +from polars._utils.various import deduplicate_names, normalize_filepath, parse_version from polars.datatypes import ( N_INFER_DEFAULT, Boolean, @@ -57,14 +57,20 @@ def _sources( source = [source] # type: ignore[assignment] for src in source: # type: ignore[union-attr] - if isinstance(src, (str, PathLike)) and not Path(src).exists(): - sources.extend(glob(str(src), recursive=True)) # noqa: PTH207 + if isinstance(src, (str, os.PathLike)) and not Path(src).exists(): + src = os.path.expanduser(str(src)) # noqa: PTH111 + sources.extend(glob(src, recursive=True)) # noqa: PTH207 else: sources.append(src) return sources, read_multiple_workbooks +def _standardize_duplicates(s: str) -> str: + """Standardize columns with '_duplicated_n' names.""" + return re.sub(r"_duplicated_(\d+)", repl=r"\1", string=s) + + @overload def read_excel( source: FileSource, @@ -79,6 +85,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> pl.DataFrame: ... @@ -97,6 +104,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> pl.DataFrame: ... @@ -115,6 +123,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> NoReturn: ... @@ -135,6 +144,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> dict[str, pl.DataFrame]: ... @@ -153,6 +163,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> pl.DataFrame: ... @@ -171,6 +182,7 @@ def read_excel( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> dict[str, pl.DataFrame]: ... @@ -190,6 +202,7 @@ def read_excel( schema_overrides: SchemaDict | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, drop_empty_rows: bool = True, + drop_empty_cols: bool = True, raise_if_empty: bool = True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: """ @@ -262,6 +275,10 @@ def read_excel( this parameter. drop_empty_rows Indicate whether to omit empty rows when reading data into the DataFrame. + drop_empty_cols + Indicate whether to omit empty columns (with no headers) when reading data into + the DataFrame (note that empty column identification may vary depending on the + underlying engine being used). raise_if_empty When there is no data in the sheet,`NoDataError` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. @@ -335,6 +352,7 @@ def read_excel( has_header=has_header, columns=columns, drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, read_multiple_workbooks=read_multiple_workbooks, ) for src in sources @@ -355,6 +373,7 @@ def read_ods( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> pl.DataFrame: ... @@ -370,6 +389,7 @@ def read_ods( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> pl.DataFrame: ... @@ -385,6 +405,7 @@ def read_ods( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> NoReturn: ... @@ -400,6 +421,7 @@ def read_ods( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> dict[str, pl.DataFrame]: ... @@ -415,6 +437,7 @@ def read_ods( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> pl.DataFrame: ... @@ -430,6 +453,7 @@ def read_ods( schema_overrides: SchemaDict | None = ..., infer_schema_length: int | None = ..., drop_empty_rows: bool = ..., + drop_empty_cols: bool = ..., raise_if_empty: bool = ..., ) -> dict[str, pl.DataFrame]: ... @@ -444,6 +468,7 @@ def read_ods( schema_overrides: SchemaDict | None = None, infer_schema_length: int | None = N_INFER_DEFAULT, drop_empty_rows: bool = True, + drop_empty_cols: bool = True, raise_if_empty: bool = True, ) -> pl.DataFrame | dict[str, pl.DataFrame]: """ @@ -479,6 +504,10 @@ def read_ods( large workbooks. drop_empty_rows Indicate whether to omit empty rows when reading data into the DataFrame. + drop_empty_cols + Indicate whether to omit empty columns (with no headers) when reading data into + the DataFrame (note that empty column identification may vary depending on the + underlying engine being used). raise_if_empty When there is no data in the sheet,`NoDataError` is raised. If this parameter is set to False, an empty DataFrame (with no columns) is returned instead. @@ -523,6 +552,7 @@ def read_ods( infer_schema_length=infer_schema_length, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, has_header=has_header, columns=columns, read_multiple_workbooks=read_multiple_workbooks, @@ -548,6 +578,7 @@ def _read_spreadsheet( has_header: bool = True, raise_if_empty: bool = True, drop_empty_rows: bool = True, + drop_empty_cols: bool = True, read_multiple_workbooks: bool = False, ) -> pl.DataFrame | dict[str, pl.DataFrame]: if isinstance(source, (str, Path)): @@ -587,6 +618,7 @@ def _read_spreadsheet( raise_if_empty=raise_if_empty, columns=columns, drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, ) for name in sheet_names } @@ -774,8 +806,9 @@ def _csv_buffer_to_frame( separator: str, read_options: dict[str, Any], schema_overrides: SchemaDict | None, - raise_if_empty: bool, drop_empty_rows: bool, + drop_empty_cols: bool, + raise_if_empty: bool, ) -> pl.DataFrame: """Translate StringIO buffer containing delimited data as a DataFrame.""" # handle (completely) empty sheet data @@ -810,35 +843,39 @@ def _csv_buffer_to_frame( **read_options, ) return _drop_null_data( - df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows + df, + raise_if_empty=raise_if_empty, + drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, ) def _drop_null_data( - df: pl.DataFrame, *, raise_if_empty: bool, drop_empty_rows: bool = True + df: pl.DataFrame, + *, + raise_if_empty: bool, + drop_empty_rows: bool = True, + drop_empty_cols: bool = True, ) -> pl.DataFrame: - """ - If DataFrame contains columns/rows that contain only nulls, drop them. - - If `drop_empty_rows` is set to `False`, empty rows are not dropped. - """ + """If DataFrame contains columns/rows that contain only nulls, drop them.""" null_cols: list[str] = [] - for col_name in df.columns: - # note that if multiple unnamed columns are found then all but the first one - # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) - if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name): - col = df[col_name] - if ( - col.dtype == Null - or col.null_count() == len(df) - or ( - col.dtype in NUMERIC_DTYPES - and col.replace(0, None).null_count() == len(df) - ) - ): - null_cols.append(col_name) - if null_cols: - df = df.drop(*null_cols) + if drop_empty_cols: + for col_name in df.columns: + # note that if multiple unnamed columns are found then all but the first one + # will be named as "_duplicated_{n}" (or "__UNNAMED__{n}" from calamine) + if col_name == "" or re.match(r"(_duplicated_|__UNNAMED__)\d+$", col_name): + col = df[col_name] + if ( + col.dtype == Null + or col.null_count() == len(df) + or ( + col.dtype in NUMERIC_DTYPES + and col.replace(0, None).null_count() == len(df) + ) + ): + null_cols.append(col_name) + if null_cols: + df = df.drop(*null_cols) if len(df) == 0 and len(df.columns) == 0: return _empty_frame(raise_if_empty) @@ -875,8 +912,9 @@ def _read_spreadsheet_openpyxl( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, - raise_if_empty: bool, drop_empty_rows: bool, + drop_empty_cols: bool, + raise_if_empty: bool, ) -> pl.DataFrame: """Use the 'openpyxl' library to read data from the given worksheet.""" infer_schema_length = read_options.pop("infer_schema_length", None) @@ -916,9 +954,9 @@ def _read_spreadsheet_openpyxl( dtype = String if no_inference else None series_data = [] for name, column_data in zip(header, zip(*rows_iter)): - if name: + if name or not drop_empty_cols: values = [cell.value for cell in column_data] - if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment] + if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment,arg-type] # note: if we initialise the series with mixed-type data (eg: str/int) # then the non-strings will become null, so we handle the cast here values = [str(v) if (v is not None) else v for v in values] @@ -926,15 +964,18 @@ def _read_spreadsheet_openpyxl( s = pl.Series(name, values, dtype=dtype, strict=False) series_data.append(s) + names = deduplicate_names(s.name for s in series_data) df = pl.DataFrame( - {s.name: s for s in series_data}, + dict(zip(names, series_data)), schema_overrides=schema_overrides, infer_schema_length=infer_schema_length, strict=False, ) - df = _drop_null_data( - df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows + df, + raise_if_empty=raise_if_empty, + drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, ) df = _reorder_columns(df, columns) return df @@ -947,8 +988,9 @@ def _read_spreadsheet_calamine( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, - raise_if_empty: bool, drop_empty_rows: bool, + drop_empty_cols: bool, + raise_if_empty: bool, ) -> pl.DataFrame: # if we have 'schema_overrides' and a more recent version of `fastexcel` # we can pass translated dtypes to the engine to refine the initial parse @@ -1002,7 +1044,10 @@ def _read_spreadsheet_calamine( df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)] df = _drop_null_data( - df, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows + df, + raise_if_empty=raise_if_empty, + drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, ) # note: even if we applied parser dtypes we still re-apply schema_overrides @@ -1050,8 +1095,9 @@ def _read_spreadsheet_xlsx2csv( read_options: dict[str, Any], schema_overrides: SchemaDict | None, columns: Sequence[int] | Sequence[str] | None, - raise_if_empty: bool, drop_empty_rows: bool, + drop_empty_cols: bool, + raise_if_empty: bool, ) -> pl.DataFrame: """Use the 'xlsx2csv' library to read data from the given worksheet.""" csv_buffer = StringIO() @@ -1080,8 +1126,10 @@ def _read_spreadsheet_xlsx2csv( schema_overrides=schema_overrides, raise_if_empty=raise_if_empty, drop_empty_rows=drop_empty_rows, + drop_empty_cols=drop_empty_cols, ) if cast_to_boolean: df = df.with_columns(*cast_to_boolean) + df = df.rename(_standardize_duplicates) return _reorder_columns(df, columns) diff --git a/py-polars/tests/unit/io/files/example.xlsx b/py-polars/tests/unit/io/files/example.xlsx index 13e80e618fa27ddf56c65a61a6920ec6209b59ee..0dc2081a9249e698428226a418ef22737fbc9b92 100644 GIT binary patch delta 6595 zcmZWubwCtRw_my&q>++_rC|Z-?(S|ySb?R5m2ME020>UlrI#)Vr9-+)kX9O$4}I@_ z_3mGD=lt%O^P4&M&WY>@q>>1vkAxUdVzt}1{U`tc1PcHl0002KuYf*a4;Nc7*oDW} z^;Myv1$a}GG|=kBgWUGjjC<$XQXAA@XU;}=4FA!wo&&L7yT)eR7U_iZ?>lo=X6_=> zT3XbEz(5ZVsWC6aeciCYR`}$~ zM9(icGyJGOXz*(-e};|9>wc@$=8m?fG*}O6+;I3>yYm?Y0i}+6y%`RR|5uyfQJl1s;%m1u3UK4IBrLu;F+ z$r;?X{cMS-wmC^Xm)cKV^Sr6C|8(mrj6C(Sfmr&7wM4fYth%vsz@K|lBJ8?Bv<0O=n
  • Fx(2vSI!npHL`3X~3{E&$m29Nw(Rx=UE+z}yXso(AQ0fK{5zKe^M ziKAaLNC)tf+wb4kqDDcuNiBo;o^)pkb@B1U*K%Ve=)e#;S zFGs8C9tyX7uH}^c8F^D-1tCa`%B+p1fXT?Is)Fy0`g59=n6>N0oX&MmPobD%y9No& zoFBK=Zpn$blzU1;6)AhhYoQx^2R*zeJ(o(>r!5$L?HJ-ugI7eXGp?+!m_A;im8^;NtUCGFEJ@!iz+)?zso9i6!)?y+r{qaXK2#{i#FyYxwi5}Yoy0< zd5T?FOlS8eh|t$NANQO3-0qMc3f|qqoE8a1d<4p0?)j{I9~GOhd-L&fz{^Ec|F~2m zv|l~a198{&*5}k?d_Hsi`nS6E`GupOC(EUO=bJZC|;Mm4d1M-a9{VK+LYRu zbvLnO0{T=`_&bv;1D}dKs7&N9qpV%R^TFAdVWKPyp5r-U)lLIHMd^}t1Y;GokIx+X zr~sZbU&?iflUuA0+5+>>TMCX~ih5OY{;vwd+oaUDtYF%BWc7p?*2pk58A6yB9!Y%z zXBIk|1jzOE*ZP;tQmB}`N4I??8$bZ8o~;XOvTryc5FG$8g1O-_LSvSA2_sH$0x|Lf z@@T=YHn|q;<*tfT+z96CxCR;19GD$o4d*N3y7<||*f!*^hSM^^zcX)yTGpcRJ9vv) z4%KLG7!)f@>UMvXe_HF}?%-2geAmhH{?H`)N`32Ye~k&lV2 zVa5`5xzp``J+1sK3L31{h<*NnqRismUWh1ey)@zK7D~Q$Gq3&h>n~4|-B1rkkqV5J zaQEi8pPomv>~V|A%=ioV>c=1vlARzH#Mu->o@iBAa_D+#U-777DTt#P;re)}Ze7;! zV`-+&)bUBa3k}^p`0<-!wA*p=j(KvEW^1 z+jl_xkQ?cj!=-2k%o`^LpJhn;`iW8p?DLQI_ODn_Nsawp-!I(dwJ{^ptzW_# z;TMxQeR!JVdeJY=U+C{h3y+XV#)Mnx#yF`S9v1jQ&VDV3)rlXQk{jt(b01?N%VpCe zzI=r6{4UT|5QJtAAG6kI;9!b!3%Lvx3s~)3l5ADXcDtc}HEkok2 zn!VVYODu%9Qy#^JSt|C1L1A`h24)ugxI&4Oa}) zSw>P`A$uf8=jxzWwmi)n@7yy%vUkeV8btbMI||v#6D;GtS=h>j@w-Abn-$T*Ngp&G z5m2&snAC8;$U9z15)DRcIZ7>N`Xu43^yyE{1eYvC4W9*ai;;<6Ob4}CnUfXc40lAM zy8ehODSIow$R@Tv2C;cTg0;9!silV$-Qb?pvEMtzLj1UBb^EUxl%j>a<;&4o<`7}O zfoXkdMpr>r5Ulwn40BQY=hUA$Y@dkfx#_KCV+_9Q?Z<1&XPeQe%$xX?#e8=keF;eR zPs$$B2>vj0?i<+EXr})o{!uTq;#77=99$B?pQ{XWB<6)4*Utn-f57v4s)5Elr8pg! z)(RwQ$UlbvvD0}hKW!}kQTkr)1qVHmu%;*eQD4CzF%tC`U&cqXCmYJ)p3aQUa^JN7 z=l#%p-=oRGT$})<{Gkrtw^g#p`F~Sw^1pmYEtV1U^}KI#UuShq-`#W z0{^asj^`!RDb$!#=eHt92E8RZ$tag^=02SRx&Zrzj28_W3}}I;(bV@1z7K9e+Wt;d zW8BEHPT0%FOW(pMW?!qs*Z4zMdsM3$3ZDdBq-9uon&PR{J<}m9l-HfyNP2gmL{VOi zw#C_hne5{o=DezlbF0DR@0qW1iK(gEv2;bJSBLRQv7t8?olP-$XM9QXMeB?F6T1xU zhyC&8xj`^=Y-*>@TtfgD*E^A2sPf&xgwo$ZTNCbe?{1(uY~{;WgPQa4 z0Q-d{`L$dI&`$ly&mO5YRH+Q9fuRL=_{MWJiy^kqY!%dV&ZW8%Yc{ zQS<}xekNayDPxYL(B_5an#%Un)$3o zlhcg?K{|7t1RK-GQNLznUv@=-O>PkqIN9$;z924~{It>v6IdmMX4la0eb>G7)$;cmTk| z!($Np@Q^aOf*(Rp+I?o^zRXG^iAz)G(22>^R9zlj?|-MQo9?G$4!R%xHLep`e`R4O z?g9zuaB>X?IBYlq86;D29SGKEQ%jt-LVu=L%QBY?Ig3p6S;LvwVu7;l$9TN)0r1*@ zZ2#d2Ezf0sUOj4>;a!ParB!Rr!T}ybOb(AR=6v}&-?Wp&`WY;gxGbFV*(9tXK?k0u zpPK_gtE<)}5}Lj;o-(A$&w|zFYi2V}Q^W*iL)%y>B=#2nmKo?t)#bfOlw) z0RSK~z=FuBp=ycLBE+2-Ls0r#o`bJdgWYxcC9IcmuD^76ru2IX6ydssr?m>+Y8nZpzONF=HF*WPNbvGa}P)aQoR%0T{<@U2PrMa`*7!_H< zp%M=QpAh4+e#;gvR^GTBp-y`CHzHe%J=k>p?Gj|yxaTaD<`w!>JE+rw((IbjM;!}s znwaGV)uaExee1)LxEsTz5`U=hm^ed z&m?5@(1@ZCpqNbmJ?aki+^{c@-1cQ;x9Ae<%qP!HbEk6#m<1 zYSr$087SGTvomzG1#uhxby-vMaF;kf2@Sgc241@t%D$DbT)Z|~3%vRM%_>Oic_rd? z|8Uo@)9)ouEr0*2u z?#Dkp#uj*5tz=FvN!%?exn8wnH#iG7hH{ZzjJ#phq*-V9Ryi0xhvai^o)$Sc3^~p- z=W<;}iPH-7ymJ4J>WeT39O>qL2~wvKI1$L#5Rq(M?6KCy$ff zrWDW>Tc*eD*TR&;CnPoKc{uZe311G1L?&t{3api5Y|kk4Hg%8K-DmI)#y6kpLN`8@ zc*K!7ir)h&=AE`DRn=cDYjZ@M=kO*(=rI;)?dHC{FqP>-!QCRbV=#Nhqs!*MIqmlr z)SQG8G?nOHcehbff>CjM{YhamI!Ou1y5BwXGvI&pM-bjdYAz|>Vc z7zwp+x@h6W`JCrmbly>zK5>b^V&_Q{5OZm$XsX&8z7ogQwext?_1MaPW zJAYuKx=yFGgC69K`U7i=HuTO(ky2cjQxU&#AE4sfmYHBZ(NLzo<}poW{^^ttECE8a zbSt7`RsWVXctg5{)(~A@TnkEX`Q9Rr!;>FiAE9xOnltU{uRXl|POBc@xv{+MeWv&` z!+zgqi*a&;st$XK^?To4XR;;tmm#)u=*|lwe7#vCLIuZs1ub-8QPBHbB1eaGTZJV% z1+@yR$d|PJ@toZa8v5%O)IDketWmv{^|j_x z#tL~^LHvQx$aV(<_b4$#hZ)z^8iyXQBcVB&0)MUOxtvyVQ9`cin9nrR4gE`kl_2q_ zUK(wmLb`v*+d(7Ogq_0XNWcLhR>b=I~BZ>VHu$J*Z4@hzL58@Z6T?J1FB?$s;0 z&Qv!C#Ot=L(x&#x)EKdY1M3pXl@g%)hQ2|)= zl}D)0w2E4|t;%LTFe_bfJv>g;qw8%&e7wkS444icDcU|7p0aBFHJvdMNv!6uDpuQ$ z7$vuHW62x5#Z+(VdRzK(Bs9PCysu0!0!ARz(35cQ+jTkb+Kp;#<<%69OC~{n@}!Pi zB?ar6w6>u(DD3%cs#^WEN-&RZ|5D9r!x!v#823l70s$msKmv2XpUB$o-5C)Vt?;U`2+0CL|Xw^GDX2Tk_f%@_z(WdRL!@&W^8u)$%C;5k!Z*+b`ndsGa&rk-#J$Uh|2=S0PvCP2<_tz}SAHKW- zPEw2bZfVrnwN-b*(zZkM>5HClOylpbT`2V^Pdu;3YO4?Hc!2h7OZ}<1U~La9s2)5< zqmwXS4q9l>%RXLQ^z}RI6YDnzHeQRV{4rZ}=mBph--o7`MrXP@q)qXFr4sSb-4bqt zu!0K{t$Jc{m^~<5T1<>|rC-&0*4FGS?eA=;VB1oQAMnX?JJI_IUioB2DdfiM4+r~6 zuqhk?dZwt9qCg-!>@?3ze+4__rAa_{Plc<&k%8Su9@WWBQqakY8lS=ysORU?!@|k! zI7flESEXdsRYB|IR%aZ-Y6$MLn9n|H?-SJ!oM&OL0;92NX4=O`F-9C_7E{1Z9b4H$ ze1*kYqVf8gw1ySQJk2iRJlSIS4~0L}MnwLwh9RU0VIk~CuvA&X`g5@tC@>Q|T&$Td z=VS3m0Kj=Yk0>`1G@1m1gS4oOt(296p^+>qySBNi$DL~_@YnN%>EbHs<#zF{$>WhU zcpx{Nn}U-{NZus50F%@OpLH}5>p9}(OY1nC_ph5EO!H|7URrP_zK0NZ=uc) zak887K9_g-1_`cpa49X*kdX6xy^xV5W|c)Yn-VxpW}RFNMKKSB=ZM9a^%!5p?6G|| z0hWm?tQ>8=8kf#;qL*@jICEYTmlNmZ{dN$^v+6dVJH{d4MVS#QM!D@pq2Usc70=Vg zuoQZOpKK}uy-4ClJ3Y(yPx*Ki92{L7QpG;xjN6TtaxKyvKiO^AaAss;eO_e6AUdmo zF7bno^t~PHHaFBiS4&$#S9bH9VQd=2(EsZLeQ2WhNyS4Is|8#V|E_D(A}1+h+cUW_ zhO$Ll0t+?!nL$OoR57-Gdj?wDo#Fi6t!Bm3KiLsXACCw|A&XzHBQ%3tugSlU46~4o zqWSMuz$5g32QmgsMT!BoBT0pPjtxVS;(9D3QX%W(!SXn%VBxZaunQhS-2Vdu00_cI z0U4}UhK%NKjS=te`QPn0Lgzmj*|&SEd7l2@8OOH0C+tb^O!CE+dy#qZGLz{{9ZYD!h&T}?gR@GB)A2455a@ma_`&S z`}S@3AKlee=c_vPJ9WCMPH`AQdKf}A7^M#Q(ijK}aJfq#Km-6h&;S4|007`^&*tUi zYG>-?WXJ05U|*p1&MBV{FVM*53Gqyjn^p|SNV*-?s3sUWsnB|-!M}#*guvOy6(MuJ zx=RRjNV82ol!`dr-rUsha2wc8G01wwrl6&dM?nc8fa+jJg+P=kZW|+qv7wC8Q6ZLj z#xRP7I`n{++w-`giuf@*iYyR0eHpk=E*YUuGov;`3zBD!lwlt-TpC>H->u2;O0g`7 zBiU-Mkr`_@2~uK#y2VdoTDQ)T9??9io2!R22rc_|&$w+dQ?q@_@6Mk*=3&!E$rfZf z!dZC5oYvn84nsZkD53o{}HiCo^|u7GVi^0-~mqB*5inshz#yrHqP(;a_!;tT$P*6 zvA9Xz5m0{xh3b|1bmPlQaD^1k)#ihAQ{ZV<&T`Dz7}y=Zw%g)mx_RnHr_n=}U=xVr z=l+W`Z^3IwuJHKp`;dnaqN$F`O0Sw%oRN9OsfG`ZXkbcqE4 z<$VQkmxBh;6x6x-T|}vL7@~GRWU9HYW{wv-)b8_{0aYUOAed1WrM6JqSCsmL2L zXi-fpC~U1Bv;O=l2qTaG9UdGuqS4x0bs6YYAnk5rS>AcysJe44^l>;>JMCimG)FjI-o znQb4|!3oF3#qxOO!!+grruW!Jt}gUQOT@B1yCFMstysUG9hQ3J}&rLKhg{!Q(}?B_EWq zv!>bRyRnZOcVv-&{ae&}|K-ATJaS3oTkJCdiiOQ9kElZ4h+#fhul-*j zqxe~VzB1mWs=h zE%TWll!$%UR%mr?zUzGNb>cZTm%6t1sId0u;PBU_!tW9Pi)(Nun$>Hzluek{d1vdp zlW>IY_#U1h2PG&1F&)F;wO;LI=+Zrdc(^;3@M3zb&|-bRxiJpZYh>`7%ZT-BX{rrn zD|NET0FU`<?op3QIK6*7(PMHTX z0H8|(9|ymJE51U7#V(SA!;YCQNWoW_BHxMMxoFLs=N~C*UU)0A$Errxythx(x;r8r zu=!G=R?jrw_kL$HJ2m_CL8);s)s9L-d&3%R+ACPZI8h;Yzj;SVXkyNM$hc4zo8mnN z>d5xLXJ27#=K<{!Amnn=IF=v2l)$iSF0jn;UFlM)s4pOZ75PdlRPk5%$J@)2guP18 zaK9F#|HFczIU9W1VoE$H==a`wMeXISB9*UOo~-9fCAIdIKM3lzD7|X+-ABTBRD+Sl zn#EiOjqz8katxA!AY}|ESAo7NqfBa~sUIZ^nC}%Z)2gP%WR>Uz;g#o|Xt;$^=T_@P z-s0b#;CP@hSot+jeg$=ClwF(Ez+OsoL$r=P4u0c|w^e6@GfXxWPL5TaPi|14%h%Z| zzuCY4GAo)1fz(r%V&niv6eCJH%?$d=5Agxd-_GzMe^qYKDf+9S^eNbACp!~yakI?r zP_4A#B@D=&it6 z=16#r^Xo0jpS~%L z=MM`;SCEwO{ccw$O7V1|O&dn?1fEE(W=;d0e-p~EboyFdAMwKBESi@e?OFTIN1IEl zmVi3$h#bYNQ!f1Yx5Oj!qg%p@)PDoRYf0(!7AhZl8MP*I70LR%OvHO}(cW&gbH~_} za^UWZs>7_&J|}po)ti`|7rg_YN&#_voD{Ege|1Q$mF7m%;XDKWiyRO88dn6wvB}>T zkzYucsMZv8n(X-H8sWj zK%y0SOylnP-r71<`l$$8&@ET?5mgD=zI-fHxrK>cXdNnip&~~;mc^SnS#Hm9#X-WU z6iXC`I(AJtHMYti+tmOqYowyS`>=a=%?aD#45V4!g`wic27uVQS=K=K?W1aZEw!*b zV&*^^=9ccInL4gLpy`A?VKOvE zb4Z^8!qdpwWw0%ztJaOA3CBF^*}Qbstvu%$O57?b^rpM&{W+Ivm`qVV`zsV#E|=DmEg}~=S+u^& zP&4)Jt3E~_7w0fczTuh8lfY&^l0p}}-gTq4t^t(~FK1@g9`rdMwFmPF`chz0vKbrK zFrNFz@cZXK*3;8-YWnn)G_i~sOiuWI{hdojvHv|GpE*-Iwv0WgCa#soRZs!zc@CfH z2}zqjJuj7XMzH1zPA+Pk6F5kAj5h(P+|J~-Y9DJ35v+_G8Pv6qmc@n6=4imG z!^kkcu1UYF5@!(6?+MSeEVeqB&A~Hz*E1`BXsFWk$4*5`*0ni|HM9X{>(#P=Z{EEZ zi@SpHLA*VYVN5k1^yRmjx`GZR@3vm_5mW^59;ikwyD|73LwOvvwmFKPf30_sW(pO@ zDGZ0!xLJb%B&T^@B@*;lL>1Y{4lPJFFpkh6R=e~z?HyETyl2Xp;U#Cw683vM**_2O z6o&oTyXi5iZE0YXcSM2FsH-2%#2Gz3JU~{AIv-Ddo$~Ha)17wrcHZ9%rF1@>3S3lg zEa9_t1e~4i!y&5y7rV-Fqzj8Aq@PbHB5F=5*UgS;qfCM@?@kd!Gke)3dK1O4_dH9E zh6aF!@5q03dsH8GFOul=A+-s0+Rc9@J8hwkO|6otL0U44_9TRP39OZ+4V|1gXDKYh z&s!o|BgPZ#j#%Z&0av@6Ioca)^D)&HJD^CMh$6nON%hVe$!O9qt&$(NXN92~3e^#R zj=)tPK2Fg{x*F4{vDQnPejMtyAP-qM8(DFv>M60C5%xEyGI=(HJ@8}ccHfXStLkqx zekl%a(vB$eEhx+t*7b(Al8su*HvAiApWUQWIKZ_k$&6Gt!oAS`yvI zTZ>WBnzrcFb~1>2aCMZ$f>7gJ!$<_Dpul-)@uQlhiWS)mtb!K`UvdRf*T^KWjO=B?F_H5wu2qUJ!I8g#GgF{bUB9u>itu5_i&y@(ugEW( zE=RzscEK9smw2W_D4zSbLl)-hfH5YqQn39kUyi?)eq6!wxu)vt6dwg{t?*J<3J znU|8NST*+S3z0Ua6Qq2;9@~(oI38pPwQ$QLrRd^&nI=!j!->v2?L?2C+@r84irXr}A zvT>5fxvS5Ab@8$FuqkVoE?}XGcCU+JpJWe)tTPLae7e)^`9xS4_nI^_WmhJ*}7aZ`0c2U`cEfcC{ zDUk)Ot5-TdmVI9(EvKF*>24>BiEBp`*yY78(9vOILWzxdAN9fY*9b>YdNqE+_KnvD z%=m}(Y+K|ce^M-!K$MDS-e>cyZtjT6;R$6XGD79~w65F(R-w;jh?nik%e-VWKirUQ zcC3oNGya-(hwq0AZh^j>Nlw=?7CJsCvA9;+v9@61%=pN)txpF;A$GB|WDi3Xb=#DB zBJ1i@7wb8hQ8ARKd9EEiWcY+nE9pcBV41^u9%9S~a+%k+KbG2NEK$6IgnXT0e8p;e zDiTg9RP?u#!~;n@#!*Xf-5Au>Hhwymr$7dKlx4X&mLwvRPO!!YwOs7RFQJ}w;LV$^ZqW!n%Lx- zcMU&sbR9fxLoFMG>0h)f<&XdVDG+K{@$MMpyIeg%B6QMD1;SjIaa1|rMQ?CzdWULD zV-c9ITY{1-Cs@)%{E+iN&SYl1%@APT0l2blZG3{)fbo$e`tms6)D1D}AmI1~=xIIM zJ3DCIRVHi^uSP&r$HQEPfcS@m0KUz_2K$LfF+O&xY0`0?OrU#Q2>A&mlc34ZU((@F z>Iljy&v8ZdD`!fvJcR=d3L=|~+HI2Y$+7-S9*Dax(}41WyB+NVKVggKlG|^t4Df_> zqty-p`;joJV+ar~xcr+b;&VFt2j0|Ki_}@l66!kc-D&-%9f(v3L3SkRKR?(Chq)Kx z<``pw7>P;uNRUn&Ah(}3TL#i7VYZ4^^o?zNX%t2s=_qf+oj7<4kS_IU_A!5^j}^HD zTmEzhhkuH@?~N43)7i6RzoK;C?8!I2@*zmlR^@MU+#ihxQpXchZ(?y(zM&s>a`Ma}$kYK{}yGy#8++#|c*RT>@kr`(L zR9Fm%PUUgAcXr~Q_OyIFfo2APqJ`J%#&lwJ>YpVQyE`2C=jh|Ai)49aeQuuTgNfq$ z>(r)nhe6nUT+E2!n<;H=RE`{O1_5*|gD8^Xas--1q?4Toalr{WewMhNR%m`tPr-w*zxyUlztuKoUjP8j zSbz0R>LlSk@?YGJPK z?rP&`<@Q%U)qY|`sfQf{?HPHNIwR zcM@d1^fG7}V7Ms|3wAww4WCWQE`{*Xjw7Q0&XnvAeCgU6gX&n7g3gvuMvII^>AY`0 z_*GKt(a3_<^pk><@I+6d*jy~r<(pYtVh2;@lbcdbDtpbS59JhG`Ys)8M2#8pm5SR0 z51BKiKan>CL{5FQ(ALzXIn@4wm8i5U%ArB<>g+rm=%61wo^X-_>?TgU2r-+%r@m+9`J3xQveg zCTOm5ckvqDFxqY#c)0O9}e{(fr16GRE%K4SPJ|6AHW z*Kz+>28aiD2H_`?Bf*ix!6g5;F#2~eOUQGXL__lTW{&iHRPl@g`gcBH`!$?HoQ&k3 zIQ>se+SGq(f^&-DKSwGUEZj%2D-G2m37~%Hfw9o692>5~N8H5O4F3Jep zW&IoPP!u1&CP9QN1_c1joy=5Rot)j+%skxOogCl@;&|{p2_hgjH@riFlH`9^I}iZC y{s;c~4Z;V9Nl?Kxq5p2zPXYLoxEL@@7%nb>hllj+IsgFY`7V3jhTe(%Mg13bkRnh3 diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 2bfc830c4712..433d652b07ec 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -309,6 +309,49 @@ def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None: ) +@pytest.mark.parametrize( + ("read_spreadsheet", "source", "params"), + [ + # TODO: uncomment once fastexcel offers a suitable param + # (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), + (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}), + (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}), + ], +) +def test_read_dropped_cols( + read_spreadsheet: Callable[..., dict[str, pl.DataFrame]], + source: str, + params: dict[str, str], + request: pytest.FixtureRequest, +) -> None: + spreadsheet_path = request.getfixturevalue(source) + + df1 = read_spreadsheet( + spreadsheet_path, + sheet_name="test4", + **params, + ) + df2 = read_spreadsheet( + spreadsheet_path, + sheet_name="test4", + drop_empty_cols=False, + **params, + ) + assert df1.to_dict(as_series=False) == { # type: ignore[attr-defined] + "cardinality": [1, 3, 15, 30, 150, 300], + "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677], + "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945], + } + assert df2.to_dict(as_series=False) == { # type: ignore[attr-defined] + "": [None, None, None, None, None, None], + "cardinality": [1, 3, 15, 30, 150, 300], + "rows_by_key": [0.05059, 0.04478, 0.04414, 0.05245, 0.05395, 0.05677], + "iter_groups": [0.04806, 0.04223, 0.04774, 0.04864, 0.0572, 0.06945], + "0": [None, None, None, None, None, None], + "1": [None, None, None, None, None, None], + } + + @pytest.mark.parametrize( ("read_spreadsheet", "source", "params"), [