Skip to content

Commit

Permalink
fix(python): fix dtype refinement check in read_excel when using the new "calamine" engine
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jan 27, 2024
1 parent c088edb commit 9dfa406
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 8 deletions.
17 changes: 9 additions & 8 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,26 +766,27 @@ def _read_spreadsheet_calamine(

df = _drop_null_data(df, raise_if_empty=raise_if_empty)

# calamine may read integer data as float; cast back to int where possible.
# do a similar downcast check for datetime -> date dtypes.
# refine dtypes
type_checks = []
for c, dtype in df.schema.items():
# may read integer data as float; cast back to int where possible.
if dtype in FLOAT_DTYPES:
check_cast = [F.col(c).floor().eq_missing(F.col(c)), F.col(c).cast(Int64)]
check_cast = [F.col(c).floor().eq(F.col(c)), F.col(c).cast(Int64)]
type_checks.append(check_cast)
# do a similar check for datetime columns that have only 00:00:00 times.
elif dtype == Datetime:
check_cast = [
F.col(c).drop_nulls().dt.time().eq_missing(time(0, 0, 0)),
F.col(c).dt.time().eq(time(0, 0, 0)),
F.col(c).cast(Date),
]
type_checks.append(check_cast)

if type_checks:
apply_downcast = df.select([d[0] for d in type_checks]).row(0)

# do a similar check for datetime columns that have only 00:00:00 times.
apply_cast = df.select(
[d[0].all(ignore_nulls=True) for d in type_checks],
).row(0)
if downcast := [
cast for apply, (_, cast) in zip(apply_downcast, type_checks) if apply
cast for apply, (_, cast) in zip(apply_cast, type_checks) if apply
]:
df = df.with_columns(*downcast)

Expand Down
43 changes: 43 additions & 0 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,49 @@ def test_excel_hidden_columns(
assert_frame_equal(df, read_df)


@pytest.mark.parametrize(
    "engine",
    [
        "xlsx2csv",
        "openpyxl",
        pytest.param(
            "calamine",
            marks=pytest.mark.skipif(
                sys.platform == "win32",
                reason="fastexcel not yet available on Windows",
            ),
        ),
    ],
)
def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:
    """Round-trip a frame containing nulls through Excel with the given engine.

    Every column holds at least one null. The frame is written to an in-memory
    workbook, read back with ``engine``, and compared against the original.
    """
    # int, float (with a non-integral value), string, bool, date and datetime
    # columns; the datetime column mixes a midnight and a non-midnight value.
    expected = pl.DataFrame(
        {
            "a": [1, 2, None],
            "b": [1.0, None, 3.5],
            "c": ["x", None, "z"],
            "d": [True, False, None],
            "e": [date(2023, 1, 1), None, date(2023, 1, 4)],
            "f": [
                datetime(2023, 1, 1),
                datetime(2000, 10, 10, 10, 10),
                None,
            ],
        }
    )

    workbook = BytesIO()
    expected.write_excel(workbook)

    result = pl.read_excel(
        workbook,
        engine=engine,
        schema_overrides={
            "e": pl.Date,
            "f": pl.Datetime("us"),
        },
    )
    assert_frame_equal(expected, result)


def test_invalid_engine_options() -> None:
# read_csv_options only applicable with 'xlsx2csv' engine
with pytest.raises(ValueError, match="cannot specify `read_csv_options`"):
Expand Down

0 comments on commit 9dfa406

Please sign in to comment.