fix(python): fix dtype refinement check in read_excel when using the new "calamine" engine
pola-rs · Jan 27, 2024 · 9dfa406 · 9dfa406
1 parent c088edb
commit 9dfa406
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 8 deletions.
diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
@@ -766,26 +766,27 @@ def _read_spreadsheet_calamine(
 
     df = _drop_null_data(df, raise_if_empty=raise_if_empty)
 
-    # calamine may read integer data as float; cast back to int where possible.
-    # do a similar downcast check for datetime -> date dtypes.
+    # refine dtypes
     type_checks = []
     for c, dtype in df.schema.items():
+        # may read integer data as float; cast back to int where possible.
         if dtype in FLOAT_DTYPES:
-            check_cast = [F.col(c).floor().eq_missing(F.col(c)), F.col(c).cast(Int64)]
+            check_cast = [F.col(c).floor().eq(F.col(c)), F.col(c).cast(Int64)]
             type_checks.append(check_cast)
+        # do a similar check for datetime columns that have only 00:00:00 times.
         elif dtype == Datetime:
             check_cast = [
-                F.col(c).drop_nulls().dt.time().eq_missing(time(0, 0, 0)),
+                F.col(c).dt.time().eq(time(0, 0, 0)),
                 F.col(c).cast(Date),
             ]
             type_checks.append(check_cast)
 
     if type_checks:
-        apply_downcast = df.select([d[0] for d in type_checks]).row(0)
-
-        # do a similar check for datetime columns that have only 00:00:00 times.
+        apply_cast = df.select(
+            [d[0].all(ignore_nulls=True) for d in type_checks],
+        ).row(0)
         if downcast := [
-            cast for apply, (_, cast) in zip(apply_downcast, type_checks) if apply
+            cast for apply, (_, cast) in zip(apply_cast, type_checks) if apply
         ]:
             df = df.with_columns(*downcast)
 

diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py
@@ -887,6 +887,49 @@ def test_excel_hidden_columns(
     assert_frame_equal(df, read_df)
 
 
+@pytest.mark.parametrize(
+    "engine",
+    [
+        "xlsx2csv",
+        "openpyxl",
+        pytest.param(
+            "calamine",
+            marks=pytest.mark.skipif(
+                sys.platform == "win32",
+                reason="fastexcel not yet available on Windows",
+            ),
+        ),
+    ],
+)
+def test_excel_type_inference_with_nulls(engine: ExcelSpreadsheetEngine) -> None:
+    df = pl.DataFrame(
+        {
+            "a": [1, 2, None],
+            "b": [1.0, None, 3.5],
+            "c": ["x", None, "z"],
+            "d": [True, False, None],
+            "e": [date(2023, 1, 1), None, date(2023, 1, 4)],
+            "f": [
+                datetime(2023, 1, 1),
+                datetime(2000, 10, 10, 10, 10),
+                None,
+            ],
+        }
+    )
+    xls = BytesIO()
+    df.write_excel(xls)
+
+    read_df = pl.read_excel(
+        xls,
+        engine=engine,
+        schema_overrides={
+            "e": pl.Date,
+            "f": pl.Datetime("us"),
+        },
+    )
+    assert_frame_equal(df, read_df)
+
+
 def test_invalid_engine_options() -> None:
     # read_csv_options only applicable with 'xlsx2csv' engine
     with pytest.raises(ValueError, match="cannot specify `read_csv_options`"):