ToucanToco · lukapeschke · Jul 1, 2024 · Dec 13, 2023 · Dec 22, 2023 · Feb 20, 2024
diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py
@@ -131,7 +131,7 @@ def load_sheet(
         use_columns: list[str] | list[int] | str | None = None,
         dtypes: DTypeMap | None = None,
     ) -> ExcelSheet:
-        """Loads a sheet by index or name.
+        """Loads a sheet lazily by index or name.
 
         :param idx_or_name: The index (starting at 0) or the name of the sheet to load.
         :param header_row: The index of the row containing the column labels, default index is 0.
@@ -165,9 +165,41 @@ def load_sheet(
                 schema_sample_rows=schema_sample_rows,
                 use_columns=use_columns,
                 dtypes=dtypes,
+                eager=False,
             )
         )
 
+    def load_sheet_eager(
+        self,
+        idx_or_name: int | str,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int = 0,
+        n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
+        use_columns: list[str] | list[int] | str | None = None,
+        dtypes: DTypeMap | None = None,
+    ) -> pa.RecordBatch:
+        """Loads a sheet eagerly by index or name, returning a `pa.RecordBatch`.
+
+        For xlsx files, this will be faster and more memory-efficient, as it will use
+        `worksheet_range_ref` under the hood, which returns borrowed types.
+
+        Refer to `load_sheet` for parameter documentation.
+        """
+        return self._reader.load_sheet(
+            idx_or_name=idx_or_name,
+            header_row=header_row,
+            column_names=column_names,
+            skip_rows=skip_rows,
+            n_rows=n_rows,
+            schema_sample_rows=schema_sample_rows,
+            use_columns=use_columns,
+            dtypes=dtypes,
+            eager=True,  # ask the reader for a RecordBatch rather than a lazy sheet
+        )
+
     def load_sheet_by_name(
         self,
         name: str,
@@ -184,17 +216,15 @@ def load_sheet_by_name(
 
         Refer to `load_sheet` for parameter documentation
         """
-        return ExcelSheet(
-            self._reader.load_sheet(
-                name,
-                header_row=header_row,
-                column_names=column_names,
-                skip_rows=skip_rows,
-                n_rows=n_rows,
-                schema_sample_rows=schema_sample_rows,
-                use_columns=use_columns,
-                dtypes=dtypes,
-            )
+        return self.load_sheet(
+            name,
+            header_row=header_row,
+            column_names=column_names,
+            skip_rows=skip_rows,
+            n_rows=n_rows,
+            schema_sample_rows=schema_sample_rows,
+            use_columns=use_columns,
+            dtypes=dtypes,
         )
 
     def load_sheet_by_idx(
@@ -213,17 +243,15 @@ def load_sheet_by_idx(
 
         Refer to `load_sheet` for parameter documentation
         """
-        return ExcelSheet(
-            self._reader.load_sheet(
-                idx,
-                header_row=header_row,
-                column_names=column_names,
-                skip_rows=skip_rows,
-                n_rows=n_rows,
-                schema_sample_rows=schema_sample_rows,
-                use_columns=use_columns,
-                dtypes=dtypes,
-            )
+        return self.load_sheet(
+            idx,
+            header_row=header_row,
+            column_names=column_names,
+            skip_rows=skip_rows,
+            n_rows=n_rows,
+            schema_sample_rows=schema_sample_rows,
+            use_columns=use_columns,
+            dtypes=dtypes,
         )
 
     def __repr__(self) -> str:

diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import typing
 from typing import Literal
 
 import pyarrow as pa
@@ -61,6 +62,7 @@ class _ExcelSheet:
 class _ExcelReader:
     """A class representing an open Excel file and allowing to read its sheets"""
 
+    @typing.overload
     def load_sheet(
         self,
         idx_or_name: str | int,
@@ -72,7 +74,22 @@ class _ExcelReader:
         schema_sample_rows: int | None = 1_000,
         use_columns: list[str] | list[int] | str | None = None,
         dtypes: DTypeMap | None = None,
+        eager: Literal[False] = ...,
     ) -> _ExcelSheet: ...
+    @typing.overload
+    def load_sheet(
+        self,
+        idx_or_name: str | int,
+        *,
+        header_row: int | None = 0,
+        column_names: list[str] | None = None,
+        skip_rows: int = 0,
+        n_rows: int | None = None,
+        schema_sample_rows: int | None = 1_000,
+        use_columns: list[str] | list[int] | str | None = None,
+        dtypes: DTypeMap | None = None,
+        eager: Literal[True],  # no default: omitting `eager` must select the lazy overload
+    ) -> pa.RecordBatch: ...
     @property
     def sheet_names(self) -> list[str]: ...
 

diff --git a/python/tests/test_eagerness.py b/python/tests/test_eagerness.py
@@ -0,0 +1,54 @@
+from datetime import date, datetime, timedelta
+
+import fastexcel
+import polars as pl
+from pandas.testing import assert_frame_equal as pd_assert_frame_equal
+from polars.testing import assert_frame_equal as pl_assert_frame_equal
+from pyarrow import RecordBatch
+from utils import path_for_fixture
+
+
+def test_load_sheet_eager_single_sheet() -> None:
+    """Eager and lazy loading of the same sheet must yield equal pandas and polars frames."""
+    reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))
+
+    # pandas: frame from the eager result vs frame from the lazily loaded sheet
+    pd_assert_frame_equal(reader.load_sheet_eager(0).to_pandas(), reader.load_sheet(0).to_pandas())
+
+    # polars: frame built from the eager result vs the lazy sheet's to_polars()
+    polars_from_eager = pl.from_arrow(data=reader.load_sheet_eager(0))
+    assert isinstance(polars_from_eager, pl.DataFrame)
+    pl_assert_frame_equal(polars_from_eager, reader.load_sheet(0).to_polars())
+
+
+def test_multiple_sheets_with_unnamed_columns() -> None:
+    """Eager and lazy loading must agree on the "With unnamed columns" sheet."""
+    sheet_name = "With unnamed columns"
+    excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))
+
+    eager_pandas = excel_reader.load_sheet_eager(sheet_name).to_pandas()
+    pd_assert_frame_equal(eager_pandas, excel_reader.load_sheet(sheet_name).to_pandas())
+
+    eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager(sheet_name))
+    assert isinstance(eager_polars, pl.DataFrame)
+    pl_assert_frame_equal(eager_polars, excel_reader.load_sheet(sheet_name).to_polars())
+
+
+def test_eager_with_an_ods_file_should_return_a_recordbatch() -> None:
+    """Eager loading of an ods fixture must yield a pyarrow RecordBatch with the expected data."""
+    batch = fastexcel.read_excel(path_for_fixture("dates.ods")).load_sheet_eager(0)
+    assert isinstance(batch, RecordBatch)
+
+    actual = pl.from_arrow(batch)
+    assert isinstance(actual, pl.DataFrame)
+
+    expected_values = {
+        "date": [date(2023, 6, 1)],
+        "datestr": ["2023-06-01T02:03:04+02:00"],
+        "time": [timedelta(hours=1, minutes=2, seconds=3)],
+        "datetime": [datetime(2023, 6, 1, 2, 3, 4)],
+    }
+
+    # The expected temporal columns are cast to millisecond precision before comparing
+    ms_casts = [pl.col(name).dt.cast_time_unit("ms") for name in ("datetime", "time")]
+    pl_assert_frame_equal(actual, pl.DataFrame(expected_values).with_columns(*ms_casts))
diff --git a/src/error.rs b/src/error.rs
@@ -1,5 +1,7 @@
 use std::{error::Error, fmt::Display};
 
+use calamine::XlsxError;
+
 use crate::types::idx_or_name::IdxOrName;
 
 #[derive(Debug)]
@@ -14,6 +16,7 @@ pub(crate) enum FastExcelErrorKind {
     // the actual type has not much value for us, so we just store a string context
     ArrowError(String),
     InvalidParameters(String),
+    Internal(String),
 }
 
 impl Display for FastExcelErrorKind {
@@ -41,6 +44,7 @@ impl Display for FastExcelErrorKind {
             }
             FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"),
             FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"),
+            FastExcelErrorKind::Internal(err) => write!(f, "fastexcel error: {err}"),
         }
     }
 }
@@ -99,6 +103,12 @@ impl From<FastExcelErrorKind> for FastExcelError {
     }
 }
 
+impl From<XlsxError> for FastExcelError {
+    fn from(xlsx_err: XlsxError) -> Self { // wrapped into the generic CalamineError kind
+        Self::from(FastExcelErrorKind::CalamineError(calamine::Error::Xlsx(xlsx_err)))
+    }
+}
+
 pub(crate) type FastExcelResult<T> = Result<T, FastExcelError>;
 
 impl<T> ErrorContext for FastExcelResult<T> {
@@ -181,6 +191,13 @@ pub(crate) mod py_errors {
         FastExcelError,
         "Provided parameters are invalid"
     );
+    // Internal error: Python exception class `InternalError`, with `FastExcelError` as its base
+    create_exception!(
+        _fastexcel,
+        InternalError,
+        FastExcelError,
+        "Internal fastexcel error"
+    );
 
     pub(crate) trait IntoPyResult {
         type Inner;
@@ -217,6 +234,7 @@ pub(crate) mod py_errors {
                         FastExcelErrorKind::InvalidParameters(_) => {
                             InvalidParametersError::new_err(message)
                         }
+                        FastExcelErrorKind::Internal(_) => InternalError::new_err(message),
                     })
                 }
             }

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,5 +1,6 @@
 mod error;
 mod types;
+mod utils;
 
 use error::{py_errors, ErrorContext};
 use pyo3::prelude::*;