Skip to content

Commit

Permalink
feat: introduce eager loading functions (#147)
Browse files Browse the repository at this point in the history
* chore(deps): Upgrade calamine 0.22.1 -> 0.23.0

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* feat: introduce eager loading functions

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* adapt to recent changes

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* feat: added support for schema_sample_rows

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* solve merge conflicts

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* adapt to recent changes on main

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* adapt to recent changes on main

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* adapt error message

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* adapt to recent changes on main

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* fat refactor, might support non-eager by-ref

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* add iterations to test.py

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* remove unused file

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* adapt to recent changes on main

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* fix: ensure eager=True always returns a RecordBatch

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* remove commented out code

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* simplify lifetime annotations

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* adapt code to recent changes

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* remove dbg!

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* fix typing

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* chore: clippy rust 1.79

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

* docs: improve docstrings

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>

---------

Signed-off-by: Luka Peschke <luka.peschke@toucantoco.com>
  • Loading branch information
lukapeschke authored Jul 1, 2024
1 parent 4332278 commit 2147bb5
Show file tree
Hide file tree
Showing 13 changed files with 831 additions and 361 deletions.
74 changes: 51 additions & 23 deletions python/fastexcel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def load_sheet(
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
) -> ExcelSheet:
"""Loads a sheet by index or name.
"""Loads a sheet lazily by index or name.
:param idx_or_name: The index (starting at 0) or the name of the sheet to load.
:param header_row: The index of the row containing the column labels, default index is 0.
Expand Down Expand Up @@ -165,9 +165,41 @@ def load_sheet(
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
eager=False,
)
)

def load_sheet_eager(
self,
idx_or_name: int | str,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
) -> pa.RecordBatch:
"""Loads a sheet eagerly by index or name.
For xlsx files, this will be faster and more memory-efficient, as it will use
`worksheet_range_ref` under the hood, which returns borrowed types.
Refer to `load_sheet` for parameter documentation
"""
return self._reader.load_sheet(
idx_or_name=idx_or_name,
header_row=header_row,
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
eager=True,
)

def load_sheet_by_name(
self,
name: str,
Expand All @@ -184,17 +216,15 @@ def load_sheet_by_name(
Refer to `load_sheet` for parameter documentation
"""
return ExcelSheet(
self._reader.load_sheet(
name,
header_row=header_row,
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)
return self.load_sheet(
name,
header_row=header_row,
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)

def load_sheet_by_idx(
Expand All @@ -213,17 +243,15 @@ def load_sheet_by_idx(
Refer to `load_sheet` for parameter documentation
"""
return ExcelSheet(
self._reader.load_sheet(
idx,
header_row=header_row,
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)
return self.load_sheet(
idx,
header_row=header_row,
column_names=column_names,
skip_rows=skip_rows,
n_rows=n_rows,
schema_sample_rows=schema_sample_rows,
use_columns=use_columns,
dtypes=dtypes,
)

def __repr__(self) -> str:
Expand Down
17 changes: 17 additions & 0 deletions python/fastexcel/_fastexcel.pyi
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import typing
from typing import Literal

import pyarrow as pa
Expand Down Expand Up @@ -61,6 +62,7 @@ class _ExcelSheet:
class _ExcelReader:
"""A class representing an open Excel file and allowing to read its sheets"""

@typing.overload
def load_sheet(
self,
idx_or_name: str | int,
Expand All @@ -72,7 +74,22 @@ class _ExcelReader:
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
eager: Literal[False] = ...,
) -> _ExcelSheet: ...
@typing.overload
def load_sheet(
self,
idx_or_name: str | int,
*,
header_row: int | None = 0,
column_names: list[str] | None = None,
skip_rows: int = 0,
n_rows: int | None = None,
schema_sample_rows: int | None = 1_000,
use_columns: list[str] | list[int] | str | None = None,
dtypes: DTypeMap | None = None,
eager: Literal[True] = ...,
) -> pa.RecordBatch: ...
@property
def sheet_names(self) -> list[str]: ...

Expand Down
54 changes: 54 additions & 0 deletions python/tests/test_eagerness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from datetime import date, datetime, timedelta

import fastexcel
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from pyarrow import RecordBatch
from utils import path_for_fixture


def test_load_sheet_eager_single_sheet() -> None:
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx"))

eager_pandas = excel_reader.load_sheet_eager(0).to_pandas()
lazy_pandas = excel_reader.load_sheet(0).to_pandas()
pd_assert_frame_equal(eager_pandas, lazy_pandas)

eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager(0))
assert isinstance(eager_polars, pl.DataFrame)
lazy_polars = excel_reader.load_sheet(0).to_polars()
pl_assert_frame_equal(eager_polars, lazy_polars)


def test_multiple_sheets_with_unnamed_columns():
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx"))

eager_pandas = excel_reader.load_sheet_eager("With unnamed columns").to_pandas()
lazy_pandas = excel_reader.load_sheet("With unnamed columns").to_pandas()
pd_assert_frame_equal(eager_pandas, lazy_pandas)

eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager("With unnamed columns"))
assert isinstance(eager_polars, pl.DataFrame)
lazy_polars = excel_reader.load_sheet("With unnamed columns").to_polars()
pl_assert_frame_equal(eager_polars, lazy_polars)


def test_eager_with_an_ods_file_should_return_a_recordbatch() -> None:
ods_reader = fastexcel.read_excel(path_for_fixture("dates.ods"))

record_batch = ods_reader.load_sheet_eager(0)
assert isinstance(record_batch, RecordBatch)
pl_df = pl.from_arrow(record_batch)
assert isinstance(pl_df, pl.DataFrame)
pl_assert_frame_equal(
pl_df,
pl.DataFrame(
{
"date": [date(2023, 6, 1)],
"datestr": ["2023-06-01T02:03:04+02:00"],
"time": [timedelta(hours=1, minutes=2, seconds=3)],
"datetime": [datetime(2023, 6, 1, 2, 3, 4)],
}
).with_columns(*(pl.col(col).dt.cast_time_unit("ms") for col in ("datetime", "time"))),
)
18 changes: 18 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::{error::Error, fmt::Display};

use calamine::XlsxError;

use crate::types::idx_or_name::IdxOrName;

#[derive(Debug)]
Expand All @@ -14,6 +16,7 @@ pub(crate) enum FastExcelErrorKind {
// the actual type has not much value for us, so we just store a string context
ArrowError(String),
InvalidParameters(String),
Internal(String),
}

impl Display for FastExcelErrorKind {
Expand Down Expand Up @@ -41,6 +44,7 @@ impl Display for FastExcelErrorKind {
}
FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"),
FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"),
FastExcelErrorKind::Internal(err) => write!(f, "fastexcel error: {err}"),
}
}
}
Expand Down Expand Up @@ -99,6 +103,12 @@ impl From<FastExcelErrorKind> for FastExcelError {
}
}

impl From<XlsxError> for FastExcelError {
fn from(err: XlsxError) -> Self {
FastExcelErrorKind::CalamineError(calamine::Error::Xlsx(err)).into()
}
}

pub(crate) type FastExcelResult<T> = Result<T, FastExcelError>;

impl<T> ErrorContext for FastExcelResult<T> {
Expand Down Expand Up @@ -181,6 +191,13 @@ pub(crate) mod py_errors {
FastExcelError,
"Provided parameters are invalid"
);
// Internal error
create_exception!(
_fastexcel,
InternalError,
FastExcelError,
"Internal fastexcel error"
);

pub(crate) trait IntoPyResult {
type Inner;
Expand Down Expand Up @@ -217,6 +234,7 @@ pub(crate) mod py_errors {
FastExcelErrorKind::InvalidParameters(_) => {
InvalidParametersError::new_err(message)
}
FastExcelErrorKind::Internal(_) => ArrowError::new_err(message),
})
}
}
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
mod error;
mod types;
mod utils;

use error::{py_errors, ErrorContext};
use pyo3::prelude::*;
Expand Down
Loading

0 comments on commit 2147bb5

Please sign in to comment.