diff --git a/python/fastexcel/__init__.py b/python/fastexcel/__init__.py index 8717e1b..60415cd 100644 --- a/python/fastexcel/__init__.py +++ b/python/fastexcel/__init__.py @@ -131,7 +131,7 @@ def load_sheet( use_columns: list[str] | list[int] | str | None = None, dtypes: DTypeMap | None = None, ) -> ExcelSheet: - """Loads a sheet by index or name. + """Loads a sheet lazily by index or name. :param idx_or_name: The index (starting at 0) or the name of the sheet to load. :param header_row: The index of the row containing the column labels, default index is 0. @@ -165,9 +165,41 @@ def load_sheet( schema_sample_rows=schema_sample_rows, use_columns=use_columns, dtypes=dtypes, + eager=False, ) ) + def load_sheet_eager( + self, + idx_or_name: int | str, + *, + header_row: int | None = 0, + column_names: list[str] | None = None, + skip_rows: int = 0, + n_rows: int | None = None, + schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | str | None = None, + dtypes: DTypeMap | None = None, + ) -> pa.RecordBatch: + """Loads a sheet eagerly by index or name. + + For xlsx files, this will be faster and more memory-efficient, as it will use + `worksheet_range_ref` under the hood, which returns borrowed types. 
+ + Refer to `load_sheet` for parameter documentation + """ + return self._reader.load_sheet( + idx_or_name=idx_or_name, + header_row=header_row, + column_names=column_names, + skip_rows=skip_rows, + n_rows=n_rows, + schema_sample_rows=schema_sample_rows, + use_columns=use_columns, + dtypes=dtypes, + eager=True, + ) + def load_sheet_by_name( self, name: str, @@ -184,17 +216,15 @@ def load_sheet_by_name( Refer to `load_sheet` for parameter documentation """ - return ExcelSheet( - self._reader.load_sheet( - name, - header_row=header_row, - column_names=column_names, - skip_rows=skip_rows, - n_rows=n_rows, - schema_sample_rows=schema_sample_rows, - use_columns=use_columns, - dtypes=dtypes, - ) + return self.load_sheet( + name, + header_row=header_row, + column_names=column_names, + skip_rows=skip_rows, + n_rows=n_rows, + schema_sample_rows=schema_sample_rows, + use_columns=use_columns, + dtypes=dtypes, ) def load_sheet_by_idx( @@ -213,17 +243,15 @@ def load_sheet_by_idx( Refer to `load_sheet` for parameter documentation """ - return ExcelSheet( - self._reader.load_sheet( - idx, - header_row=header_row, - column_names=column_names, - skip_rows=skip_rows, - n_rows=n_rows, - schema_sample_rows=schema_sample_rows, - use_columns=use_columns, - dtypes=dtypes, - ) + return self.load_sheet( + idx, + header_row=header_row, + column_names=column_names, + skip_rows=skip_rows, + n_rows=n_rows, + schema_sample_rows=schema_sample_rows, + use_columns=use_columns, + dtypes=dtypes, ) def __repr__(self) -> str: diff --git a/python/fastexcel/_fastexcel.pyi b/python/fastexcel/_fastexcel.pyi index 0d6d3e1..59e892e 100644 --- a/python/fastexcel/_fastexcel.pyi +++ b/python/fastexcel/_fastexcel.pyi @@ -1,5 +1,6 @@ from __future__ import annotations +import typing from typing import Literal import pyarrow as pa @@ -61,6 +62,7 @@ class _ExcelSheet: class _ExcelReader: """A class representing an open Excel file and allowing to read its sheets""" + @typing.overload def load_sheet( self, 
idx_or_name: str | int, @@ -72,7 +74,22 @@ class _ExcelReader: schema_sample_rows: int | None = 1_000, use_columns: list[str] | list[int] | str | None = None, dtypes: DTypeMap | None = None, + eager: Literal[False] = ..., ) -> _ExcelSheet: ... + @typing.overload + def load_sheet( + self, + idx_or_name: str | int, + *, + header_row: int | None = 0, + column_names: list[str] | None = None, + skip_rows: int = 0, + n_rows: int | None = None, + schema_sample_rows: int | None = 1_000, + use_columns: list[str] | list[int] | str | None = None, + dtypes: DTypeMap | None = None, + eager: Literal[True] = ..., + ) -> pa.RecordBatch: ... @property def sheet_names(self) -> list[str]: ... diff --git a/python/tests/test_eagerness.py b/python/tests/test_eagerness.py new file mode 100644 index 0000000..80baba9 --- /dev/null +++ b/python/tests/test_eagerness.py @@ -0,0 +1,54 @@ +from datetime import date, datetime, timedelta + +import fastexcel +import polars as pl +from pandas.testing import assert_frame_equal as pd_assert_frame_equal +from polars.testing import assert_frame_equal as pl_assert_frame_equal +from pyarrow import RecordBatch +from utils import path_for_fixture + + +def test_load_sheet_eager_single_sheet() -> None: + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet.xlsx")) + + eager_pandas = excel_reader.load_sheet_eager(0).to_pandas() + lazy_pandas = excel_reader.load_sheet(0).to_pandas() + pd_assert_frame_equal(eager_pandas, lazy_pandas) + + eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager(0)) + assert isinstance(eager_polars, pl.DataFrame) + lazy_polars = excel_reader.load_sheet(0).to_polars() + pl_assert_frame_equal(eager_polars, lazy_polars) + + +def test_multiple_sheets_with_unnamed_columns(): + excel_reader = fastexcel.read_excel(path_for_fixture("fixture-multi-sheet.xlsx")) + + eager_pandas = excel_reader.load_sheet_eager("With unnamed columns").to_pandas() + lazy_pandas = excel_reader.load_sheet("With unnamed 
columns").to_pandas() + pd_assert_frame_equal(eager_pandas, lazy_pandas) + + eager_polars = pl.from_arrow(data=excel_reader.load_sheet_eager("With unnamed columns")) + assert isinstance(eager_polars, pl.DataFrame) + lazy_polars = excel_reader.load_sheet("With unnamed columns").to_polars() + pl_assert_frame_equal(eager_polars, lazy_polars) + + +def test_eager_with_an_ods_file_should_return_a_recordbatch() -> None: + ods_reader = fastexcel.read_excel(path_for_fixture("dates.ods")) + + record_batch = ods_reader.load_sheet_eager(0) + assert isinstance(record_batch, RecordBatch) + pl_df = pl.from_arrow(record_batch) + assert isinstance(pl_df, pl.DataFrame) + pl_assert_frame_equal( + pl_df, + pl.DataFrame( + { + "date": [date(2023, 6, 1)], + "datestr": ["2023-06-01T02:03:04+02:00"], + "time": [timedelta(hours=1, minutes=2, seconds=3)], + "datetime": [datetime(2023, 6, 1, 2, 3, 4)], + } + ).with_columns(*(pl.col(col).dt.cast_time_unit("ms") for col in ("datetime", "time"))), + ) diff --git a/src/error.rs b/src/error.rs index e5e2226..29a1233 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error, fmt::Display}; +use calamine::XlsxError; + use crate::types::idx_or_name::IdxOrName; #[derive(Debug)] @@ -14,6 +16,7 @@ pub(crate) enum FastExcelErrorKind { // the actual type has not much value for us, so we just store a string context ArrowError(String), InvalidParameters(String), + Internal(String), } impl Display for FastExcelErrorKind { @@ -41,6 +44,7 @@ impl Display for FastExcelErrorKind { } FastExcelErrorKind::ArrowError(err) => write!(f, "arrow error: {err}"), FastExcelErrorKind::InvalidParameters(err) => write!(f, "invalid parameters: {err}"), + FastExcelErrorKind::Internal(err) => write!(f, "fastexcel error: {err}"), } } } @@ -99,6 +103,12 @@ impl From for FastExcelError { } } +impl From for FastExcelError { + fn from(err: XlsxError) -> Self { + FastExcelErrorKind::CalamineError(calamine::Error::Xlsx(err)).into() + } +} + pub(crate) type 
FastExcelResult = Result; impl ErrorContext for FastExcelResult { @@ -181,6 +191,13 @@ pub(crate) mod py_errors { FastExcelError, "Provided parameters are invalid" ); + // Internal error + create_exception!( + _fastexcel, + InternalError, + FastExcelError, + "Internal fastexcel error" + ); pub(crate) trait IntoPyResult { type Inner; @@ -217,6 +234,7 @@ pub(crate) mod py_errors { FastExcelErrorKind::InvalidParameters(_) => { InvalidParametersError::new_err(message) } + FastExcelErrorKind::Internal(_) => InternalError::new_err(message), }) } } diff --git a/src/lib.rs b/src/lib.rs index 3ae7070..2fe608e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ mod error; mod types; +mod utils; use error::{py_errors, ErrorContext}; use pyo3::prelude::*; diff --git a/src/types/dtype.rs b/src/types/dtype.rs index fb544ba..e77fe53 100644 --- a/src/types/dtype.rs +++ b/src/types/dtype.rs @@ -1,11 +1,12 @@ use std::{ collections::{HashMap, HashSet}, + fmt::{Debug, Display}, str::FromStr, sync::OnceLock, }; use arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; -use calamine::{CellErrorType, Data as CalData, DataType, Range}; +use calamine::{CellErrorType, CellType, DataType, Range}; use pyo3::{FromPyObject, PyAny, PyObject, PyResult, Python, ToPyObject}; use crate::error::{py_errors::IntoPyResult, FastExcelError, FastExcelErrorKind, FastExcelResult}; @@ -45,9 +46,9 @@ impl FromStr for DType { } } -impl ToString for DType { - fn to_string(&self) -> String { - match self { +impl Display for DType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { DType::Null => "null", DType::Int => "int", DType::Float => "float", @@ -56,8 +57,7 @@ impl ToString for DType { DType::DateTime => "datetime", DType::Date => "date", DType::Duration => "duration", - } - .to_string() + }) } } @@ -104,42 +104,68 @@ const NULL_STRING_VALUES: [&str; 19] = [ "", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null", ]; -fn get_cell_dtype(data: &Range, 
row: usize, col: usize) -> FastExcelResult { +fn get_cell_dtype( + data: &Range
, + row: usize, + col: usize, +) -> FastExcelResult { let cell = data .get((row, col)) .ok_or_else(|| FastExcelErrorKind::CannotRetrieveCellData(row, col))?; - match cell { - CalData::Int(_) => Ok(DType::Int), - CalData::Float(_) => Ok(DType::Float), - CalData::String(v) => match v { - v if NULL_STRING_VALUES.contains(&v.as_str()) => Ok(DType::Null), - _ => Ok(DType::String), - }, - CalData::Bool(_) => Ok(DType::Bool), + if cell.is_int() { + Ok(DType::Int) + } else if cell.is_float() { + Ok(DType::Float) + } else if cell.is_string() { + if NULL_STRING_VALUES.contains(&cell.get_string().unwrap()) { + Ok(DType::Null) + } else { + Ok(DType::String) + } + } else if cell.is_bool() { + Ok(DType::Bool) + } else if cell.is_datetime() { // Since calamine 0.24.0, a new ExcelDateTime exists for the Datetime type. It can either be // a duration or a datatime - CalData::DateTime(excel_datetime) => Ok(if excel_datetime.is_datetime() { + let excel_datetime = cell + .get_datetime() + .expect("calamine indicated that cell is a datetime but get_datetime returned None"); + Ok(if excel_datetime.is_datetime() { DType::DateTime } else { DType::Duration - }), - // These types contain an ISO8601 representation of a date/datetime or a duration - CalData::DateTimeIso(_) => match cell.as_datetime() { - Some(_) => Ok(DType::DateTime), + }) + } + // These types contain an ISO8601 representation of a date/datetime or a duration + else if cell.is_datetime_iso() { + match cell.as_datetime() { // If we cannot convert the cell to a datetime, we're working on a date + Some(_) => Ok(DType::DateTime), // NOTE: not using the Date64 type on purpose, as pyarrow converts it to a datetime // rather than a date None => Ok(DType::Date), - }, - // A simple duration - CalData::DurationIso(_) => Ok(DType::Duration), - // Errors and nulls - CalData::Error(err) => match err { - CellErrorType::NA | CellErrorType::Value | CellErrorType::Null => Ok(DType::Null), - _ => 
Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()), - }, - CalData::Empty => Ok(DType::Null), + } + } + // Simple durations + else if cell.is_duration_iso() { + Ok(DType::Duration) + } + // Empty cell + else if cell.is_empty() { + Ok(DType::Null) + } else if cell.is_error() { + match cell.get_error() { + // considering cells with #N/A! as null + Some(CellErrorType::NA | CellErrorType::Value | CellErrorType::Null) => Ok(DType::Null), + Some(err) => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()), + None => Err(FastExcelErrorKind::Internal(format!( + "cell is an error but get_error returned None: {cell:?}" + )) + .into()), + } + } else { + Err(FastExcelErrorKind::Internal(format!("unsupported cell type: {cell:?}")).into()) } } @@ -159,8 +185,8 @@ fn string_types() -> &'static HashSet { STRING_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Float, DType::String])) } -pub(crate) fn get_dtype_for_column( - data: &Range, +pub(crate) fn get_dtype_for_column( + data: &Range
, start_row: usize, end_row: usize, col: usize, @@ -198,7 +224,7 @@ pub(crate) fn get_dtype_for_column( #[cfg(test)] mod tests { - use calamine::Cell; + use calamine::{Cell, Data as CalData}; use rstest::{fixture, rstest}; use super::*; diff --git a/src/types/python/excelreader.rs b/src/types/python/excelreader.rs index e295e6c..35fdec6 100644 --- a/src/types/python/excelreader.rs +++ b/src/types/python/excelreader.rs @@ -3,8 +3,15 @@ use std::{ io::{BufReader, Cursor}, }; -use calamine::{open_workbook_auto, open_workbook_auto_from_rs, Data, Range, Reader, Sheets}; -use pyo3::{pyclass, pymethods, PyAny, PyResult}; +use arrow::{ + datatypes::{Field, Schema}, + pyarrow::ToPyArrow, + record_batch::RecordBatch, +}; +use calamine::{ + open_workbook_auto, open_workbook_auto_from_rs, Data, DataRef, Range, Reader, Sheets, +}; +use pyo3::{prelude::PyObject, pyclass, pymethods, IntoPy, PyAny, PyResult, Python}; use crate::{ error::{ @@ -13,6 +20,13 @@ use crate::{ types::{dtype::DTypeMap, idx_or_name::IdxOrName}, }; +use crate::utils::schema::get_schema_sample_rows; + +use super::excelsheet::record_batch_from_data_and_schema; +use super::excelsheet::{ + column_info::{build_available_columns, build_available_columns_info}, + sheet_data::ExcelSheetData, +}; use super::excelsheet::{ExcelSheet, Header, Pagination, SelectedColumns}; enum ExcelSheets { @@ -37,6 +51,25 @@ impl ExcelSheets { Self::Bytes(sheets) => sheets.sheet_names(), } } + + fn supports_by_ref(&self) -> bool { + matches!( + self, + Self::File(Sheets::Xlsx(_)) | Self::Bytes(Sheets::Xlsx(_)) + ) + } + + fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> FastExcelResult>> { + match self { + ExcelSheets::File(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?), + ExcelSheets::Bytes(Sheets::Xlsx(sheets)) => Ok(sheets.worksheet_range_ref(name)?), + _ => Err(FastExcelErrorKind::Internal( + "sheets do not support worksheet_range_ref".to_string(), + ) + .into()), + } + .with_context(|| format!("Error while 
loading sheet {name}")) + } } #[pyclass(name = "_ExcelReader")] @@ -48,6 +81,10 @@ pub(crate) struct ExcelReader { } impl ExcelReader { + fn build_selected_columns(use_columns: Option<&PyAny>) -> FastExcelResult { + use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")) + } + // NOTE: Not implementing TryFrom here, because we're aren't building the file from the passed // string, but rather from the file pointed by it. Semantically, try_from_path is clearer pub(crate) fn try_from_path(path: &str) -> FastExcelResult { @@ -62,8 +99,44 @@ impl ExcelReader { }) } - fn build_selected_columns(use_columns: Option<&PyAny>) -> FastExcelResult { - use_columns.try_into().with_context(|| format!("expected selected columns to be list[str] | list[int] | str | None, got {use_columns:?}")) + fn load_sheet_eager( + data: &ExcelSheetData, + pagination: Pagination, + header: Header, + sample_rows: Option, + selected_columns: &SelectedColumns, + dtypes: Option<&DTypeMap>, + ) -> FastExcelResult { + let offset = header.offset() + pagination.offset(); + let limit = { + let upper_bound = data.height(); + if let Some(n_rows) = pagination.n_rows() { + // minimum value between (offset+n_rows) and the data's height + std::cmp::min(offset + n_rows, upper_bound) + } else { + upper_bound + } + }; + + let sample_rows_limit = get_schema_sample_rows(sample_rows, offset, limit); + let available_columns_info = build_available_columns_info(data, selected_columns, &header)?; + + let available_columns = build_available_columns( + available_columns_info, + data, + offset, + sample_rows_limit, + dtypes, + )?; + + let fields = available_columns + .iter() + .map(Into::::into) + .collect::>(); + + let schema = Schema::new(fields); + + record_batch_from_data_and_schema(schema, data, offset, limit) } #[allow(clippy::too_many_arguments)] @@ -77,21 +150,44 @@ impl ExcelReader { schema_sample_rows: Option, use_columns: 
Option<&PyAny>, dtypes: Option, - ) -> FastExcelResult { - let range = self.sheets.worksheet_range(&name)?; - + eager: bool, + py: Python<'_>, + ) -> PyResult { let header = Header::new(header_row, column_names); - let pagination = Pagination::new(skip_rows, n_rows, &range)?; - let selected_columns = Self::build_selected_columns(use_columns)?; - ExcelSheet::try_new( - name, - range, - header, - pagination, - schema_sample_rows, - selected_columns, - dtypes, - ) + let selected_columns = Self::build_selected_columns(use_columns).into_pyresult()?; + if eager && self.sheets.supports_by_ref() { + let range = self.sheets.worksheet_range_ref(&name).into_pyresult()?; + let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; + Self::load_sheet_eager( + &range.into(), + pagination, + header, + schema_sample_rows, + &selected_columns, + dtypes.as_ref(), + ) + .into_pyresult() + .and_then(|rb| rb.to_pyarrow(py)) + } else { + let range = self.sheets.worksheet_range(&name).into_pyresult()?; + let pagination = Pagination::new(skip_rows, n_rows, &range).into_pyresult()?; + let sheet = ExcelSheet::try_new( + name, + range.into(), + header, + pagination, + schema_sample_rows, + selected_columns, + dtypes, + ) + .into_pyresult()?; + + if eager { + sheet.to_arrow(py) + } else { + Ok(sheet.into_py(py)) + } + } } } @@ -128,6 +224,7 @@ impl ExcelReader { schema_sample_rows = 1_000, use_columns = None, dtypes = None, + eager = false, ))] #[allow(clippy::too_many_arguments)] pub fn load_sheet( @@ -140,7 +237,9 @@ impl ExcelReader { schema_sample_rows: Option, use_columns: Option<&PyAny>, dtypes: Option, - ) -> PyResult { + eager: bool, + py: Python<'_>, + ) -> PyResult { let name = idx_or_name .try_into() .and_then(|idx_or_name| match idx_or_name { @@ -179,7 +278,8 @@ impl ExcelReader { schema_sample_rows, use_columns, dtypes, + eager, + py, ) - .into_pyresult() } } diff --git a/src/types/python/excelsheet/column_info.rs b/src/types/python/excelsheet/column_info.rs 
index 8e502ac..0947d22 100644 --- a/src/types/python/excelsheet/column_info.rs +++ b/src/types/python/excelsheet/column_info.rs @@ -1,6 +1,6 @@ -use std::{str::FromStr, usize}; +use std::{fmt::Display, str::FromStr}; -use calamine::{Data as CalData, Range}; +use arrow::datatypes::Field; use pyo3::{pyclass, pymethods, PyResult}; use crate::{ @@ -8,11 +8,13 @@ use crate::{ py_errors::IntoPyResult, ErrorContext, FastExcelError, FastExcelErrorKind, FastExcelResult, }, types::{ - dtype::{get_dtype_for_column, DType, DTypeMap}, + dtype::{DType, DTypeMap}, idx_or_name::IdxOrName, }, }; +use super::{sheet_data::ExcelSheetData, Header, SelectedColumns}; + #[derive(Debug, Clone, PartialEq)] pub(crate) enum ColumnNameFrom { Provided, @@ -36,14 +38,13 @@ impl FromStr for ColumnNameFrom { } } -impl ToString for ColumnNameFrom { - fn to_string(&self) -> String { - match self { +impl Display for ColumnNameFrom { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { ColumnNameFrom::Provided => "provided", ColumnNameFrom::LookedUp => "looked_up", ColumnNameFrom::Generated => "generated", - } - .to_string() + }) } } @@ -54,14 +55,13 @@ pub(crate) enum DTypeFrom { Guessed, } -impl ToString for DTypeFrom { - fn to_string(&self) -> String { - match self { +impl Display for DTypeFrom { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { DTypeFrom::ProvidedByIndex => "provided_by_index", DTypeFrom::ProvidedByName => "provided_by_name", DTypeFrom::Guessed => "guessed", - } - .to_string() + }) } } @@ -127,6 +127,12 @@ impl ColumnInfo { } } +impl From<&ColumnInfo> for Field { + fn from(col_info: &ColumnInfo) -> Self { + Field::new(col_info.name(), col_info.dtype().into(), true) + } +} + #[pymethods] impl ColumnInfo { /// Creates a new ColumnInfo object. 
@@ -182,7 +188,7 @@ impl ColumnInfo { } pub fn __repr__(&self) -> String { - format!("ColumnInfo(name=\"{name}\", index={index}, dtype=\"{dtype}\", dtype_from=\"{dtype_from}\", column_name_from=\"{column_name_from}\" )", name=self.name, index=self.index, dtype=self.dtype.to_string(), dtype_from=self.dtype_from.to_string(), column_name_from=self.column_name_from.to_string()) + format!("ColumnInfo(name=\"{name}\", index={index}, dtype=\"{dtype}\", dtype_from=\"{dtype_from}\", column_name_from=\"{column_name_from}\" )", name=self.name, index=self.index, dtype=self.dtype, dtype_from=self.dtype_from, column_name_from=self.column_name_from) } pub fn __eq__(&self, other: &Self) -> bool { @@ -191,7 +197,7 @@ impl ColumnInfo { } #[derive(Debug)] -pub(super) struct ColumnInfoBuilder { +pub(crate) struct ColumnInfoBuilder { name: String, index: usize, column_name_from: ColumnNameFrom, @@ -227,7 +233,7 @@ impl ColumnInfoBuilder { fn dtype_info( &self, - data: &Range, + data: &ExcelSheetData<'_>, start_row: usize, end_row: usize, specified_dtypes: Option<&DTypeMap>, @@ -247,14 +253,14 @@ impl ColumnInfoBuilder { .map(FastExcelResult::Ok) // If we could not look up a dtype, guess it from the data .unwrap_or_else(|| { - get_dtype_for_column(data, start_row, end_row, self.index) + data.dtype_for_column(start_row, end_row, self.index) .map(|dtype| (dtype, DTypeFrom::Guessed)) }) } pub(super) fn finish( self, - data: &Range, + data: &ExcelSheetData<'_>, start_row: usize, end_row: usize, specified_dtypes: Option<&DTypeMap>, @@ -271,3 +277,144 @@ impl ColumnInfoBuilder { )) } } + +pub(crate) fn build_available_columns_info( + data: &ExcelSheetData<'_>, + selected_columns: &SelectedColumns, + header: &Header, +) -> FastExcelResult> { + let width = data.width(); + match header { + Header::None => Ok((0..width) + .map(|col_idx| { + ColumnInfoBuilder::new( + format!("__UNNAMED__{col_idx}"), + col_idx, + ColumnNameFrom::Generated, + ) + }) + .collect()), + Header::At(row_idx) => 
Ok((0..width) + .map(|col_idx| { + data.get_as_string((*row_idx, col_idx)) + .map(|col_name| { + ColumnInfoBuilder::new(col_name, col_idx, ColumnNameFrom::LookedUp) + }) + .unwrap_or_else(|| { + ColumnInfoBuilder::new( + format!("__UNNAMED__{col_idx}"), + col_idx, + ColumnNameFrom::Generated, + ) + }) + }) + .collect()), + Header::With(names) => { + if let SelectedColumns::Selection(column_selection) = selected_columns { + if column_selection.len() != names.len() { + return Err(FastExcelErrorKind::InvalidParameters( + "column_names and use_columns must have the same length".to_string(), + ) + .into()); + } + let selected_indices = column_selection + .iter() + .map(|idx_or_name| { + match idx_or_name { + IdxOrName::Idx(idx) => Ok(*idx), + IdxOrName::Name(name) => Err(FastExcelErrorKind::InvalidParameters( + format!("use_columns can only contain integers when used with columns_names, got \"{name}\"") + ) + .into()), + } + }) + .collect::>>()?; + + Ok((0..width) + .map(|col_idx| { + let provided_name_opt = if let Some(pos_in_names) = + selected_indices.iter().position(|idx| idx == &col_idx) + { + names.get(pos_in_names).cloned() + } else { + None + }; + + match provided_name_opt { + Some(provided_name) => ColumnInfoBuilder::new( + provided_name, + col_idx, + ColumnNameFrom::Provided, + ), + None => ColumnInfoBuilder::new( + format!("__UNNAMED__{col_idx}"), + col_idx, + ColumnNameFrom::Generated, + ), + } + }) + .collect()) + } else { + let nameless_start_idx = names.len(); + Ok(names + .iter() + .enumerate() + .map(|(col_idx, name)| { + ColumnInfoBuilder::new(name.to_owned(), col_idx, ColumnNameFrom::Provided) + }) + .chain((nameless_start_idx..width).map(|col_idx| { + ColumnInfoBuilder::new( + format!("__UNNAMED__{col_idx}"), + col_idx, + ColumnNameFrom::Generated, + ) + })) + .collect()) + } + } + } +} + +fn alias_for_name(name: &str, existing_names: &[String]) -> String { + #[inline] + fn rec(name: &str, existing_names: &[String], depth: usize) -> String { + let 
alias = if depth == 0 { + name.to_owned() + } else { + format!("{name}_{depth}") + }; + match existing_names + .iter() + .any(|existing_name| existing_name == &alias) + { + true => rec(name, existing_names, depth + 1), + false => alias, + } + } + + rec(name, existing_names, 0) +} + +pub(crate) fn build_available_columns( + available_columns_info: Vec, + data: &ExcelSheetData, + start_row: usize, + end_row: usize, + specified_dtypes: Option<&DTypeMap>, +) -> FastExcelResult> { + let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len()); + + available_columns_info + .into_iter() + .map(|mut column_info_builder| { + // Setting the right alias for every column + let alias = alias_for_name(column_info_builder.name(), &aliased_available_columns); + if alias != column_info_builder.name() { + column_info_builder = column_info_builder.with_name(alias.clone()); + } + aliased_available_columns.push(alias); + // Setting the dtype info + column_info_builder.finish(data, start_row, end_row, specified_dtypes) + }) + .collect() +} diff --git a/src/types/python/excelsheet/mod.rs b/src/types/python/excelsheet/mod.rs index 4792891..6f54f59 100644 --- a/src/types/python/excelsheet/mod.rs +++ b/src/types/python/excelsheet/mod.rs @@ -1,6 +1,8 @@ pub(crate) mod column_info; +pub(crate) mod sheet_data; -use std::{cmp, collections::HashSet, str::FromStr, sync::Arc}; +use calamine::{CellType, Range}; +use std::{cmp, collections::HashSet, fmt::Debug, str::FromStr, sync::Arc}; use crate::{ error::{ @@ -11,18 +13,14 @@ use crate::{ idx_or_name::IdxOrName, }, }; +use sheet_data::ExcelSheetData; use arrow::{ - array::{ - Array, BooleanArray, Date32Array, DurationMillisecondArray, Float64Array, Int64Array, - NullArray, StringArray, TimestampMillisecondArray, - }, - datatypes::{Field, Schema}, + array::NullArray, + datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit}, pyarrow::ToPyArrow, record_batch::RecordBatch, }; -use calamine::{Data as CalData, DataType, 
Range}; -use chrono::NaiveDate; use pyo3::{ prelude::{pyclass, pymethods, PyObject, Python}, @@ -30,7 +28,13 @@ use pyo3::{ PyAny, PyResult, ToPyObject, }; -use self::column_info::{ColumnInfo, ColumnInfoBuilder, ColumnNameFrom}; +use crate::utils::schema::get_schema_sample_rows; + +use self::column_info::{build_available_columns, build_available_columns_info, ColumnInfo}; +use self::sheet_data::{ + create_boolean_array, create_date_array, create_datetime_array, create_duration_array, + create_float_array, create_int_array, create_string_array, +}; #[derive(Debug)] pub(crate) enum Header { @@ -65,10 +69,10 @@ pub(crate) struct Pagination { } impl Pagination { - pub(crate) fn new( + pub(crate) fn new( skip_rows: usize, n_rows: Option, - range: &Range, + range: &Range, ) -> FastExcelResult { let max_height = range.height(); if max_height < skip_rows { @@ -84,6 +88,10 @@ impl Pagination { pub(crate) fn offset(&self) -> usize { self.skip_rows } + + pub(crate) fn n_rows(&self) -> Option { + self.n_rows + } } impl TryFrom<&PyList> for SelectedColumns { type Error = FastExcelError; @@ -137,6 +145,7 @@ impl SelectedColumns { .collect(), } } + const ALPHABET: [char; 26] = [ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', @@ -276,32 +285,13 @@ impl TryFrom> for SelectedColumns { } } -fn alias_for_name(name: &str, existing_names: &[String]) -> String { - fn rec(name: &str, existing_names: &[String], depth: usize) -> String { - let alias = if depth == 0 { - name.to_owned() - } else { - format!("{name}_{depth}") - }; - match existing_names - .iter() - .any(|existing_name| existing_name == &alias) - { - true => rec(name, existing_names, depth + 1), - false => alias, - } - } - - rec(name, existing_names, 0) -} - #[pyclass(name = "_ExcelSheet")] pub(crate) struct ExcelSheet { #[pyo3(get)] pub(crate) name: String, header: Header, pagination: Pagination, - data: Range, + data: ExcelSheetData<'static>, 
height: Option, total_height: Option, width: Option, @@ -313,19 +303,21 @@ pub(crate) struct ExcelSheet { } impl ExcelSheet { - pub(crate) fn data(&self) -> &Range { + pub(crate) fn data(&self) -> &ExcelSheetData<'_> { &self.data } pub(crate) fn try_new( name: String, - data: Range, + data: ExcelSheetData<'static>, header: Header, pagination: Pagination, schema_sample_rows: Option, selected_columns: SelectedColumns, dtypes: Option, ) -> FastExcelResult { + let available_columns_info = + build_available_columns_info(&data, &selected_columns, &header)?; let mut sheet = ExcelSheet { name, header, @@ -341,33 +333,17 @@ impl ExcelSheet { selected_columns: Vec::with_capacity(0), }; - let available_columns_info = sheet.get_available_columns_info(&selected_columns)?; - - let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len()); - - let dtype_sample_rows = - sheet.offset() + sheet.schema_sample_rows().unwrap_or(sheet.limit()); - let row_limit = cmp::min(dtype_sample_rows, sheet.limit()); + let row_limit = sheet.schema_sample_rows(); // Finalizing column info - let available_columns = available_columns_info - .into_iter() - .map(|mut column_info_builder| { - // Setting the right alias for every column - let alias = alias_for_name(column_info_builder.name(), &aliased_available_columns); - if alias != column_info_builder.name() { - column_info_builder = column_info_builder.with_name(alias.clone()); - } - aliased_available_columns.push(alias); - // Setting the dtype info - column_info_builder.finish( - &sheet.data, - sheet.offset(), - row_limit, - sheet.dtypes.as_ref(), - ) - }) - .collect::>>()?; + let available_columns = build_available_columns( + available_columns_info, + &sheet.data, + sheet.offset(), + row_limit, + sheet.dtypes.as_ref(), + )?; + let selected_columns = selected_columns.select_columns(&available_columns)?; sheet.available_columns = available_columns; sheet.selected_columns = selected_columns; @@ -376,108 +352,6 @@ impl ExcelSheet 
{ Ok(sheet) } - fn get_available_columns_info( - &self, - selected_columns: &SelectedColumns, - ) -> FastExcelResult> { - let width = self.data.width(); - match &self.header { - Header::None => Ok((0..width) - .map(|col_idx| { - ColumnInfoBuilder::new( - format!("__UNNAMED__{col_idx}"), - col_idx, - ColumnNameFrom::Generated, - ) - }) - .collect()), - Header::At(row_idx) => Ok((0..width) - .map(|col_idx| { - self.data - .get((*row_idx, col_idx)) - .and_then(|data| data.as_string()) - .map(|col_name| { - ColumnInfoBuilder::new(col_name, col_idx, ColumnNameFrom::LookedUp) - }) - .unwrap_or_else(|| { - ColumnInfoBuilder::new( - format!("__UNNAMED__{col_idx}"), - col_idx, - ColumnNameFrom::Generated, - ) - }) - }) - .collect()), - Header::With(names) => { - if let SelectedColumns::Selection(column_selection) = selected_columns { - if column_selection.len() != names.len() { - return Err(FastExcelErrorKind::InvalidParameters( - "column_names and use_columns must have the same length".to_string(), - ) - .into()); - } - let selected_indices = column_selection - .iter() - .map(|idx_or_name| { - match idx_or_name { - IdxOrName::Idx(idx) => Ok(*idx), - IdxOrName::Name(name) => Err(FastExcelErrorKind::InvalidParameters( - format!("use_columns can only contain integers when used with columns_names, got \"{name}\"") - ) - .into()), - } - }) - .collect::>>()?; - - Ok((0..width) - .map(|col_idx| { - let provided_name_opt = if let Some(pos_in_names) = - selected_indices.iter().position(|idx| idx == &col_idx) - { - names.get(pos_in_names).cloned() - } else { - None - }; - - match provided_name_opt { - Some(provided_name) => ColumnInfoBuilder::new( - provided_name, - col_idx, - ColumnNameFrom::Provided, - ), - None => ColumnInfoBuilder::new( - format!("__UNNAMED__{col_idx}"), - col_idx, - ColumnNameFrom::Generated, - ), - } - }) - .collect()) - } else { - let nameless_start_idx = names.len(); - Ok(names - .iter() - .enumerate() - .map(|(col_idx, name)| { - ColumnInfoBuilder::new( - 
name.to_owned(), - col_idx, - ColumnNameFrom::Provided, - ) - }) - .chain((nameless_start_idx..width).map(|col_idx| { - ColumnInfoBuilder::new( - format!("__UNNAMED__{col_idx}"), - col_idx, - ColumnNameFrom::Generated, - ) - })) - .collect()) - } - } - } - } - pub(crate) fn limit(&self) -> usize { let upper_bound = self.data.height(); if let Some(n_rows) = self.pagination.n_rows { @@ -490,125 +364,63 @@ impl ExcelSheet { upper_bound } - pub(crate) fn schema_sample_rows(&self) -> &Option { - &self.schema_sample_rows + pub(crate) fn schema_sample_rows(&self) -> usize { + get_schema_sample_rows(self.schema_sample_rows, self.offset(), self.limit()) } } -fn create_boolean_array( - data: &Range, - col: usize, - offset: usize, - limit: usize, -) -> Arc { - Arc::new(BooleanArray::from_iter((offset..limit).map(|row| { - data.get((row, col)).and_then(|cell| match cell { - CalData::Bool(b) => Some(*b), - CalData::Int(i) => Some(*i != 0), - CalData::Float(f) => Some(*f != 0.0), - _ => None, - }) - }))) -} - -fn create_int_array( - data: &Range, - col: usize, - offset: usize, - limit: usize, -) -> Arc { - Arc::new(Int64Array::from_iter( - (offset..limit).map(|row| data.get((row, col)).and_then(|cell| cell.as_i64())), - )) -} - -fn create_float_array( - data: &Range, - col: usize, - offset: usize, - limit: usize, -) -> Arc { - Arc::new(Float64Array::from_iter( - (offset..limit).map(|row| data.get((row, col)).and_then(|cell| cell.as_f64())), - )) -} - -fn create_string_array( - data: &Range, - col: usize, - offset: usize, - limit: usize, -) -> Arc { - Arc::new(StringArray::from_iter((offset..limit).map(|row| { - // NOTE: Not using cell.as_string() here because it matches the String variant last, which - // is slower for columns containing mostly/only strings (which we expect to meet more often than - // mixed dtype columns containing mostly numbers) - data.get((row, col)).and_then(|cell| match cell { - CalData::String(s) => Some(s.to_string()), - CalData::Float(s) => 
Some(s.to_string()), - CalData::Int(s) => Some(s.to_string()), - CalData::DateTime(dt) => dt.as_datetime().map(|dt| dt.to_string()), - CalData::DateTimeIso(dt) => Some(dt.to_string()), - _ => None, - }) - }))) -} - -fn duration_type_to_i64(caldt: &CalData) -> Option { - caldt.as_duration().map(|d| d.num_milliseconds()) -} - -fn create_date_array( - data: &Range, - col: usize, - offset: usize, - limit: usize, -) -> Arc { - let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); - Arc::new(Date32Array::from_iter((offset..limit).map(|row| { - data.get((row, col)) - .and_then(|caldate| caldate.as_date()) - .and_then(|date| i32::try_from(date.signed_duration_since(epoch).num_days()).ok()) - }))) -} - -fn create_datetime_array( - data: &Range, - col: usize, - offset: usize, - limit: usize, -) -> Arc { - Arc::new(TimestampMillisecondArray::from_iter((offset..limit).map( - |row| { - data.get((row, col)) - .and_then(|caldt| caldt.as_datetime()) - .map(|dt| dt.and_utc().timestamp_millis()) - }, - ))) -} - -fn create_duration_array( - data: &Range, - col: usize, - offset: usize, - limit: usize, -) -> Arc { - Arc::new(DurationMillisecondArray::from_iter( - (offset..limit).map(|row| data.get((row, col)).and_then(duration_type_to_i64)), - )) -} - impl From<&ExcelSheet> for Schema { fn from(sheet: &ExcelSheet) -> Self { let fields: Vec<_> = sheet .selected_columns .iter() - .map(|col_info| Field::new(col_info.name(), col_info.dtype().into(), true)) + .map(Into::::into) .collect(); Schema::new(fields) } } +pub(crate) fn record_batch_from_data_and_schema( + schema: Schema, + data: &ExcelSheetData, + offset: usize, + limit: usize, +) -> FastExcelResult { + let mut iter = schema + .fields() + .iter() + .enumerate() + .map(|(col_idx, field)| { + ( + field.name(), + match field.data_type() { + ArrowDataType::Boolean => create_boolean_array(data, col_idx, offset, limit), + ArrowDataType::Int64 => create_int_array(data, col_idx, offset, limit), + ArrowDataType::Float64 => 
create_float_array(data, col_idx, offset, limit), + ArrowDataType::Utf8 => create_string_array(data, col_idx, offset, limit), + ArrowDataType::Timestamp(TimeUnit::Millisecond, None) => { + create_datetime_array(data, col_idx, offset, limit) + } + ArrowDataType::Date32 => create_date_array(data, col_idx, offset, limit), + ArrowDataType::Duration(TimeUnit::Millisecond) => { + create_duration_array(data, col_idx, offset, limit) + } + ArrowDataType::Null => Arc::new(NullArray::new(limit - offset)), + _ => unreachable!(), + }, + ) + }) + .peekable(); + // If the iterable is empty, try_from_iter returns an Err + if iter.peek().is_none() { + Ok(RecordBatch::new_empty(Arc::new(schema))) + } else { + RecordBatch::try_from_iter(iter) + .map_err(|err| FastExcelErrorKind::ArrowError(err.to_string()).into()) + .with_context(|| "could not create RecordBatch from iterable") + } +} + impl TryFrom<&ExcelSheet> for RecordBatch { type Error = FastExcelError; diff --git a/src/types/python/excelsheet/sheet_data.rs b/src/types/python/excelsheet/sheet_data.rs new file mode 100644 index 0000000..faa343c --- /dev/null +++ b/src/types/python/excelsheet/sheet_data.rs @@ -0,0 +1,207 @@ +use std::sync::Arc; + +use arrow::array::Array; +use calamine::{Data as CalData, DataRef as CalDataRef, DataType, Range}; + +use crate::{ + error::FastExcelResult, + types::dtype::{get_dtype_for_column, DType}, +}; + +pub(crate) enum ExcelSheetData<'r> { + Owned(Range), + Ref(Range>), +} + +impl ExcelSheetData<'_> { + pub(crate) fn width(&self) -> usize { + match self { + ExcelSheetData::Owned(range) => range.width(), + ExcelSheetData::Ref(range) => range.width(), + } + } + + pub(crate) fn height(&self) -> usize { + match self { + ExcelSheetData::Owned(range) => range.height(), + ExcelSheetData::Ref(range) => range.height(), + } + } + + pub(super) fn get_as_string(&self, pos: (usize, usize)) -> Option { + match self { + ExcelSheetData::Owned(range) => range.get(pos).and_then(|data| data.as_string()), + 
ExcelSheetData::Ref(range) => range.get(pos).and_then(|data| data.as_string()), + } + } + + pub(crate) fn dtype_for_column( + &self, + start_row: usize, + end_row: usize, + col: usize, + ) -> FastExcelResult { + match self { + ExcelSheetData::Owned(data) => get_dtype_for_column(data, start_row, end_row, col), + ExcelSheetData::Ref(data) => get_dtype_for_column(data, start_row, end_row, col), + } + } +} + +impl From> for ExcelSheetData<'_> { + fn from(range: Range) -> Self { + Self::Owned(range) + } +} + +impl<'a> From>> for ExcelSheetData<'a> { + fn from(range: Range>) -> Self { + Self::Ref(range) + } +} + +mod array_impls { + use std::sync::Arc; + + use arrow::array::{ + Array, BooleanArray, Date32Array, DurationMillisecondArray, Float64Array, Int64Array, + StringArray, TimestampMillisecondArray, + }; + use calamine::{CellType, DataType, Range}; + use chrono::NaiveDate; + + pub(super) fn create_boolean_array( + data: &Range
, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + Arc::new(BooleanArray::from_iter((offset..limit).map(|row| { + data.get((row, col)).and_then(|cell| { + if let Some(b) = cell.get_bool() { + Some(b) + } else if let Some(i) = cell.get_int() { + Some(i != 0) + } + // clippy formats else if let Some(blah) = ... { Some(x) } else { None } to the .map form + else { + cell.get_float().map(|f| f != 0.0) + } + }) + }))) + } + + pub(super) fn create_int_array( + data: &Range
, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + Arc::new(Int64Array::from_iter( + (offset..limit).map(|row| data.get((row, col)).and_then(|cell| cell.as_i64())), + )) + } + + pub(super) fn create_float_array( + data: &Range
, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + Arc::new(Float64Array::from_iter( + (offset..limit).map(|row| data.get((row, col)).and_then(|cell| cell.as_f64())), + )) + } + + pub(super) fn create_string_array( + data: &Range
, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + Arc::new(StringArray::from_iter((offset..limit).map(|row| { + data.get((row, col)).and_then(|cell| { + if cell.is_string() { + cell.get_string().map(str::to_string) + } else if cell.is_datetime() { + cell.get_datetime() + .and_then(|dt| dt.as_datetime()) + .map(|dt| dt.to_string()) + } else if cell.is_datetime_iso() { + cell.get_datetime_iso().map(str::to_string) + } else { + cell.as_string() + } + }) + }))) + } + + fn duration_type_to_i64(caldt: &DT) -> Option { + caldt.as_duration().map(|d| d.num_milliseconds()) + } + + pub(super) fn create_date_array( + data: &Range
, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); + Arc::new(Date32Array::from_iter((offset..limit).map(|row| { + data.get((row, col)) + .and_then(|caldate| caldate.as_date()) + .and_then(|date| i32::try_from(date.signed_duration_since(epoch).num_days()).ok()) + }))) + } + + pub(super) fn create_datetime_array( + data: &Range
, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + Arc::new(TimestampMillisecondArray::from_iter((offset..limit).map( + |row| { + data.get((row, col)) + .and_then(|caldt| caldt.as_datetime()) + .map(|dt| dt.and_utc().timestamp_millis()) + }, + ))) + } + + pub(super) fn create_duration_array( + data: &Range
, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + Arc::new(DurationMillisecondArray::from_iter( + (offset..limit).map(|row| data.get((row, col)).and_then(duration_type_to_i64)), + )) + } +} + +/// Creates a function that will dispatch ExcelData to the generic create_x_array implementation +macro_rules! create_array_function { + ($func_name:ident) => { + pub(crate) fn $func_name( + data: &ExcelSheetData, + col: usize, + offset: usize, + limit: usize, + ) -> Arc { + match data { + ExcelSheetData::Owned(range) => array_impls::$func_name(range, col, offset, limit), + ExcelSheetData::Ref(range) => array_impls::$func_name(range, col, offset, limit), + } + } + }; +} + +create_array_function!(create_boolean_array); +create_array_function!(create_string_array); +create_array_function!(create_int_array); +create_array_function!(create_float_array); +create_array_function!(create_datetime_array); +create_array_function!(create_date_array); +create_array_function!(create_duration_array); diff --git a/src/utils/mod.rs b/src/utils/mod.rs new file mode 100644 index 0000000..014c083 --- /dev/null +++ b/src/utils/mod.rs @@ -0,0 +1 @@ +pub(crate) mod schema; diff --git a/src/utils/schema.rs b/src/utils/schema.rs new file mode 100644 index 0000000..79dcbae --- /dev/null +++ b/src/utils/schema.rs @@ -0,0 +1,48 @@ +use std::cmp::min; + +/// Determines how many rows should be used for schema sampling, based on the provided parameter, +/// and the sheet's offset and limit. +/// +/// Note that here, the limit should be retrieved from the sheet's `limit()` method, and must not +/// be out of the sheet's bounds +pub(crate) fn get_schema_sample_rows( + sample_rows: Option, + offset: usize, + limit: usize, +) -> usize { + // Checking how many rows we want to use to determine the dtype for a column. 
If sample_rows is + // not provided, we sample limit rows, i.e on the entire column + let sample_rows = offset + sample_rows.unwrap_or(limit); + // If sample_rows is higher than the sheet's limit, use the limit instead + min(sample_rows, limit) +} + +#[cfg(test)] +mod tests { + use super::get_schema_sample_rows; + use rstest::rstest; + + #[rstest] + // default value, 50 rows sheet, row limit should be 50 + #[case(Some(1000), 0, 50, 50)] + // default value, 5000 rows sheet, row limit should be 1000 + #[case(Some(1000), 0, 5000, 1000)] + // default value, 1500 rows sheet, offset of 1000, row limit should be 1500 + #[case(Some(1000), 1000, 1500, 1500)] + // 100 sampling size, 1500 rows sheet, offset of 1000, row limit should be 1100 + #[case(Some(100), 1000, 1500, 1100)] + // No value, 50 rows sheet, row limit should be 50 + #[case(None, 0, 50, 50)] + // No value, 5000 rows sheet, row limit should be 5000 + #[case(None, 0, 5000, 5000)] + // no value, 1500 rows sheet, offset of 1000, row limit should be 1500 + #[case(None, 1000, 1500, 1500)] + fn test_get_schema_sample_rows_return_values( + #[case] sample_rows: Option, + #[case] offset: usize, + #[case] limit: usize, + #[case] expected: usize, + ) { + assert_eq!(get_schema_sample_rows(sample_rows, offset, limit), expected); + } +} diff --git a/test.py b/test.py index 7ce0f28..5fa3c4a 100644 --- a/test.py +++ b/test.py @@ -7,6 +7,13 @@ def get_args() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("file") parser.add_argument("-c", "--column", type=str, nargs="+", help="the columns to use") + parser.add_argument( + "--eager", action="store_true", help="wether the sheet should be loaded eagerly" + ) + parser.add_argument( + "-i", "--iterations", type=int, help="the number of iterations to do", default=1 + ) + return parser.parse_args() @@ -15,8 +22,12 @@ def main(): excel_file = fastexcel.read_excel(args.file) use_columns = args.column or None - for sheet_name in excel_file.sheet_names: - 
excel_file.load_sheet_by_name(sheet_name, use_columns=use_columns).to_arrow() + for _ in range(args.iterations): + for sheet_name in excel_file.sheet_names: + if args.eager: + excel_file.load_sheet_eager(sheet_name, use_columns=use_columns) + else: + excel_file.load_sheet(sheet_name, use_columns=use_columns).to_arrow() if __name__ == "__main__":