Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add option to set header row #453

Merged
merged 15 commits into from
Oct 8, 2024
Merged
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,26 @@ if let Some(Ok(r)) = excel.worksheet_range("Sheet1") {
}
```

### Reader: With header row

```rs
use calamine::{HeaderRow, Reader, Xlsx, open_workbook};

let mut excel: Xlsx<_> = open_workbook("file.xlsx").unwrap();

let sheet1 = excel
.with_header_row(HeaderRow::Row(3))
.worksheet_range("Sheet1")
.unwrap();
```

Note that `xlsx` and `xlsb` files are loaded lazily, so a header row set with
`with_header_row` takes effect as soon as a sheet range is read.
In contrast, `xls` and `ods` files load all sheets at once when the workbook
is opened with default settings.
As a result, for these formats the header row is only applied afterward, to
the already-loaded data, and does not provide any performance benefit.

### Reader: More complex

Let's assume
Expand Down Expand Up @@ -190,7 +210,7 @@ The programs are all structured to follow the same constructs:
use calamine::{open_workbook, Reader, Xlsx};

fn main() {
// Open workbook
// Open workbook
let mut excel: Xlsx<_> =
open_workbook("NYC_311_SR_2010-2020-sample-1M.xlsx").expect("failed to find file");

Expand Down
36 changes: 27 additions & 9 deletions src/auto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
use crate::errors::Error;
use crate::vba::VbaProject;
use crate::{
open_workbook, open_workbook_from_rs, Data, DataRef, Metadata, Ods, Range, Reader, ReaderRef,
Xls, Xlsb, Xlsx,
open_workbook, open_workbook_from_rs, Data, DataRef, HeaderRow, Metadata, Ods, Range, Reader,
ReaderRef, Xls, Xlsb, Xlsx,
};
use std::borrow::Cow;
use std::fs::File;
Expand Down Expand Up @@ -85,9 +85,27 @@ where
Err(Error::Msg("Sheets must be created from a Path"))
}

/// Forwards the header row setting to whichever concrete reader is wrapped.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
    // The inner readers return `&mut` to themselves; that value is not needed
    // here, only the side effect of storing the setting.
    match self {
        Sheets::Xls(xls) => {
            xls.with_header_row(header_row);
        }
        Sheets::Xlsx(xlsx) => {
            xlsx.with_header_row(header_row);
        }
        Sheets::Xlsb(xlsb) => {
            xlsb.with_header_row(header_row);
        }
        Sheets::Ods(ods) => {
            ods.with_header_row(header_row);
        }
    }
    self
}

/// Gets `VbaProject`
fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, Self::Error>> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xls)),
Sheets::Xlsx(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xlsx)),
Sheets::Xlsb(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xlsb)),
Expand All @@ -97,7 +115,7 @@ where

/// Initialize
fn metadata(&self) -> &Metadata {
match *self {
match self {
Sheets::Xls(ref e) => e.metadata(),
Sheets::Xlsx(ref e) => e.metadata(),
Sheets::Xlsb(ref e) => e.metadata(),
Expand All @@ -107,7 +125,7 @@ where

/// Read worksheet data in corresponding worksheet path
fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, Self::Error> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.worksheet_range(name).map_err(Error::Xls),
Sheets::Xlsx(ref mut e) => e.worksheet_range(name).map_err(Error::Xlsx),
Sheets::Xlsb(ref mut e) => e.worksheet_range(name).map_err(Error::Xlsb),
Expand All @@ -117,7 +135,7 @@ where

/// Read worksheet formula in corresponding worksheet path
fn worksheet_formula(&mut self, name: &str) -> Result<Range<String>, Self::Error> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.worksheet_formula(name).map_err(Error::Xls),
Sheets::Xlsx(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsx),
Sheets::Xlsb(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsb),
Expand All @@ -126,7 +144,7 @@ where
}

fn worksheets(&mut self) -> Vec<(String, Range<Data>)> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.worksheets(),
Sheets::Xlsx(ref mut e) => e.worksheets(),
Sheets::Xlsb(ref mut e) => e.worksheets(),
Expand All @@ -136,7 +154,7 @@ where

#[cfg(feature = "picture")]
fn pictures(&self) -> Option<Vec<(String, Vec<u8>)>> {
match *self {
match self {
Sheets::Xls(ref e) => e.pictures(),
Sheets::Xlsx(ref e) => e.pictures(),
Sheets::Xlsb(ref e) => e.pictures(),
Expand All @@ -153,7 +171,7 @@ where
&'a mut self,
name: &str,
) -> Result<Range<DataRef<'a>>, Self::Error> {
match *self {
match self {
Sheets::Xlsx(ref mut e) => e.worksheet_range_ref(name).map_err(Error::Xlsx),
Sheets::Xlsb(ref mut e) => e.worksheet_range_ref(name).map_err(Error::Xlsb),
Sheets::Xls(_) => unimplemented!(),
Expand Down
21 changes: 21 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,23 @@ pub struct Sheet {
pub visible: SheetVisible,
}

/// Row to use as header.
///
/// By default, the first non-empty row is used as header.
#[derive(Debug, Default, Clone, Copy)]
#[non_exhaustive]
pub enum HeaderRow {
    /// First non-empty row
    #[default]
    FirstNonEmptyRow,
    /// Index of the header row
    Row(u32),
}

// FIXME `Reader` must only be seek `Seek` for `Xls::xls`. Because of the present API this limits
// the kinds of readers (other) data in formats can be read from.
/// A trait to share spreadsheets reader functions across different `FileType`s
Expand All @@ -228,6 +245,10 @@ where
/// Creates a new instance.
fn new(reader: RS) -> Result<Self, Self::Error>;

/// Set header row (i.e. first row to be read)
///
/// With [`HeaderRow::FirstNonEmptyRow`] (the default), the first non-empty row is used as
/// header row. With [`HeaderRow::Row`], the row at the given index is used instead.
///
/// Returns `&mut Self` so that the call can be chained with other reader methods.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self;

/// Gets `VbaProject`
fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, Self::Error>>;

Expand Down
37 changes: 33 additions & 4 deletions src/ods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use zip::read::{ZipArchive, ZipFile};
use zip::result::ZipError;

use crate::vba::VbaProject;
use crate::{Data, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible};
use crate::{Data, DataType, HeaderRow, Metadata, Range, Reader, Sheet, SheetType, SheetVisible};
use std::marker::PhantomData;

const MIMETYPE: &[u8] = b"application/vnd.oasis.opendocument.spreadsheet";
Expand Down Expand Up @@ -62,6 +62,13 @@ pub enum OdsError {
WorksheetNotFound(String),
}

/// Ods reader options
///
/// Built with `Default`, so a freshly created reader starts with the default
/// `HeaderRow`.
#[derive(Debug, Default)]
#[non_exhaustive]
struct OdsOptions {
/// Row to use as header
pub header_row: HeaderRow,
}

from_err!(std::io::Error, OdsError, Io);
from_err!(zip::result::ZipError, OdsError, Zip);
from_err!(quick_xml::Error, OdsError, Xml);
Expand Down Expand Up @@ -116,6 +123,8 @@ pub struct Ods<RS> {
marker: PhantomData<RS>,
#[cfg(feature = "picture")]
pictures: Option<Vec<(String, Vec<u8>)>>,
/// Reader options
options: OdsOptions,
}

impl<RS> Reader<RS> for Ods<RS>
Expand Down Expand Up @@ -161,9 +170,15 @@ where
sheets,
#[cfg(feature = "picture")]
pictures,
options: OdsOptions::default(),
})
}

/// Stores `header_row` in the reader options and returns `self` for chaining.
///
/// The stored value is consulted by `worksheet_range` when a sheet range is
/// returned.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
self.options.header_row = header_row;
self
}

/// Gets `VbaProject`
fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, OdsError>> {
None
Expand All @@ -176,10 +191,24 @@ where

/// Read worksheet data in corresponding worksheet path
fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, OdsError> {
self.sheets
let sheet = self
.sheets
.get(name)
.ok_or_else(|| OdsError::WorksheetNotFound(name.into()))
.map(|r| r.0.to_owned())
.ok_or_else(|| OdsError::WorksheetNotFound(name.into()))?
.0
.to_owned();

match self.options.header_row {
HeaderRow::FirstNonEmptyRow => Ok(sheet),
HeaderRow::Row(header_row_idx) => {
// If `header_row` is a row index, adjust the range
if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) {
Ok(sheet.range((header_row_idx, start.1), end))
} else {
Ok(sheet)
}
}
}
}

fn worksheets(&mut self) -> Vec<(String, Range<Data>)> {
Expand Down
27 changes: 24 additions & 3 deletions src/xls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ use crate::utils::read_usize;
use crate::utils::{push_column, read_f64, read_i16, read_i32, read_u16, read_u32};
use crate::vba::VbaProject;
use crate::{
Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, Sheet, SheetType, SheetVisible,
Cell, CellErrorType, Data, Dimensions, HeaderRow, Metadata, Range, Reader, Sheet, SheetType,
SheetVisible,
};

#[derive(Debug)]
Expand Down Expand Up @@ -136,6 +137,8 @@ pub struct XlsOptions {
///
/// [code page]: https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
pub force_codepage: Option<u16>,
/// Row to use as header
pub header_row: HeaderRow,
}

struct SheetData {
Expand Down Expand Up @@ -231,6 +234,11 @@ impl<RS: Read + Seek> Reader<RS> for Xls<RS> {
Self::new_with_options(reader, XlsOptions::default())
}

/// Stores `header_row` in the reader options and returns `self` for chaining.
///
/// The stored value is consulted by `worksheet_range` when a sheet range is
/// returned.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
self.options.header_row = header_row;
self
}

fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, XlsError>> {
self.vba.as_ref().map(|vba| Ok(Cow::Borrowed(vba)))
}
Expand All @@ -241,10 +249,23 @@ impl<RS: Read + Seek> Reader<RS> for Xls<RS> {
}

fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, XlsError> {
self.sheets
let sheet = self
.sheets
.get(name)
.map(|r| r.range.clone())
.ok_or_else(|| XlsError::WorksheetNotFound(name.into()))
.ok_or_else(|| XlsError::WorksheetNotFound(name.into()))?;

match self.options.header_row {
HeaderRow::FirstNonEmptyRow => Ok(sheet),
HeaderRow::Row(header_row_idx) => {
// If `header_row` is a row index, adjust the range
if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) {
Ok(sheet.range((header_row_idx, start.1), end))
} else {
Ok(sheet)
}
}
}
}

fn worksheets(&mut self) -> Vec<(String, Range<Data>)> {
Expand Down
Loading