Skip to content

Commit

Permalink
Document when the ParquetRecordBatchReader will re-read metadata (#5887)
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb authored Jun 15, 2024
1 parent c7513bd commit c191294
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 8 deletions.
50 changes: 42 additions & 8 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,9 @@ impl<T> ArrowReaderBuilder<T> {
/// is then read from the file, including projection and filter pushdown
#[derive(Debug, Clone, Default)]
pub struct ArrowReaderOptions {
/// Should the reader strip any user defined metadata from the Arrow schema
///
/// Defaults to `false` via the derived `Default` implementation.
skip_arrow_metadata: bool,
/// If true, attempt to read `OffsetIndex` and `ColumnIndex`
///
/// Defaults to `false` via the derived `Default` implementation.
pub(crate) page_index: bool,
}

Expand Down Expand Up @@ -282,23 +284,41 @@ impl ArrowReaderOptions {
}
}

/// The cheaply clone-able metadata necessary to construct a [`ArrowReaderBuilder`]
/// The metadata necessary to construct a [`ArrowReaderBuilder`]
///
/// This allows loading the metadata for a file once and then using this to construct
/// multiple separate readers, for example, to distribute readers across multiple threads
/// Note this structure is cheaply clone-able as it consists of several `Arc`s.
///
/// This structure allows:
///
/// 1. Loading metadata for a file once and then using that same metadata to
/// construct multiple separate readers, for example, to distribute readers
/// across multiple threads
///
/// 2. Using a cached copy of the [`ParquetMetaData`] rather than reading it
/// from the file each time a reader is constructed.
///
/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
#[derive(Debug, Clone)]
pub struct ArrowReaderMetadata {
/// The Parquet metadata, if known a priori
///
/// Held behind an `Arc`, so cloning this structure does not copy the metadata.
pub(crate) metadata: Arc<ParquetMetaData>,

/// The Arrow Schema
pub(crate) schema: SchemaRef,

/// Optional Parquet field description, also held behind an `Arc`.
///
/// NOTE(review): presumably the root of the Parquet-to-Arrow field mapping,
/// with `None` meaning no fields are mapped; confirm against `try_new`.
pub(crate) fields: Option<Arc<ParquetField>>,
}

impl ArrowReaderMetadata {
/// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`]
/// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`], if necessary
///
/// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for an
/// example of how this can be used
///
/// # Notes
///
/// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for how this can be used
/// If `options` has [`ArrowReaderOptions::with_page_index`] set to true, but
/// `Self::metadata` is missing the page index, this function will attempt
/// to load the page index by making an object store request.
pub fn load<T: ChunkReader>(reader: &T, options: ArrowReaderOptions) -> Result<Self> {
let mut metadata = footer::parse_metadata(reader)?;
if options.page_index {
Expand All @@ -320,6 +340,12 @@ impl ArrowReaderMetadata {
Self::try_new(Arc::new(metadata), options)
}

/// Create a new [`ArrowReaderMetadata`]
///
/// # Notes
///
/// This function does not attempt to load the page index if it is not present in the metadata.
/// See [`Self::load`] for more details.
pub fn try_new(metadata: Arc<ParquetMetaData>, options: ArrowReaderOptions) -> Result<Self> {
let kv_metadata = match options.skip_arrow_metadata {
true => None,
Expand Down Expand Up @@ -407,9 +433,17 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {

/// Create a [`ParquetRecordBatchReaderBuilder`] from the provided [`ArrowReaderMetadata`]
///
/// This allows loading metadata once and using it to create multiple builders with
/// potentially different settings
/// This interface allows:
///
/// 1. Loading metadata once and using it to create multiple builders with
/// potentially different settings or run on different threads
///
/// 2. Using a cached copy of the metadata rather than re-reading it from the
/// file each time a reader is constructed.
///
/// See the docs on [`ArrowReaderMetadata`] for more details
///
/// # Example
/// ```
/// # use std::fs::metadata;
/// # use std::sync::Arc;
Expand Down
6 changes: 6 additions & 0 deletions parquet/src/arrow/async_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,12 @@ impl ArrowReaderMetadata {
/// Returns a new [`ArrowReaderMetadata`] for this builder
///
/// See [`ParquetRecordBatchStreamBuilder::new_with_metadata`] for how this can be used
///
/// # Notes
///
/// If `options` has [`ArrowReaderOptions::with_page_index`] set to true, but
/// `Self::metadata` is missing the page index, this function will attempt
/// to load the page index by making an object store request.
pub async fn load_async<T: AsyncFileReader>(
input: &mut T,
options: ArrowReaderOptions,
Expand Down

0 comments on commit c191294

Please sign in to comment.