From 587250c8e0f9707cc102bd04573395c153249ced Mon Sep 17 00:00:00 2001 From: RinChanNOW Date: Mon, 4 Sep 2023 19:26:15 +0800 Subject: [PATCH] fix: avoid panic if offset index not exists. (#4761) * fix: avoid panic if offset index not exists. * Add unit tests and comments. --- parquet/src/arrow/arrow_reader/mod.rs | 43 ++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 5f95a8664b4b..2acc0faf130f 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -480,7 +480,11 @@ impl Iterator for ReaderPageIterator { let rg = self.metadata.row_group(rg_idx); let meta = rg.column(self.column_idx); let offset_index = self.metadata.offset_index(); - let page_locations = offset_index.map(|i| i[rg_idx][self.column_idx].clone()); + // `offset_index` may not exist and `i[rg_idx]` will be empty. + // To avoid `i[rg_idx][self.oolumn_idx`] panic, we need to filter out empty `i[rg_idx]`. + let page_locations = offset_index + .filter(|i| !i[rg_idx].is_empty()) + .map(|i| i[rg_idx][self.column_idx].clone()); let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); @@ -2481,6 +2485,43 @@ mod tests { assert_eq!(reader.batch_size, num_rows as usize); } + #[test] + fn test_read_with_page_index_enabled() { + let testdata = arrow::util::test_util::parquet_test_data(); + + { + // `alltypes_tiny_pages.parquet` has page index + let path = format!("{testdata}/alltypes_tiny_pages.parquet"); + let test_file = File::open(path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + test_file, + ArrowReaderOptions::new().with_page_index(true), + ) + .unwrap(); + assert!(!builder.metadata().offset_index().unwrap()[0].is_empty()); + let reader = builder.build().unwrap(); + let batches = reader.collect::, _>>().unwrap(); + assert_eq!(batches.len(), 8); + } + + { + // `alltypes_plain.parquet` doesn't have page index + let path = format!("{testdata}/alltypes_plain.parquet"); + let test_file = File::open(path).unwrap(); + let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + test_file, + ArrowReaderOptions::new().with_page_index(true), + ) + .unwrap(); + // Although `Vec>` of each row group is empty, + // we should read the file successfully. + assert!(builder.metadata().offset_index().unwrap()[0].is_empty()); + let reader = builder.build().unwrap(); + let batches = reader.collect::, _>>().unwrap(); + assert_eq!(batches.len(), 1); + } + } + #[test] fn test_raw_repetition() { const MESSAGE_TYPE: &str = "