fix: Handle dictionary pages in Null-typed Parquet columns (issue 18085)

The Null-array deserializer never consumed a leading dictionary page, so
the decompressor later met it mid-chunk and panicked. This patch (a)
reads and discards the dict page up front for Null columns, and (b)
replaces the decompressor's panic with a proper OutOfSpec error so a
malformed file fails gracefully instead of aborting. A regression test
reads a pandas-written file whose all-NaN/None columns produce dict
pages for Null-typed data.

diff --git a/crates/polars-parquet/src/arrow/read/deserialize/null.rs b/crates/polars-parquet/src/arrow/read/deserialize/null.rs
index 74defc1d3b74..8c28a7fc66bb 100644
--- a/crates/polars-parquet/src/arrow/read/deserialize/null.rs
+++ b/crates/polars-parquet/src/arrow/read/deserialize/null.rs
@@ -124,6 +124,8 @@ pub fn iter_to_arrays(
     data_type: ArrowDataType,
     mut filter: Option<Filter>,
 ) -> ParquetResult<Box<dyn Array>> {
+    _ = iter.read_dict_page()?;
+
     let num_rows = Filter::opt_num_rows(&filter, iter.total_num_values());
 
     let mut len = 0usize;
diff --git a/crates/polars-parquet/src/parquet/read/compression.rs b/crates/polars-parquet/src/parquet/read/compression.rs
index 91ff7a519c61..0996093b31f0 100644
--- a/crates/polars-parquet/src/parquet/read/compression.rs
+++ b/crates/polars-parquet/src/parquet/read/compression.rs
@@ -215,12 +215,14 @@ impl Iterator for BasicDecompressor {
             Some(Ok(p)) => p,
         };
 
-        Some(decompress(page, &mut self.buffer).map(|p| {
-            if let Page::Data(p) = p {
-                p
-            } else {
-                panic!("Found compressed page in the middle of the pages")
-            }
+        Some(decompress(page, &mut self.buffer).and_then(|p| {
+            let Page::Data(p) = p else {
+                return Err(ParquetError::oos(
+                    "Found dictionary page beyond the first page of a column chunk",
+                ));
+            };
+
+            Ok(p)
         }))
     }
 
diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py
index 18f15bfd40c0..69a2d1aa230d 100644
--- a/py-polars/tests/unit/io/test_parquet.py
+++ b/py-polars/tests/unit/io/test_parquet.py
@@ -1400,3 +1400,17 @@ def test_write_sliced_lists_18069() -> None:
     after = pl.read_parquet(f)
 
     assert_frame_equal(before, after)
+
+
+def test_null_array_dict_pages_18085() -> None:
+    test = pd.DataFrame(
+        [
+            {"A": float("NaN"), "B": 3, "C": None},
+            {"A": float("NaN"), "B": None, "C": None},
+        ]
+    )
+
+    f = io.BytesIO()
+    test.to_parquet(f)
+    f.seek(0)
+    pl.read_parquet(f)