Skip to content

Commit

Permalink
Add row-group and column select
Browse files: browse the repository at this point in the history
  • Loading branch information
martindurant committed Oct 9, 2024
1 parent daecc97 commit 1b8904f
Showing 1 changed file with 18 additions and 4 deletions.
22 changes: 18 additions & 4 deletions arro3-io/src/parquet.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
use std::any::Any;
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;

use arrow_array::{RecordBatchIterator, RecordBatchReader};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::arrow_writer::ArrowWriterOptions;
use parquet::arrow::ArrowWriter;
use parquet::arrow::{ArrowWriter, ProjectionMask};
use parquet::basic::{Compression, Encoding};
use parquet::file::properties::{WriterProperties, WriterVersion};
use parquet::format::KeyValue;
use parquet::schema::types::ColumnPath;
use parquet::schema::types::{ColumnPath, SchemaDescriptor};
use pyo3::exceptions::{PyTypeError, PyValueError};
use pyo3::prelude::*;
use pyo3_arrow::error::PyArrowResult;
Expand All @@ -19,8 +20,21 @@ use pyo3_arrow::PyRecordBatchReader;
use crate::utils::{FileReader, FileWriter};

#[pyfunction]
pub fn read_parquet(py: Python, file: FileReader) -> PyArrowResult<PyObject> {
let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
pub fn read_parquet(
py: Python,
file: FileReader,
rgs: Option<Vec<usize>>,
columns: Option<Vec<usize>>,
) -> PyArrowResult<PyObject> {
let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
if let Some(nn) = rgs {
builder = builder.with_row_groups(nn);
}

if let Some(cols) = columns {
let projection = ProjectionMask::leaves(builder.parquet_schema(), cols);
builder = builder.with_projection(projection);
}

let metadata = builder.schema().metadata().clone();
let reader = builder.build().unwrap();
Expand Down

0 comments on commit 1b8904f

Please sign in to comment.