refactor(rust): Add parquet source node to new streaming engine (#18152)
Commit ffb66aa (1 parent: 41d3048)
Showing 29 changed files with 2,640 additions and 154 deletions.
@@ -0,0 +1,176 @@
use std::ops::Range;
use std::sync::Arc;

use polars_error::{to_compute_err, PolarsResult};
use polars_utils::_limit_path_len_io_err;
use polars_utils::mmap::MemSlice;

use crate::cloud::{
    build_object_store, object_path_from_str, CloudLocation, CloudOptions, ObjectStorePath,
    PolarsObjectStore,
};

#[allow(async_fn_in_trait)]
pub trait ByteSource: Send + Sync {
    async fn get_size(&self) -> PolarsResult<usize>;
    /// # Panics
    /// Panics if `range` is not in bounds.
    async fn get_range(&self, range: Range<usize>) -> PolarsResult<MemSlice>;
    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>>;
}

/// Byte source backed by a `MemSlice`, which can potentially be memory-mapped.
pub struct MemSliceByteSource(pub MemSlice);

impl MemSliceByteSource {
    async fn try_new_mmap_from_path(
        path: &str,
        _cloud_options: Option<&CloudOptions>,
    ) -> PolarsResult<Self> {
        let file = Arc::new(
            tokio::fs::File::open(path)
                .await
                .map_err(|err| _limit_path_len_io_err(path.as_ref(), err))?
                .into_std()
                .await,
        );
        let mmap = Arc::new(unsafe { memmap::Mmap::map(file.as_ref()) }.map_err(to_compute_err)?);

        Ok(Self(MemSlice::from_mmap(mmap)))
    }
}

impl ByteSource for MemSliceByteSource {
    async fn get_size(&self) -> PolarsResult<usize> {
        Ok(self.0.as_ref().len())
    }

    async fn get_range(&self, range: Range<usize>) -> PolarsResult<MemSlice> {
        let out = self.0.slice(range);
        Ok(out)
    }

    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>> {
        Ok(ranges
            .iter()
            .map(|x| self.0.slice(x.clone()))
            .collect::<Vec<_>>())
    }
}

pub struct ObjectStoreByteSource {
    store: PolarsObjectStore,
    path: ObjectStorePath,
}

impl ObjectStoreByteSource {
    async fn try_new_from_path(
        path: &str,
        cloud_options: Option<&CloudOptions>,
    ) -> PolarsResult<Self> {
        let (CloudLocation { prefix, .. }, store) =
            build_object_store(path, cloud_options, false).await?;
        let path = object_path_from_str(&prefix)?;
        let store = PolarsObjectStore::new(store);

        Ok(Self { store, path })
    }
}

impl ByteSource for ObjectStoreByteSource {
    async fn get_size(&self) -> PolarsResult<usize> {
        Ok(self.store.head(&self.path).await?.size)
    }

    async fn get_range(&self, range: Range<usize>) -> PolarsResult<MemSlice> {
        let bytes = self.store.get_range(&self.path, range).await?;
        let mem_slice = MemSlice::from_bytes(bytes);

        Ok(mem_slice)
    }

    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>> {
        let ranges = self.store.get_ranges(&self.path, ranges).await?;
        Ok(ranges.into_iter().map(MemSlice::from_bytes).collect())
    }
}

/// Dynamic dispatch to async functions.
pub enum DynByteSource {
    MemSlice(MemSliceByteSource),
    Cloud(ObjectStoreByteSource),
}

impl DynByteSource {
    pub fn variant_name(&self) -> &str {
        match self {
            Self::MemSlice(_) => "MemSlice",
            Self::Cloud(_) => "Cloud",
        }
    }
}

impl Default for DynByteSource {
    fn default() -> Self {
        Self::MemSlice(MemSliceByteSource(MemSlice::default()))
    }
}

impl ByteSource for DynByteSource {
    async fn get_size(&self) -> PolarsResult<usize> {
        match self {
            Self::MemSlice(v) => v.get_size().await,
            Self::Cloud(v) => v.get_size().await,
        }
    }

    async fn get_range(&self, range: Range<usize>) -> PolarsResult<MemSlice> {
        match self {
            Self::MemSlice(v) => v.get_range(range).await,
            Self::Cloud(v) => v.get_range(range).await,
        }
    }

    async fn get_ranges(&self, ranges: &[Range<usize>]) -> PolarsResult<Vec<MemSlice>> {
        match self {
            Self::MemSlice(v) => v.get_ranges(ranges).await,
            Self::Cloud(v) => v.get_ranges(ranges).await,
        }
    }
}

impl From<MemSliceByteSource> for DynByteSource {
    fn from(value: MemSliceByteSource) -> Self {
        Self::MemSlice(value)
    }
}

impl From<ObjectStoreByteSource> for DynByteSource {
    fn from(value: ObjectStoreByteSource) -> Self {
        Self::Cloud(value)
    }
}

#[derive(Clone, Debug)]
pub enum DynByteSourceBuilder {
    Mmap,
    /// Supports both cloud and local files.
    ObjectStore,
}

impl DynByteSourceBuilder {
    pub async fn try_build_from_path(
        &self,
        path: &str,
        cloud_options: Option<&CloudOptions>,
    ) -> PolarsResult<DynByteSource> {
        Ok(match self {
            Self::Mmap => MemSliceByteSource::try_new_mmap_from_path(path, cloud_options)
                .await?
                .into(),
            Self::ObjectStore => ObjectStoreByteSource::try_new_from_path(path, cloud_options)
                .await?
                .into(),
        })
    }
}
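The new file above (its path is not shown in this view) introduces the `ByteSource` abstraction the parquet source node reads through. As a hypothetical usage sketch, not part of this commit, the snippet below builds a `DynByteSource` and fetches a byte range; the module path `polars_io::utils::byte_source`, the tokio runtime, and the footer-reading scenario are assumptions, and only the trait methods shown in the diff are exercised.

// Hypothetical sketch, not part of this commit. Assumes the new module is
// exported as `polars_io::utils::byte_source` and a tokio runtime drives the calls.
use polars_error::PolarsResult;
use polars_io::utils::byte_source::{ByteSource, DynByteSource, DynByteSourceBuilder};

async fn parquet_footer_len(path: &str) -> PolarsResult<u32> {
    // Mmap covers local paths; `DynByteSourceBuilder::ObjectStore` would also
    // accept cloud URIs such as `s3://...` together with `CloudOptions`.
    let src: DynByteSource = DynByteSourceBuilder::Mmap
        .try_build_from_path(path, None)
        .await?;

    let size = src.get_size().await?;
    // The last 8 bytes of a parquet file hold the footer length (little-endian
    // u32) followed by the `PAR1` magic. `get_range` panics when out of bounds,
    // so this assumes a well-formed file of at least 8 bytes.
    let tail = src.get_range(size - 8..size).await?;
    let tail: &[u8] = tail.as_ref();
    assert_eq!(&tail[4..], b"PAR1".as_slice());

    Ok(u32::from_le_bytes(tail[..4].try_into().unwrap()))
}

The same calls work unchanged against `DynByteSource::Cloud`, which is the point of the enum's dynamic dispatch.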
@@ -1,33 +1,58 @@
 /// Given a `slice` that is relative to the start of a list of files, calculate the slice to apply
 /// at a file with a row offset of `current_row_offset`.
 pub fn split_slice_at_file(
-    current_row_offset: &mut usize,
+    current_row_offset_ref: &mut usize,
     n_rows_this_file: usize,
     global_slice_start: usize,
     global_slice_end: usize,
 ) -> (usize, usize) {
-    let next_file_offset = *current_row_offset + n_rows_this_file;
-    // e.g.
-    // slice: (start: 1, end: 2)
-    // files:
-    // 0: (1 row): current_offset: 0, next_file_offset: 1
-    // 1: (1 row): current_offset: 1, next_file_offset: 2
-    // 2: (1 row): current_offset: 2, next_file_offset: 3
-    // in this example we want to include only file 1.
-    let has_overlap_with_slice =
-        *current_row_offset < global_slice_end && next_file_offset > global_slice_start;
-
-    let (rel_start, slice_len) = if !has_overlap_with_slice {
-        (0, 0)
-    } else {
-        let n_rows_to_skip = global_slice_start.saturating_sub(*current_row_offset);
-        let n_excess_rows = next_file_offset.saturating_sub(global_slice_end);
-        (
-            n_rows_to_skip,
-            n_rows_this_file - n_rows_to_skip - n_excess_rows,
-        )
-    };
-
-    *current_row_offset = next_file_offset;
-    (rel_start, slice_len)
+    let current_row_offset = *current_row_offset_ref;
+    *current_row_offset_ref += n_rows_this_file;
+    match SplitSlicePosition::split_slice_at_file(
+        current_row_offset,
+        n_rows_this_file,
+        global_slice_start..global_slice_end,
+    ) {
+        SplitSlicePosition::Overlapping(offset, len) => (offset, len),
+        SplitSlicePosition::Before | SplitSlicePosition::After => (0, 0),
+    }
+}
+
+#[derive(Debug)]
+pub enum SplitSlicePosition {
+    Before,
+    Overlapping(usize, usize),
+    After,
+}
+
+impl SplitSlicePosition {
+    pub fn split_slice_at_file(
+        current_row_offset: usize,
+        n_rows_this_file: usize,
+        global_slice: std::ops::Range<usize>,
+    ) -> Self {
+        // e.g.
+        // slice: (start: 1, end: 2)
+        // files:
+        // 0: (1 row): current_offset: 0, next_file_offset: 1
+        // 1: (1 row): current_offset: 1, next_file_offset: 2
+        // 2: (1 row): current_offset: 2, next_file_offset: 3
+        // in this example we want to include only file 1.
+
+        let next_row_offset = current_row_offset + n_rows_this_file;
+
+        if next_row_offset <= global_slice.start {
+            Self::Before
+        } else if current_row_offset >= global_slice.end {
+            Self::After
+        } else {
+            let n_rows_to_skip = global_slice.start.saturating_sub(current_row_offset);
+            let n_excess_rows = next_row_offset.saturating_sub(global_slice.end);
+
+            Self::Overlapping(
+                n_rows_to_skip,
+                n_rows_this_file - n_rows_to_skip - n_excess_rows,
+            )
+        }
+    }
 }
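To make the slice arithmetic concrete, here is a hypothetical illustration, not part of this commit, that replays the worked example from the comments: a global slice keeping rows 1..2 across three single-row files. The import path is assumed; the `Debug` output relies on the derive added in this diff.

// Hypothetical sketch, not part of this commit; the module path is assumed.
use polars_io::utils::slice::SplitSlicePosition;

fn main() {
    let global_slice = 1usize..2usize; // keep global rows [1, 2)
    let mut row_offset = 0usize;

    for file_idx in 0..3usize {
        let n_rows_this_file = 1;
        let pos = SplitSlicePosition::split_slice_at_file(
            row_offset,
            n_rows_this_file,
            global_slice.clone(),
        );
        println!("file {file_idx}: {pos:?}");
        row_offset += n_rows_this_file;
    }

    // Prints:
    // file 0: Before
    // file 1: Overlapping(0, 1)
    // file 2: After
}

The reworked `split_slice_at_file` wrapper keeps the old `(offset, len)` return shape: it advances the shared row offset and collapses `Before`/`After` to `(0, 0)`.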