diff --git a/.github/pr-title-checker-config.json b/.github/pr-title-checker-config.json new file mode 100644 index 000000000000..65f09d5aab9d --- /dev/null +++ b/.github/pr-title-checker-config.json @@ -0,0 +1,15 @@ +{ + "LABEL": { + "name": "title needs formatting", + "color": "FF0000" + }, + "CHECKS": { + "regexp": "^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)(\\((python|rust)\\!?(,(python|rust)\\!?)?\\))?\\!?\\: [A-Z].*[^\\.\\!\\?,… ]$", + "ignoreLabels": ["skip changelog"] + }, + "MESSAGES": { + "success": "PR title OK!", + "failure": "Invalid PR title! Please update according to the contributing guidelines: https://docs.pola.rs/development/contributing/#pull-requests", + "notice": "" + } +} diff --git a/.github/workflows/pr-labeler.yml b/.github/workflows/pr-labeler.yml index 7c9be45095fe..a0c7e83bfdb3 100644 --- a/.github/workflows/pr-labeler.yml +++ b/.github/workflows/pr-labeler.yml @@ -9,9 +9,14 @@ permissions: pull-requests: write jobs: - main: + labeler: runs-on: ubuntu-latest steps: + - name: Check pull request title + uses: thehanimo/pr-title-checker@v1.4.2 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Label pull request uses: release-drafter/release-drafter@v6 with: diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml index 10a4adc54615..6c8659db8fab 100644 --- a/.github/workflows/test-python.yml +++ b/.github/workflows/test-python.yml @@ -25,6 +25,7 @@ concurrency: env: RUSTFLAGS: -C debuginfo=0 # Do not produce debug symbols to keep memory usage down RUST_BACKTRACE: 1 + PYTHONUTF8: 1 defaults: run: diff --git a/Cargo.lock b/Cargo.lock index fe2ff451e130..6231e1b8d839 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3383,6 +3383,7 @@ dependencies = [ "rayon", "recursive", "slotmap", + "tokio", "version_check", ] @@ -3503,7 +3504,7 @@ dependencies = [ [[package]] name = "py-polars" -version = "1.0.0" +version = "1.1.0" dependencies = [ "ahash", "arboard", diff --git 
a/crates/polars-arrow/src/array/binview/mod.rs b/crates/polars-arrow/src/array/binview/mod.rs index deeda0df6c08..38888299b11b 100644 --- a/crates/polars-arrow/src/array/binview/mod.rs +++ b/crates/polars-arrow/src/array/binview/mod.rs @@ -34,7 +34,7 @@ use crate::array::iterator::NonNullValuesIter; use crate::bitmap::utils::{BitmapIter, ZipValidity}; pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>; pub type Utf8ViewArray = BinaryViewArrayGeneric; -pub use view::{View, INLINE_VIEW_SIZE}; +pub use view::View; use super::Splitable; diff --git a/crates/polars-arrow/src/array/binview/mutable.rs b/crates/polars-arrow/src/array/binview/mutable.rs index 25482754337a..891f8e6075e8 100644 --- a/crates/polars-arrow/src/array/binview/mutable.rs +++ b/crates/polars-arrow/src/array/binview/mutable.rs @@ -188,6 +188,18 @@ impl MutableBinaryViewArray { self.views.push(value); } + #[inline] + pub fn push_buffer(&mut self, buffer: Buffer) -> u32 { + if !self.in_progress_buffer.is_empty() { + self.completed_buffers + .push(Buffer::from(std::mem::take(&mut self.in_progress_buffer))); + } + + let buffer_idx = self.completed_buffers.len(); + self.completed_buffers.push(buffer); + buffer_idx as u32 + } + #[inline] pub fn push_value>(&mut self, value: V) { if let Some(validity) = &mut self.validity { diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index ccb771d2417d..fd205d8ce508 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -1,5 +1,5 @@ use std::cmp::Ordering; -use std::fmt::{Display, Formatter}; +use std::fmt::{self, Display, Formatter}; use std::ops::Add; use bytemuck::{Pod, Zeroable}; @@ -13,10 +13,12 @@ use crate::buffer::Buffer; use crate::datatypes::PrimitiveType; use crate::types::NativeType; -pub const INLINE_VIEW_SIZE: u32 = 12; - // We use this instead of u128 because we want alignment of <= 8 bytes. 
-#[derive(Debug, Copy, Clone, Default)] +/// A reference to a set of bytes. +/// +/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields. +/// If `length > 12`, these fields specify a slice of a buffer. +#[derive(Copy, Clone, Default)] #[repr(C)] pub struct View { /// The length of the string/bytes. @@ -29,29 +31,77 @@ pub struct View { pub offset: u32, } +impl fmt::Debug for View { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.length <= Self::MAX_INLINE_SIZE { + fmt.debug_struct("View") + .field("length", &self.length) + .field("content", &unsafe { + std::slice::from_raw_parts( + (self as *const _ as *const u8).add(4), + self.length as usize, + ) + }) + .finish() + } else { + fmt.debug_struct("View") + .field("length", &self.length) + .field("prefix", &self.prefix.to_be_bytes()) + .field("buffer_idx", &self.buffer_idx) + .field("offset", &self.offset) + .finish() + } + } +} + impl View { + pub const MAX_INLINE_SIZE: u32 = 12; + #[inline(always)] pub fn as_u128(self) -> u128 { unsafe { std::mem::transmute(self) } } + /// Create a new inline view + /// + /// # Panics + /// + /// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`. + #[inline] + pub fn new_inline(bytes: &[u8]) -> Self { + debug_assert!(bytes.len() <= u32::MAX as usize); + assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE); + + let mut view = Self { + length: bytes.len() as u32, + ..Default::default() + }; + + let view_ptr = &mut view as *mut _ as *mut u8; + + // SAFETY: + // - bytes length <= 12, + // - size_of:: == 16 + // - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C)) + // - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and + // offset. (i.e. 
the same could not be achieved with &mut self.prefix as *mut _ as *mut u8) + unsafe { + let inline_data_ptr = view_ptr.add(4); + core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len()); + } + view + } + #[inline] pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self { - if bytes.len() <= 12 { - let mut ret = Self { - length: bytes.len() as u32, - ..Default::default() - }; - let ret_ptr = &mut ret as *mut _ as *mut u8; - unsafe { - core::ptr::copy_nonoverlapping(bytes.as_ptr(), ret_ptr.add(4), bytes.len()); - } - ret + debug_assert!(bytes.len() <= u32::MAX as usize); + + if bytes.len() as u32 <= Self::MAX_INLINE_SIZE { + Self::new_inline(bytes) } else { - let prefix_buf: [u8; 4] = std::array::from_fn(|i| *bytes.get(i).unwrap_or(&0)); Self { length: bytes.len() as u32, - prefix: u32::from_le_bytes(prefix_buf), + prefix: u32::from_le_bytes(bytes[0..4].try_into().unwrap()), buffer_idx, offset, } @@ -190,8 +240,8 @@ where { for view in views { let len = view.length; - if len <= INLINE_VIEW_SIZE { - if len < INLINE_VIEW_SIZE && view.as_u128() >> (32 + len * 8) != 0 { + if len <= View::MAX_INLINE_SIZE { + if len < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + len * 8) != 0 { polars_bail!(ComputeError: "view contained non-zero padding in prefix"); } diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs index 0dd22cf51d9e..c2c0c958032d 100644 --- a/crates/polars-arrow/src/array/mod.rs +++ b/crates/polars-arrow/src/array/mod.rs @@ -763,7 +763,7 @@ mod values; pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray}; pub use binview::{ BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, MutablePlBinary, - MutablePlString, Utf8ViewArray, View, ViewType, INLINE_VIEW_SIZE, + MutablePlString, Utf8ViewArray, View, ViewType, }; pub use boolean::{BooleanArray, MutableBooleanArray}; pub use dictionary::{DictionaryArray, DictionaryKey, 
MutableDictionaryArray}; diff --git a/crates/polars-arrow/src/bitmap/utils/iterator.rs b/crates/polars-arrow/src/bitmap/utils/iterator.rs index 18dc8e20a815..e95246483d20 100644 --- a/crates/polars-arrow/src/bitmap/utils/iterator.rs +++ b/crates/polars-arrow/src/bitmap/utils/iterator.rs @@ -46,6 +46,91 @@ impl<'a> BitmapIter<'a> { rest_len, } } + + /// Consume and returns the numbers of `1` / `true` values at the beginning of the iterator. + /// + /// This performs the same operation as `(&mut iter).take_while(|b| b).count()`. + /// + /// This is a lot more efficient than consecutively polling the iterator and should therefore + /// be preferred, if the use-case allows for it. + pub fn take_leading_ones(&mut self) -> usize { + let word_ones = usize::min(self.word_len, self.word.trailing_ones() as usize); + self.word_len -= word_ones; + self.word = self.word.wrapping_shr(word_ones as u32); + + if self.word_len != 0 { + return word_ones; + } + + let mut num_leading_ones = word_ones; + + while self.rest_len != 0 { + self.word_len = usize::min(self.rest_len, 64); + self.rest_len -= self.word_len; + + unsafe { + let chunk = self.bytes.get_unchecked(..8).try_into().unwrap(); + self.word = u64::from_le_bytes(chunk); + self.bytes = self.bytes.get_unchecked(8..); + } + + let word_ones = usize::min(self.word_len, self.word.trailing_ones() as usize); + self.word_len -= word_ones; + self.word = self.word.wrapping_shr(word_ones as u32); + num_leading_ones += word_ones; + + if self.word_len != 0 { + return num_leading_ones; + } + } + + num_leading_ones + } + + /// Consume and returns the numbers of `0` / `false` values that the start of the iterator. + /// + /// This performs the same operation as `(&mut iter).take_while(|b| !b).count()`. + /// + /// This is a lot more efficient than consecutively polling the iterator and should therefore + /// be preferred, if the use-case allows for it. 
+ pub fn take_leading_zeros(&mut self) -> usize { + let word_zeros = usize::min(self.word_len, self.word.trailing_zeros() as usize); + self.word_len -= word_zeros; + self.word = self.word.wrapping_shr(word_zeros as u32); + + if self.word_len != 0 { + return word_zeros; + } + + let mut num_leading_zeros = word_zeros; + + while self.rest_len != 0 { + self.word_len = usize::min(self.rest_len, 64); + self.rest_len -= self.word_len; + unsafe { + let chunk = self.bytes.get_unchecked(..8).try_into().unwrap(); + self.word = u64::from_le_bytes(chunk); + self.bytes = self.bytes.get_unchecked(8..); + } + + let word_zeros = usize::min(self.word_len, self.word.trailing_zeros() as usize); + self.word_len -= word_zeros; + self.word = self.word.wrapping_shr(word_zeros as u32); + num_leading_zeros += word_zeros; + + if self.word_len != 0 { + return num_leading_zeros; + } + } + + num_leading_zeros + } + + /// Returns the number of remaining elements in the iterator + #[inline] + pub fn num_remaining(&self) -> usize { + self.word_len + self.rest_len + } } impl<'a> Iterator for BitmapIter<'a> { @@ -53,35 +138,31 @@ impl<'a> Iterator for BitmapIter<'a> { #[inline] fn next(&mut self) -> Option { - if self.word_len != 0 { - let ret = self.word & 1 != 0; - self.word >>= 1; - self.word_len -= 1; - return Some(ret); - } + if self.word_len == 0 { + if self.rest_len == 0 { + return None; + } - if self.rest_len != 0 { self.word_len = self.rest_len.min(64); self.rest_len -= self.word_len; + unsafe { let chunk = self.bytes.get_unchecked(..8).try_into().unwrap(); self.word = u64::from_le_bytes(chunk); self.bytes = self.bytes.get_unchecked(8..); } - - let ret = self.word & 1 != 0; - self.word >>= 1; - self.word_len -= 1; - return Some(ret); } - None + let ret = self.word & 1 != 0; + self.word >>= 1; + self.word_len -= 1; + Some(ret) } #[inline] fn size_hint(&self) -> (usize, Option) { - let exact = self.word_len + self.rest_len; - (exact, Some(exact)) + let num_remaining = self.num_remaining(); + 
(num_remaining, Some(num_remaining)) } } @@ -102,3 +183,59 @@ impl<'a> DoubleEndedIterator for BitmapIter<'a> { unsafe impl TrustedLen for BitmapIter<'_> {} impl ExactSizeIterator for BitmapIter<'_> {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[ignore = "Fuzz test. Too slow"] + fn test_leading_ops() { + for _ in 0..10_000 { + let bs = rand::random::() % 4; + + let mut length = 0; + let mut pattern = Vec::new(); + for _ in 0..rand::random::() % 1024 { + let word = match bs { + 0 => u64::MIN, + 1 => u64::MAX, + 2 | 3 => rand::random(), + _ => unreachable!(), + }; + + pattern.extend_from_slice(&word.to_le_bytes()); + length += 64; + } + + for _ in 0..rand::random::() % 7 { + pattern.push(rand::random::()); + length += 8; + } + + let last_length = rand::random::() % 8; + if last_length != 0 { + pattern.push(rand::random::()); + length += last_length; + } + + let mut iter = BitmapIter::new(&pattern, 0, length); + + let mut prev_remaining = iter.num_remaining(); + while iter.num_remaining() != 0 { + let num_ones = iter.clone().take_leading_ones(); + assert_eq!(num_ones, (&mut iter).take_while(|&b| b).count()); + + let num_zeros = iter.clone().take_leading_zeros(); + assert_eq!(num_zeros, (&mut iter).take_while(|&b| !b).count()); + + // Ensure that we are making progress + assert!(iter.num_remaining() < prev_remaining); + prev_remaining = iter.num_remaining(); + } + + assert_eq!(iter.take_leading_zeros(), 0); + assert_eq!(iter.take_leading_ones(), 0); + } + } +} diff --git a/crates/polars-arrow/src/doc/lib.md b/crates/polars-arrow/src/doc/lib.md index 61bc87c4d7b3..dd10d361bd80 100644 --- a/crates/polars-arrow/src/doc/lib.md +++ b/crates/polars-arrow/src/doc/lib.md @@ -42,7 +42,7 @@ fn main() -> Result<()> { write_statistics: true, compression: CompressionOptions::Snappy, version: Version::V1, - data_pagesize_limit: None, + data_page_size: None, }; let row_groups = RowGroupIterator::try_new( diff --git a/crates/polars-arrow/src/legacy/utils.rs 
b/crates/polars-arrow/src/legacy/utils.rs index a9a6b7d0ed78..af482171a1a9 100644 --- a/crates/polars-arrow/src/legacy/utils.rs +++ b/crates/polars-arrow/src/legacy/utils.rs @@ -18,7 +18,7 @@ pub trait CustomIterTools: Iterator { where Self: Sized, { - TrustMyLength::new(self, length) + unsafe { TrustMyLength::new(self, length) } } fn collect_trusted>(self) -> T diff --git a/crates/polars-arrow/src/pushable.rs b/crates/polars-arrow/src/pushable.rs index 025e78275dcd..de642833bc6a 100644 --- a/crates/polars-arrow/src/pushable.rs +++ b/crates/polars-arrow/src/pushable.rs @@ -19,6 +19,12 @@ pub trait Pushable: Sized + Default { fn push(&mut self, value: T); fn len(&self) -> usize; fn push_null(&mut self); + #[inline] + fn extend_n(&mut self, n: usize, iter: impl Iterator) { + for item in iter.take(n) { + self.push(item); + } + } fn extend_constant(&mut self, additional: usize, value: T); fn extend_null_constant(&mut self, additional: usize); fn freeze(self) -> Self::Freeze; @@ -31,6 +37,7 @@ impl Pushable for MutableBitmap { fn reserve(&mut self, additional: usize) { MutableBitmap::reserve(self, additional) } + #[inline] fn len(&self) -> usize { self.len() @@ -82,6 +89,11 @@ impl Pushable for Vec { self.push(value) } + #[inline] + fn extend_n(&mut self, n: usize, iter: impl Iterator) { + self.extend(iter.take(n)); + } + #[inline] fn extend_constant(&mut self, additional: usize, value: T) { self.resize(self.len() + additional, value); diff --git a/crates/polars-arrow/src/trusted_len.rs b/crates/polars-arrow/src/trusted_len.rs index 3237ba83cbb2..5f194770e7c4 100644 --- a/crates/polars-arrow/src/trusted_len.rs +++ b/crates/polars-arrow/src/trusted_len.rs @@ -87,8 +87,13 @@ impl TrustMyLength where I: Iterator, { + /// Create a new `TrustMyLength` iterator + /// + /// # Safety + /// + /// This is safe if the iterator always has the exact length given by `len`. 
#[inline] - pub fn new(iter: I, len: usize) -> Self { + pub unsafe fn new(iter: I, len: usize) -> Self { Self { iter, len } } } @@ -104,6 +109,7 @@ where self.iter.next() } + #[inline] fn size_hint(&self) -> (usize, Option) { (self.len, Some(self.len)) } diff --git a/crates/polars-core/src/chunked_array/array/mod.rs b/crates/polars-core/src/chunked_array/array/mod.rs index 96fee06ff3b2..a3b7a1a1f339 100644 --- a/crates/polars-core/src/chunked_array/array/mod.rs +++ b/crates/polars-core/src/chunked_array/array/mod.rs @@ -44,7 +44,9 @@ impl ArrayChunked { ) -> PolarsResult { // Rechunk or the generated Series will have wrong length. let ca = self.rechunk(); - let field = self.inner_dtype().to_arrow_field("item", true); + let field = self + .inner_dtype() + .to_arrow_field("item", CompatLevel::newest()); let chunks = ca.downcast_iter().map(|arr| { let elements = unsafe { @@ -66,8 +68,10 @@ impl ArrayChunked { let out = out.rechunk(); let values = out.chunks()[0].clone(); - let inner_dtype = - FixedSizeListArray::default_datatype(out.dtype().to_arrow(true), ca.width()); + let inner_dtype = FixedSizeListArray::default_datatype( + out.dtype().to_arrow(CompatLevel::newest()), + ca.width(), + ); let arr = FixedSizeListArray::new(inner_dtype, values, arr.validity().cloned()); Ok(arr) }); diff --git a/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs b/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs index a419ee930401..e235d08ffbd6 100644 --- a/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs +++ b/crates/polars-core/src/chunked_array/builder/fixed_size_list.rs @@ -124,7 +124,7 @@ impl FixedSizeListBuilder for AnonymousOwnedFixedSizeListBuilder { .finish( self.inner_dtype .as_ref() - .map(|dt| dt.to_arrow(true)) + .map(|dt| dt.to_arrow(CompatLevel::newest())) .as_ref(), ) .unwrap(); diff --git a/crates/polars-core/src/chunked_array/builder/list/anonymous.rs b/crates/polars-core/src/chunked_array/builder/list/anonymous.rs index 
1fb5393db1df..99b566320fbf 100644 --- a/crates/polars-core/src/chunked_array/builder/list/anonymous.rs +++ b/crates/polars-core/src/chunked_array/builder/list/anonymous.rs @@ -89,7 +89,7 @@ impl<'a> AnonymousListBuilder<'a> { let inner_dtype_physical = inner_dtype .as_ref() - .map(|dt| dt.to_physical().to_arrow(true)); + .map(|dt| dt.to_physical().to_arrow(CompatLevel::newest())); let arr = slf.builder.finish(inner_dtype_physical.as_ref()).unwrap(); let list_dtype_logical = match inner_dtype { @@ -157,7 +157,7 @@ impl ListBuilderTrait for AnonymousOwnedListBuilder { let slf = std::mem::take(self); let inner_dtype_physical = inner_dtype .as_ref() - .map(|dt| dt.to_physical().to_arrow(true)); + .map(|dt| dt.to_physical().to_arrow(CompatLevel::newest())); let arr = slf.builder.finish(inner_dtype_physical.as_ref()).unwrap(); let list_dtype_logical = match inner_dtype { diff --git a/crates/polars-core/src/chunked_array/builder/list/primitive.rs b/crates/polars-core/src/chunked_array/builder/list/primitive.rs index ce9caff3a116..34e12433db7a 100644 --- a/crates/polars-core/src/chunked_array/builder/list/primitive.rs +++ b/crates/polars-core/src/chunked_array/builder/list/primitive.rs @@ -39,7 +39,7 @@ where ) -> Self { let values = MutablePrimitiveArray::::with_capacity_from( values_capacity, - values_type.to_arrow(true), + values_type.to_arrow(CompatLevel::newest()), ); let builder = LargePrimitiveBuilder::::new_with_capacity(values, capacity); let field = Field::new(name, DataType::List(Box::new(logical_type))); diff --git a/crates/polars-core/src/chunked_array/builder/mod.rs b/crates/polars-core/src/chunked_array/builder/mod.rs index 3c1060016101..bac88f5a1ea5 100644 --- a/crates/polars-core/src/chunked_array/builder/mod.rs +++ b/crates/polars-core/src/chunked_array/builder/mod.rs @@ -66,7 +66,7 @@ where T: PolarsNumericType, { fn from_slice(name: &str, v: &[T::Native]) -> Self { - let arr = PrimitiveArray::from_slice(v).to(T::get_dtype().to_arrow(true)); + let arr = 
PrimitiveArray::from_slice(v).to(T::get_dtype().to_arrow(CompatLevel::newest())); ChunkedArray::with_chunk(name, arr) } diff --git a/crates/polars-core/src/chunked_array/builder/primitive.rs b/crates/polars-core/src/chunked_array/builder/primitive.rs index eaedc93a5a80..14eb2c1f4f46 100644 --- a/crates/polars-core/src/chunked_array/builder/primitive.rs +++ b/crates/polars-core/src/chunked_array/builder/primitive.rs @@ -41,7 +41,7 @@ where { pub fn new(name: &str, capacity: usize) -> Self { let array_builder = MutablePrimitiveArray::::with_capacity(capacity) - .to(T::get_dtype().to_arrow(true)); + .to(T::get_dtype().to_arrow(CompatLevel::newest())); PrimitiveChunkedBuilder { array_builder, diff --git a/crates/polars-core/src/chunked_array/cast.rs b/crates/polars-core/src/chunked_array/cast.rs index ea74c39e64e7..892d28203e55 100644 --- a/crates/polars-core/src/chunked_array/cast.rs +++ b/crates/polars-core/src/chunked_array/cast.rs @@ -51,7 +51,7 @@ pub(crate) fn cast_chunks( let check_nulls = matches!(options, CastOptions::Strict); let options = options.into(); - let arrow_dtype = dtype.to_arrow(true); + let arrow_dtype = dtype.try_to_arrow(CompatLevel::newest())?; chunks .iter() .map(|arr| { diff --git a/crates/polars-core/src/chunked_array/collect.rs b/crates/polars-core/src/chunked_array/collect.rs index 6131d741eac6..054f59de8958 100644 --- a/crates/polars-core/src/chunked_array/collect.rs +++ b/crates/polars-core/src/chunked_array/collect.rs @@ -18,6 +18,7 @@ use crate::chunked_array::ChunkedArray; use crate::datatypes::{ ArrayCollectIterExt, ArrayFromIter, ArrayFromIterDtype, DataType, Field, PolarsDataType, }; +use crate::prelude::CompatLevel; pub trait ChunkedCollectIterExt: Iterator + Sized { #[inline] @@ -26,7 +27,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { T::Array: ArrayFromIterDtype, { let field = Arc::new(Field::new(name, dtype.clone())); - let arr = self.collect_arr_with_dtype(field.dtype.to_arrow(true)); + let arr = 
self.collect_arr_with_dtype(field.dtype.to_arrow(CompatLevel::newest())); ChunkedArray::from_chunk_iter_and_field(field, [arr]) } @@ -36,7 +37,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { T::Array: ArrayFromIterDtype, { let field = Arc::clone(&name_dtype_src.field); - let arr = self.collect_arr_with_dtype(field.dtype.to_arrow(true)); + let arr = self.collect_arr_with_dtype(field.dtype.to_arrow(CompatLevel::newest())); ChunkedArray::from_chunk_iter_and_field(field, [arr]) } @@ -47,7 +48,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { Self: TrustedLen, { let field = Arc::new(Field::new(name, dtype.clone())); - let arr = self.collect_arr_trusted_with_dtype(field.dtype.to_arrow(true)); + let arr = self.collect_arr_trusted_with_dtype(field.dtype.to_arrow(CompatLevel::newest())); ChunkedArray::from_chunk_iter_and_field(field, [arr]) } @@ -58,7 +59,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { Self: TrustedLen, { let field = Arc::clone(&name_dtype_src.field); - let arr = self.collect_arr_trusted_with_dtype(field.dtype.to_arrow(true)); + let arr = self.collect_arr_trusted_with_dtype(field.dtype.to_arrow(CompatLevel::newest())); ChunkedArray::from_chunk_iter_and_field(field, [arr]) } @@ -73,7 +74,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { Self: Iterator>, { let field = Arc::new(Field::new(name, dtype.clone())); - let arr = self.try_collect_arr_with_dtype(field.dtype.to_arrow(true))?; + let arr = self.try_collect_arr_with_dtype(field.dtype.to_arrow(CompatLevel::newest()))?; Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) } @@ -87,7 +88,7 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { Self: Iterator>, { let field = Arc::clone(&name_dtype_src.field); - let arr = self.try_collect_arr_with_dtype(field.dtype.to_arrow(true))?; + let arr = self.try_collect_arr_with_dtype(field.dtype.to_arrow(CompatLevel::newest()))?; Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) } @@ -102,7 +103,8 @@ pub trait 
ChunkedCollectIterExt: Iterator + Sized { Self: Iterator> + TrustedLen, { let field = Arc::new(Field::new(name, dtype.clone())); - let arr = self.try_collect_arr_trusted_with_dtype(field.dtype.to_arrow(true))?; + let arr = + self.try_collect_arr_trusted_with_dtype(field.dtype.to_arrow(CompatLevel::newest()))?; Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) } @@ -116,7 +118,8 @@ pub trait ChunkedCollectIterExt: Iterator + Sized { Self: Iterator> + TrustedLen, { let field = Arc::clone(&name_dtype_src.field); - let arr = self.try_collect_arr_trusted_with_dtype(field.dtype.to_arrow(true))?; + let arr = + self.try_collect_arr_trusted_with_dtype(field.dtype.to_arrow(CompatLevel::newest()))?; Ok(ChunkedArray::from_chunk_iter_and_field(field, [arr])) } } diff --git a/crates/polars-core/src/chunked_array/from.rs b/crates/polars-core/src/chunked_array/from.rs index af4892890336..56ac8cb90604 100644 --- a/crates/polars-core/src/chunked_array/from.rs +++ b/crates/polars-core/src/chunked_array/from.rs @@ -217,7 +217,10 @@ where #[cfg(debug_assertions)] { if !chunks.is_empty() && !chunks[0].is_empty() && dtype.is_primitive() { - assert_eq!(chunks[0].data_type(), &dtype.to_arrow(true)) + assert_eq!( + chunks[0].data_type(), + &dtype.to_arrow(CompatLevel::newest()) + ) } } let field = Arc::new(Field::new(name, dtype)); @@ -234,7 +237,10 @@ where } pub fn full_null_like(ca: &Self, length: usize) -> Self { - let chunks = std::iter::once(T::Array::full_null(length, ca.dtype().to_arrow(true))); + let chunks = std::iter::once(T::Array::full_null( + length, + ca.dtype().to_arrow(CompatLevel::newest()), + )); Self::from_chunk_iter_like(ca, chunks) } } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/from.rs b/crates/polars-core/src/chunked_array/logical/categorical/from.rs index 568d5650ba8e..5df0a2691583 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/from.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/from.rs @@ 
-3,8 +3,8 @@ use arrow::datatypes::IntegerType; use super::*; -fn convert_values(arr: &Utf8ViewArray, pl_flavor: bool) -> ArrayRef { - if pl_flavor { +fn convert_values(arr: &Utf8ViewArray, compat_level: CompatLevel) -> ArrayRef { + if compat_level.0 >= 1 { arr.clone().boxed() } else { utf8view_to_utf8::(arr).boxed() @@ -12,16 +12,16 @@ fn convert_values(arr: &Utf8ViewArray, pl_flavor: bool) -> ArrayRef { } impl CategoricalChunked { - pub fn to_arrow(&self, pl_flavor: bool, as_i64: bool) -> ArrayRef { + pub fn to_arrow(&self, compat_level: CompatLevel, as_i64: bool) -> ArrayRef { if as_i64 { - self.to_i64(pl_flavor).boxed() + self.to_i64(compat_level).boxed() } else { - self.to_u32(pl_flavor).boxed() + self.to_u32(compat_level).boxed() } } - fn to_u32(&self, pl_flavor: bool) -> DictionaryArray { - let values_dtype = if pl_flavor { + fn to_u32(&self, compat_level: CompatLevel) -> DictionaryArray { + let values_dtype = if compat_level.0 >= 1 { ArrowDataType::Utf8View } else { ArrowDataType::LargeUtf8 @@ -32,7 +32,7 @@ impl CategoricalChunked { let dtype = ArrowDataType::Dictionary(IntegerType::UInt32, Box::new(values_dtype), false); match map { RevMapping::Local(arr, _) => { - let values = convert_values(arr, pl_flavor); + let values = convert_values(arr, compat_level); // SAFETY: // the keys are in bounds @@ -44,7 +44,7 @@ impl CategoricalChunked { .map(|opt_k| opt_k.map(|k| *reverse_map.get(k).unwrap())); let keys = PrimitiveArray::from_trusted_len_iter(iter); - let values = convert_values(values, pl_flavor); + let values = convert_values(values, compat_level); // SAFETY: // the keys are in bounds @@ -53,8 +53,8 @@ impl CategoricalChunked { } } - fn to_i64(&self, pl_flavor: bool) -> DictionaryArray { - let values_dtype = if pl_flavor { + fn to_i64(&self, compat_level: CompatLevel) -> DictionaryArray { + let values_dtype = if compat_level.0 >= 1 { ArrowDataType::Utf8View } else { ArrowDataType::LargeUtf8 @@ -65,7 +65,7 @@ impl CategoricalChunked { let dtype = 
ArrowDataType::Dictionary(IntegerType::Int64, Box::new(values_dtype), false); match map { RevMapping::Local(arr, _) => { - let values = convert_values(arr, pl_flavor); + let values = convert_values(arr, compat_level); // SAFETY: // the keys are in bounds @@ -89,7 +89,7 @@ impl CategoricalChunked { .map(|opt_k| opt_k.map(|k| *reverse_map.get(k).unwrap() as i64)); let keys = PrimitiveArray::from_trusted_len_iter(iter); - let values = convert_values(values, pl_flavor); + let values = convert_values(values, compat_level); // SAFETY: // the keys are in bounds diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 30ab29b5c78a..1b2fa0d06a7f 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -464,7 +464,7 @@ mod test { let ca = ca.cast(&DataType::Categorical(None, Default::default()))?; let ca = ca.categorical().unwrap(); - let arr = ca.to_arrow(true, false); + let arr = ca.to_arrow(CompatLevel::newest(), false); let s = Series::try_from(("foo", arr))?; assert!(matches!(s.dtype(), &DataType::Categorical(_, _))); assert_eq!(s.null_count(), 1); diff --git a/crates/polars-core/src/chunked_array/logical/decimal.rs b/crates/polars-core/src/chunked_array/logical/decimal.rs index 4526f0d63e99..acc19a522e46 100644 --- a/crates/polars-core/src/chunked_array/logical/decimal.rs +++ b/crates/polars-core/src/chunked_array/logical/decimal.rs @@ -20,7 +20,7 @@ impl Int128Chunked { let (_, values, validity) = default.into_inner(); *arr = PrimitiveArray::new( - DataType::Decimal(precision, Some(scale)).to_arrow(true), + DataType::Decimal(precision, Some(scale)).to_arrow(CompatLevel::newest()), values, validity, ); diff --git a/crates/polars-core/src/chunked_array/logical/struct_/mod.rs b/crates/polars-core/src/chunked_array/logical/struct_/mod.rs index 07bc343aee21..3fc4baa6a706 100644 --- 
a/crates/polars-core/src/chunked_array/logical/struct_/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/struct_/mod.rs @@ -48,12 +48,12 @@ fn fields_to_struct_array(fields: &[Series], physical: bool) -> (ArrayRef, Vec s.to_arrow(0, true), + DataType::Object(_, _) => s.to_arrow(0, CompatLevel::newest()), _ => { if physical { s.chunks()[0].clone() } else { - s.to_arrow(0, true) + s.to_arrow(0, CompatLevel::newest()) } }, } @@ -145,7 +145,7 @@ impl StructChunked { .iter() .map(|s| match s.dtype() { #[cfg(feature = "object")] - DataType::Object(_, _) => s.to_arrow(i, true), + DataType::Object(_, _) => s.to_arrow(i, CompatLevel::newest()), _ => s.chunks()[i].clone(), }) .collect::>(); @@ -295,11 +295,11 @@ impl StructChunked { self.into() } - pub(crate) fn to_arrow(&self, i: usize, pl_flavor: bool) -> ArrayRef { + pub(crate) fn to_arrow(&self, i: usize, compat_level: CompatLevel) -> ArrayRef { let values = self .fields .iter() - .map(|s| s.to_arrow(i, pl_flavor)) + .map(|s| s.to_arrow(i, compat_level)) .collect::>(); // we determine fields from arrays as there might be object arrays diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index 3cc4f06b798e..176bbd11fc90 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -933,7 +933,11 @@ pub(crate) fn to_primitive( values: Vec, validity: Option, ) -> PrimitiveArray { - PrimitiveArray::new(T::get_dtype().to_arrow(true), values.into(), validity) + PrimitiveArray::new( + T::get_dtype().to_arrow(CompatLevel::newest()), + values.into(), + validity, + ) } pub(crate) fn to_array( diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index f6d7db426cd1..632afe52d889 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -42,13 +42,13 @@ where let out: U::Array = arr .values_iter() 
.map(&mut op) - .collect_arr_with_dtype(dtype.to_arrow(true)); + .collect_arr_with_dtype(dtype.to_arrow(CompatLevel::newest())); out.with_validity_typed(arr.validity().cloned()) } else { let out: U::Array = arr .iter() .map(|opt| opt.map(&mut op)) - .collect_arr_with_dtype(dtype.to_arrow(true)); + .collect_arr_with_dtype(dtype.to_arrow(CompatLevel::newest())); out.with_validity_typed(arr.validity().cloned()) } }); @@ -133,7 +133,7 @@ where drop(arr); let compute_immutable = |arr: &PrimitiveArray| { - arrow::compute::arity::unary(arr, f, S::get_dtype().to_arrow(true)) + arrow::compute::arity::unary(arr, f, S::get_dtype().to_arrow(CompatLevel::newest())) }; if owned_arr.values().is_sliced() { diff --git a/crates/polars-core/src/chunked_array/ops/arity.rs b/crates/polars-core/src/chunked_array/ops/arity.rs index 0f3cf8cce4ca..ed9e6dab79ec 100644 --- a/crates/polars-core/src/chunked_array/ops/arity.rs +++ b/crates/polars-core/src/chunked_array/ops/arity.rs @@ -6,7 +6,7 @@ use polars_error::PolarsResult; use crate::chunked_array::metadata::MetadataProperties; use crate::datatypes::{ArrayCollectIterExt, ArrayFromIter}; -use crate::prelude::{ChunkedArray, PolarsDataType, Series}; +use crate::prelude::{ChunkedArray, CompatLevel, PolarsDataType, Series}; use crate::utils::{align_chunks_binary, align_chunks_binary_owned, align_chunks_ternary}; // We need this helper because for<'a> notation can't yet be applied properly @@ -106,7 +106,7 @@ where V::Array: ArrayFromIter<>>::Ret>, { if ca.null_count() == ca.len() { - let arr = V::Array::full_null(ca.len(), V::get_dtype().to_arrow(true)); + let arr = V::Array::full_null(ca.len(), V::get_dtype().to_arrow(CompatLevel::newest())); return ChunkedArray::with_chunk(ca.name(), arr); } @@ -130,7 +130,7 @@ where V::Array: ArrayFromIter, { if ca.null_count() == ca.len() { - let arr = V::Array::full_null(ca.len(), V::get_dtype().to_arrow(true)); + let arr = V::Array::full_null(ca.len(), V::get_dtype().to_arrow(CompatLevel::newest())); 
return Ok(ChunkedArray::with_chunk(ca.name(), arr)); } @@ -308,7 +308,7 @@ where { if lhs.null_count() == lhs.len() || rhs.null_count() == rhs.len() { let len = lhs.len().min(rhs.len()); - let arr = V::Array::full_null(len, V::get_dtype().to_arrow(true)); + let arr = V::Array::full_null(len, V::get_dtype().to_arrow(CompatLevel::newest())); return ChunkedArray::with_chunk(lhs.name(), arr); } @@ -704,7 +704,7 @@ where let min = lhs.len().min(rhs.len()); let max = lhs.len().max(rhs.len()); let len = if min == 1 { max } else { min }; - let arr = V::Array::full_null(len, V::get_dtype().to_arrow(true)); + let arr = V::Array::full_null(len, V::get_dtype().to_arrow(CompatLevel::newest())); return ChunkedArray::with_chunk(lhs.name(), arr); } @@ -745,7 +745,10 @@ where let opt_rhs = rhs.get(0); match opt_rhs { None => { - let arr = O::Array::full_null(lhs.len(), O::get_dtype().to_arrow(true)); + let arr = O::Array::full_null( + lhs.len(), + O::get_dtype().to_arrow(CompatLevel::newest()), + ); ChunkedArray::::with_chunk(lhs.name(), arr) }, Some(rhs) => unary_kernel(lhs, |arr| rhs_broadcast_kernel(arr, rhs.clone())), @@ -755,7 +758,10 @@ where let opt_lhs = lhs.get(0); match opt_lhs { None => { - let arr = O::Array::full_null(rhs.len(), O::get_dtype().to_arrow(true)); + let arr = O::Array::full_null( + rhs.len(), + O::get_dtype().to_arrow(CompatLevel::newest()), + ); ChunkedArray::::with_chunk(lhs.name(), arr) }, Some(lhs) => unary_kernel(rhs, |arr| lhs_broadcast_kernel(lhs.clone(), arr)), @@ -789,7 +795,10 @@ where let opt_rhs = rhs.get(0); match opt_rhs { None => { - let arr = O::Array::full_null(lhs.len(), O::get_dtype().to_arrow(true)); + let arr = O::Array::full_null( + lhs.len(), + O::get_dtype().to_arrow(CompatLevel::newest()), + ); ChunkedArray::::with_chunk(lhs.name(), arr) }, Some(rhs) => unary_kernel_owned(lhs, |arr| rhs_broadcast_kernel(arr, rhs.clone())), @@ -799,7 +808,10 @@ where let opt_lhs = lhs.get(0); match opt_lhs { None => { - let arr = 
O::Array::full_null(rhs.len(), O::get_dtype().to_arrow(true)); + let arr = O::Array::full_null( + rhs.len(), + O::get_dtype().to_arrow(CompatLevel::newest()), + ); ChunkedArray::::with_chunk(lhs.name(), arr) }, Some(lhs) => unary_kernel_owned(rhs, |arr| lhs_broadcast_kernel(lhs.clone(), arr)), diff --git a/crates/polars-core/src/chunked_array/ops/bit_repr.rs b/crates/polars-core/src/chunked_array/ops/bit_repr.rs index 37617a1d43ea..9a2f1c33594a 100644 --- a/crates/polars-core/src/chunked_array/ops/bit_repr.rs +++ b/crates/polars-core/src/chunked_array/ops/bit_repr.rs @@ -46,7 +46,7 @@ fn reinterpret_list_chunked( let pa = PrimitiveArray::from_data_default(reinterpreted_buf, inner_arr.validity().cloned()); LargeListArray::new( - DataType::List(Box::new(U::get_dtype())).to_arrow(true), + DataType::List(Box::new(U::get_dtype())).to_arrow(CompatLevel::newest()), array.offsets().clone(), pa.to_boxed(), array.validity().cloned(), diff --git a/crates/polars-core/src/chunked_array/ops/explode.rs b/crates/polars-core/src/chunked_array/ops/explode.rs index 910d2941b28d..f469209558b6 100644 --- a/crates/polars-core/src/chunked_array/ops/explode.rs +++ b/crates/polars-core/src/chunked_array/ops/explode.rs @@ -150,7 +150,7 @@ where unsafe { set_bit_unchecked(validity_slice, i, false) } } let arr = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), new_values.into(), Some(validity.into()), ); @@ -269,7 +269,9 @@ impl ExplodeByOffsets for ListChunked { last = o; } process_range(start, last, &mut builder); - let arr = builder.finish(Some(&inner_type.to_arrow(true))).unwrap(); + let arr = builder + .finish(Some(&inner_type.to_arrow(CompatLevel::newest()))) + .unwrap(); let mut ca = unsafe { self.copy_with_chunks(vec![Box::new(arr)]) }; use MetadataProperties as P; diff --git a/crates/polars-core/src/chunked_array/ops/fill_null.rs b/crates/polars-core/src/chunked_array/ops/fill_null.rs index f09852f94f5e..1a20cadc7175 100644 --- 
a/crates/polars-core/src/chunked_array/ops/fill_null.rs +++ b/crates/polars-core/src/chunked_array/ops/fill_null.rs @@ -306,7 +306,7 @@ where ChunkedArray::from_chunk_iter_like( ca, [ - T::Array::from_zeroable_vec(values, ca.dtype().to_arrow(true)) + T::Array::from_zeroable_vec(values, ca.dtype().to_arrow(CompatLevel::newest())) .with_validity_typed(Some(bm.into())), ], ) @@ -340,7 +340,7 @@ where ChunkedArray::from_chunk_iter_like( ca, [ - T::Array::from_zeroable_vec(values, ca.dtype().to_arrow(true)) + T::Array::from_zeroable_vec(values, ca.dtype().to_arrow(CompatLevel::newest())) .with_validity_typed(Some(bm.into())), ], ) diff --git a/crates/polars-core/src/chunked_array/ops/filter.rs b/crates/polars-core/src/chunked_array/ops/filter.rs index a6cf1ca98288..d0e6fe59c7e6 100644 --- a/crates/polars-core/src/chunked_array/ops/filter.rs +++ b/crates/polars-core/src/chunked_array/ops/filter.rs @@ -120,7 +120,9 @@ impl ChunkFilter for ListChunked { Some(true) => Ok(self.clone()), _ => Ok(ListChunked::from_chunk_iter( self.name(), - [ListArray::new_empty(self.dtype().to_arrow(true))], + [ListArray::new_empty( + self.dtype().to_arrow(CompatLevel::newest()), + )], )), }; } @@ -146,7 +148,9 @@ impl ChunkFilter for ArrayChunked { Some(true) => Ok(self.clone()), _ => Ok(ArrayChunked::from_chunk_iter( self.name(), - [FixedSizeListArray::new_empty(self.dtype().to_arrow(true))], + [FixedSizeListArray::new_empty( + self.dtype().to_arrow(CompatLevel::newest()), + )], )), }; } diff --git a/crates/polars-core/src/chunked_array/ops/full.rs b/crates/polars-core/src/chunked_array/ops/full.rs index c04d68cce776..71d80749e618 100644 --- a/crates/polars-core/src/chunked_array/ops/full.rs +++ b/crates/polars-core/src/chunked_array/ops/full.rs @@ -21,7 +21,7 @@ where T: PolarsNumericType, { fn full_null(name: &str, length: usize) -> Self { - let arr = PrimitiveArray::new_null(T::get_dtype().to_arrow(true), length); + let arr = 
PrimitiveArray::new_null(T::get_dtype().to_arrow(CompatLevel::newest()), length); ChunkedArray::with_chunk(name, arr) } } @@ -55,7 +55,7 @@ impl<'a> ChunkFull<&'a str> for StringChunked { impl ChunkFullNull for StringChunked { fn full_null(name: &str, length: usize) -> Self { - let arr = Utf8ViewArray::new_null(DataType::String.to_arrow(true), length); + let arr = Utf8ViewArray::new_null(DataType::String.to_arrow(CompatLevel::newest()), length); ChunkedArray::with_chunk(name, arr) } } @@ -72,7 +72,8 @@ impl<'a> ChunkFull<&'a [u8]> for BinaryChunked { impl ChunkFullNull for BinaryChunked { fn full_null(name: &str, length: usize) -> Self { - let arr = BinaryViewArray::new_null(DataType::Binary.to_arrow(true), length); + let arr = + BinaryViewArray::new_null(DataType::Binary.to_arrow(CompatLevel::newest()), length); ChunkedArray::with_chunk(name, arr) } } @@ -90,7 +91,10 @@ impl<'a> ChunkFull<&'a [u8]> for BinaryOffsetChunked { impl ChunkFullNull for BinaryOffsetChunked { fn full_null(name: &str, length: usize) -> Self { - let arr = BinaryArray::::new_null(DataType::BinaryOffset.to_arrow(true), length); + let arr = BinaryArray::::new_null( + DataType::BinaryOffset.to_arrow(CompatLevel::newest()), + length, + ); ChunkedArray::with_chunk(name, arr) } } @@ -122,7 +126,11 @@ impl ArrayChunked { ) -> ArrayChunked { let arr = FixedSizeListArray::new_null( ArrowDataType::FixedSizeList( - Box::new(ArrowField::new("item", inner_dtype.to_arrow(true), true)), + Box::new(ArrowField::new( + "item", + inner_dtype.to_arrow(CompatLevel::newest()), + true, + )), width, ), length, @@ -137,7 +145,11 @@ impl ChunkFull<&Series> for ArrayChunked { let width = value.len(); let dtype = value.dtype(); let arrow_dtype = ArrowDataType::FixedSizeList( - Box::new(ArrowField::new("item", dtype.to_arrow(true), true)), + Box::new(ArrowField::new( + "item", + dtype.to_arrow(CompatLevel::newest()), + true, + )), width, ); let value = value.rechunk().chunks()[0].clone(); @@ -158,7 +170,7 @@ impl 
ListChunked { let arr: ListArray = ListArray::new_null( ArrowDataType::LargeList(Box::new(ArrowField::new( "item", - inner_dtype.to_physical().to_arrow(true), + inner_dtype.to_physical().to_arrow(CompatLevel::newest()), true, ))), length, diff --git a/crates/polars-core/src/chunked_array/ops/gather.rs b/crates/polars-core/src/chunked_array/ops/gather.rs index 67c3b980d516..21bf6479dfc4 100644 --- a/crates/polars-core/src/chunked_array/ops/gather.rs +++ b/crates/polars-core/src/chunked_array/ops/gather.rs @@ -155,7 +155,7 @@ where } let targets: Vec<_> = ca.downcast_iter().collect(); let arr = gather_idx_array_unchecked( - ca.dtype().to_arrow(true), + ca.dtype().to_arrow(CompatLevel::newest()), &targets, ca.null_count() > 0, indices.as_ref(), @@ -192,7 +192,7 @@ where let targets: Vec<_> = ca.downcast_iter().collect(); let chunks = indices.downcast_iter().map(|idx_arr| { - let dtype = ca.dtype().to_arrow(true); + let dtype = ca.dtype().to_arrow(CompatLevel::newest()); if idx_arr.null_count() == 0 { gather_idx_array_unchecked(dtype, &targets, targets_have_nulls, idx_arr.values()) } else if targets.len() == 1 { diff --git a/crates/polars-core/src/chunked_array/ops/rolling_window.rs b/crates/polars-core/src/chunked_array/ops/rolling_window.rs index ef3c07529504..c5898edb4df1 100644 --- a/crates/polars-core/src/chunked_array/ops/rolling_window.rs +++ b/crates/polars-core/src/chunked_array/ops/rolling_window.rs @@ -270,7 +270,7 @@ mod inner_mod { } } let arr = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), values.into(), Some(validity.into()), ); diff --git a/crates/polars-core/src/chunked_array/ops/set.rs b/crates/polars-core/src/chunked_array/ops/set.rs index 52646925a05c..abafab312c6a 100644 --- a/crates/polars-core/src/chunked_array/ops/set.rs +++ b/crates/polars-core/src/chunked_array/ops/set.rs @@ -55,7 +55,7 @@ where self.downcast_iter().next().unwrap(), idx, value, - T::get_dtype().to_arrow(true), + 
T::get_dtype().to_arrow(CompatLevel::newest()), )?; return Ok(Self::with_chunk(self.name(), arr)); } @@ -101,7 +101,14 @@ where let chunks = left .downcast_iter() .zip(mask.downcast_iter()) - .map(|(arr, mask)| set_with_mask(arr, mask, value, T::get_dtype().to_arrow(true))); + .map(|(arr, mask)| { + set_with_mask( + arr, + mask, + value, + T::get_dtype().to_arrow(CompatLevel::newest()), + ) + }); Ok(ChunkedArray::from_chunk_iter(self.name(), chunks)) } else { // slow path, could be optimized. diff --git a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs index 8b909f09930a..1e86cda3e33b 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/arg_sort_multiple.rs @@ -94,12 +94,12 @@ pub fn _get_rows_encoded_compat_array(by: &Series) -> PolarsResult { DataType::Categorical(_, _) | DataType::Enum(_, _) => { let ca = by.categorical().unwrap(); if ca.uses_lexical_ordering() { - by.to_arrow(0, true) + by.to_arrow(0, CompatLevel::newest()) } else { ca.physical().chunks[0].clone() } }, - _ => by.to_arrow(0, true), + _ => by.to_arrow(0, CompatLevel::newest()), }; Ok(out) } diff --git a/crates/polars-core/src/chunked_array/ops/sort/mod.rs b/crates/polars-core/src/chunked_array/ops/sort/mod.rs index 907aeefa33d6..c2ef58a23c26 100644 --- a/crates/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/sort/mod.rs @@ -201,7 +201,7 @@ where } let arr = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), vals.into(), Some(create_validity(len, null_count, options.nulls_last)), ); diff --git a/crates/polars-core/src/chunked_array/ops/unique/mod.rs b/crates/polars-core/src/chunked_array/ops/unique/mod.rs index 2f4cc86d192f..989e30061478 100644 --- a/crates/polars-core/src/chunked_array/ops/unique/mod.rs +++ 
b/crates/polars-core/src/chunked_array/ops/unique/mod.rs @@ -126,7 +126,11 @@ where if !T::Native::is_float() && MetadataEnv::experimental_enabled() { let md = self.metadata(); if let (Some(min), Some(max)) = (md.get_min_value(), md.get_max_value()) { - let data_type = self.field.as_ref().data_type().to_arrow(false); + let data_type = self + .field + .as_ref() + .data_type() + .to_arrow(CompatLevel::oldest()); if let Some(mut state) = PrimitiveRangedUniqueState::new( *min, *max, @@ -268,7 +272,11 @@ impl ChunkUnique for BooleanChunked { fn unique(&self) -> PolarsResult { use polars_compute::unique::RangedUniqueKernel; - let data_type = self.field.as_ref().data_type().to_arrow(false); + let data_type = self + .field + .as_ref() + .data_type() + .to_arrow(CompatLevel::oldest()); let has_null = self.null_count() > 0; let mut state = BooleanUniqueKernelState::new(has_null, data_type); diff --git a/crates/polars-core/src/chunked_array/trusted_len.rs b/crates/polars-core/src/chunked_array/trusted_len.rs index af6a6d3f3c51..84ff13cb906d 100644 --- a/crates/polars-core/src/chunked_array/trusted_len.rs +++ b/crates/polars-core/src/chunked_array/trusted_len.rs @@ -17,7 +17,8 @@ where // SAFETY: iter is TrustedLen. let iter = iter.into_iter(); let arr = unsafe { - PrimitiveArray::from_trusted_len_iter_unchecked(iter).to(T::get_dtype().to_arrow(true)) + PrimitiveArray::from_trusted_len_iter_unchecked(iter) + .to(T::get_dtype().to_arrow(CompatLevel::newest())) }; arr.into() } @@ -37,7 +38,7 @@ where // SAFETY: iter is TrustedLen. 
let iter = iter.into_iter(); let values = unsafe { Vec::from_trusted_len_iter_unchecked(iter) }.into(); - let arr = PrimitiveArray::new(T::get_dtype().to_arrow(true), values, None); + let arr = PrimitiveArray::new(T::get_dtype().to_arrow(CompatLevel::newest()), values, None); NoNull::new(arr.into()) } } diff --git a/crates/polars-core/src/datatypes/dtype.rs b/crates/polars-core/src/datatypes/dtype.rs index 0144af1537dd..9fc2a0ba0fb0 100644 --- a/crates/polars-core/src/datatypes/dtype.rs +++ b/crates/polars-core/src/datatypes/dtype.rs @@ -55,6 +55,7 @@ pub enum DataType { Float64, /// Fixed point decimal type optional precision and non-negative scale. /// This is backed by a signed 128-bit integer which allows for up to 38 significant digits. + /// Meaning max precision is 38. #[cfg(feature = "dtype-decimal")] Decimal(Option, Option), // precision/scale; scale being None means "infer" /// String data @@ -497,7 +498,7 @@ impl DataType { } /// Convert to an Arrow Field - pub fn to_arrow_field(&self, name: &str, pl_flavor: bool) -> ArrowField { + pub fn to_arrow_field(&self, name: &str, compat_level: CompatLevel) -> ArrowField { let metadata = match self { #[cfg(feature = "dtype-categorical")] DataType::Enum(_, _) => Some(BTreeMap::from([( @@ -511,7 +512,7 @@ impl DataType { _ => None, }; - let field = ArrowField::new(name, self.to_arrow(pl_flavor), true); + let field = ArrowField::new(name, self.to_arrow(compat_level), true); if let Some(metadata) = metadata { field.with_metadata(metadata) @@ -522,12 +523,12 @@ impl DataType { /// Convert to an Arrow data type. 
#[inline] - pub fn to_arrow(&self, pl_flavor: bool) -> ArrowDataType { - self.try_to_arrow(pl_flavor).unwrap() + pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowDataType { + self.try_to_arrow(compat_level).unwrap() } #[inline] - pub fn try_to_arrow(&self, pl_flavor: bool) -> PolarsResult { + pub fn try_to_arrow(&self, compat_level: CompatLevel) -> PolarsResult { use DataType::*; match self { Boolean => Ok(ArrowDataType::Boolean), @@ -542,13 +543,17 @@ impl DataType { Float32 => Ok(ArrowDataType::Float32), Float64 => Ok(ArrowDataType::Float64), #[cfg(feature = "dtype-decimal")] - // note: what else can we do here other than setting precision to 38?.. - Decimal(precision, scale) => Ok(ArrowDataType::Decimal( - (*precision).unwrap_or(38), - scale.unwrap_or(0), // and what else can we do here? - )), + Decimal(precision, scale) => { + let precision = (*precision).unwrap_or(38); + polars_ensure!(precision <= 38 && precision > 0, InvalidOperation: "decimal precision should be <= 38 & >= 1"); + + Ok(ArrowDataType::Decimal( + precision, + scale.unwrap_or(0), // and what else can we do here? 
+ )) + }, String => { - let dt = if pl_flavor { + let dt = if compat_level.0 >= 1 { ArrowDataType::Utf8View } else { ArrowDataType::LargeUtf8 @@ -556,7 +561,7 @@ impl DataType { Ok(dt) }, Binary => { - let dt = if pl_flavor { + let dt = if compat_level.0 >= 1 { ArrowDataType::BinaryView } else { ArrowDataType::LargeBinary @@ -569,11 +574,11 @@ impl DataType { Time => Ok(ArrowDataType::Time64(ArrowTimeUnit::Nanosecond)), #[cfg(feature = "dtype-array")] Array(dt, size) => Ok(ArrowDataType::FixedSizeList( - Box::new(dt.to_arrow_field("item", pl_flavor)), + Box::new(dt.to_arrow_field("item", compat_level)), *size, )), List(dt) => Ok(ArrowDataType::LargeList(Box::new( - dt.to_arrow_field("item", pl_flavor), + dt.to_arrow_field("item", compat_level), ))), Null => Ok(ArrowDataType::Null), #[cfg(feature = "object")] @@ -587,7 +592,7 @@ impl DataType { }, #[cfg(feature = "dtype-categorical")] Categorical(_, _) | Enum(_, _) => { - let values = if pl_flavor { + let values = if compat_level.0 >= 1 { ArrowDataType::Utf8View } else { ArrowDataType::LargeUtf8 @@ -600,7 +605,10 @@ impl DataType { }, #[cfg(feature = "dtype-struct")] Struct(fields) => { - let fields = fields.iter().map(|fld| fld.to_arrow(pl_flavor)).collect(); + let fields = fields + .iter() + .map(|fld| fld.to_arrow(compat_level)) + .collect(); Ok(ArrowDataType::Struct(fields)) }, BinaryOffset => Ok(ArrowDataType::LargeBinary), @@ -610,7 +618,7 @@ impl DataType { UnknownKind::Float => ArrowDataType::Float64, UnknownKind::Str => ArrowDataType::Utf8View, UnknownKind::Int(v) => { - return materialize_dyn_int(*v).dtype().try_to_arrow(pl_flavor) + return materialize_dyn_int(*v).dtype().try_to_arrow(compat_level) }, }; Ok(dt) @@ -784,3 +792,31 @@ pub fn create_enum_data_type(categories: Utf8ViewArray) -> DataType { let rev_map = RevMapping::build_local(categories); DataType::Enum(Some(Arc::new(rev_map)), Default::default()) } + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct CompatLevel(pub(crate) u16); + +impl 
CompatLevel { + pub const fn newest() -> CompatLevel { + CompatLevel(1) + } + + pub const fn oldest() -> CompatLevel { + CompatLevel(0) + } + + // The following methods are only used internally + + #[doc(hidden)] + pub fn with_level(level: u16) -> PolarsResult { + if level > CompatLevel::newest().0 { + polars_bail!(InvalidOperation: "invalid compat level"); + } + Ok(CompatLevel(level)) + } + + #[doc(hidden)] + pub fn get_level(&self) -> u16 { + self.0 + } +} diff --git a/crates/polars-core/src/datatypes/field.rs b/crates/polars-core/src/datatypes/field.rs index 63a3bafe33fe..aea148546ef0 100644 --- a/crates/polars-core/src/datatypes/field.rs +++ b/crates/polars-core/src/datatypes/field.rs @@ -107,10 +107,10 @@ impl Field { /// let f = Field::new("Value", DataType::Int64); /// let af = arrow::datatypes::Field::new("Value", arrow::datatypes::ArrowDataType::Int64, true); /// - /// assert_eq!(f.to_arrow(true), af); + /// assert_eq!(f.to_arrow(CompatLevel::newest()), af); /// ``` - pub fn to_arrow(&self, pl_flavor: bool) -> ArrowField { - self.dtype.to_arrow_field(self.name.as_str(), pl_flavor) + pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowField { + self.dtype.to_arrow_field(self.name.as_str(), compat_level) } } diff --git a/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs b/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs index 1915797ee41f..812d89f0d240 100644 --- a/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs +++ b/crates/polars-core/src/frame/group_by/aggregations/agg_list.rs @@ -70,11 +70,13 @@ where }; let array = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), list_values.into(), validity, ); - let data_type = ListArray::::default_datatype(T::get_dtype().to_arrow(true)); + let data_type = ListArray::::default_datatype( + T::get_dtype().to_arrow(CompatLevel::newest()), + ); // SAFETY: // offsets are monotonically increasing let arr = 
ListArray::::new( @@ -133,11 +135,13 @@ where }; let array = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), list_values.into(), validity, ); - let data_type = ListArray::::default_datatype(T::get_dtype().to_arrow(true)); + let data_type = ListArray::::default_datatype( + T::get_dtype().to_arrow(CompatLevel::newest()), + ); let arr = ListArray::::new( data_type, Offsets::new_unchecked(offsets).into(), diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index f9b82388ae3a..0515d030a569 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -1122,6 +1122,9 @@ impl DataFrame { /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet). pub fn drop_many_amortized(&self, names: &PlHashSet<&str>) -> DataFrame { + if names.is_empty() { + return self.clone(); + } let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len())); self.columns.iter().for_each(|s| { if !names.contains(&s.name()) { @@ -2379,10 +2382,10 @@ impl DataFrame { /// This responsibility is left to the caller as we don't want to take mutable references here, /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller /// as well. - pub fn iter_chunks(&self, pl_flavor: bool, parallel: bool) -> RecordBatchIter { - // If any of the columns is binview and we don't convert `pl_flavor` we allow parallelism + pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter { + // If any of the columns is binview and we don't convert `compat_level` we allow parallelism // as we must allocate arrow strings/binaries. 
- let parallel = if parallel && !pl_flavor { + let parallel = if parallel && compat_level.0 >= 1 { self.columns.len() > 1 && self .columns @@ -2396,7 +2399,7 @@ impl DataFrame { columns: &self.columns, idx: 0, n_chunks: self.n_chunks(), - pl_flavor, + compat_level, parallel, } } @@ -3015,7 +3018,7 @@ pub struct RecordBatchIter<'a> { columns: &'a Vec, idx: usize, n_chunks: usize, - pl_flavor: bool, + compat_level: CompatLevel, parallel: bool, } @@ -3031,12 +3034,12 @@ impl<'a> Iterator for RecordBatchIter<'a> { let iter = self .columns .par_iter() - .map(|s| s.to_arrow(self.idx, self.pl_flavor)); + .map(|s| s.to_arrow(self.idx, self.compat_level)); POOL.install(|| iter.collect()) } else { self.columns .iter() - .map(|s| s.to_arrow(self.idx, self.pl_flavor)) + .map(|s| s.to_arrow(self.idx, self.compat_level)) .collect() }; self.idx += 1; @@ -3114,7 +3117,7 @@ mod test { "foo" => &[1, 2, 3, 4, 5] ) .unwrap(); - let mut iter = df.iter_chunks(true, false); + let mut iter = df.iter_chunks(CompatLevel::newest(), false); assert_eq!(5, iter.next().unwrap().len()); assert!(iter.next().is_none()); } diff --git a/crates/polars-core/src/frame/row/transpose.rs b/crates/polars-core/src/frame/row/transpose.rs index 0fdc15c9c6f6..7ad4bc4f1fef 100644 --- a/crates/polars-core/src/frame/row/transpose.rs +++ b/crates/polars-core/src/frame/row/transpose.rs @@ -247,7 +247,7 @@ where }; let arr = PrimitiveArray::::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), values.into(), validity, ); diff --git a/crates/polars-core/src/schema.rs b/crates/polars-core/src/schema.rs index 981615499dfb..dfb581135a0b 100644 --- a/crates/polars-core/src/schema.rs +++ b/crates/polars-core/src/schema.rs @@ -370,11 +370,11 @@ impl Schema { } /// Convert self to `ArrowSchema` by cloning the fields - pub fn to_arrow(&self, pl_flavor: bool) -> ArrowSchema { + pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowSchema { let fields: Vec<_> = self .inner .iter() - 
.map(|(name, dtype)| dtype.to_arrow_field(name.as_str(), pl_flavor)) + .map(|(name, dtype)| dtype.to_arrow_field(name.as_str(), compat_level)) .collect(); ArrowSchema::from(fields) } diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index 0fc82c1d6487..515e77b158c2 100644 --- a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; use std::fmt::Formatter; -use serde::de::{MapAccess, Visitor}; +use serde::de::{Error as DeError, MapAccess, Visitor}; +use serde::ser::Error as SerError; use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; #[cfg(feature = "dtype-array")] @@ -79,6 +80,10 @@ impl Serialize for Series { let ca = self.null().unwrap(); ca.serialize(serializer) }, + #[cfg(feature = "object")] + DataType::Object(_, _) => Err(S::Error::custom( + "serializing data of type Object is not supported", + )), dt => { with_match_physical_numeric_polars_type!(dt, |$T| { let ca: &ChunkedArray<$T> = self.as_ref().as_ref().as_ref(); @@ -285,9 +290,9 @@ impl<'de> Deserialize<'de> for Series { let len = values.first().unwrap(); Ok(Series::new_null(&name, *len)) }, - dt => { - panic!("{dt:?} dtype deserialization not yet implemented") - }, + dt => Err(A::Error::custom(format!( + "deserializing data of type {dt} is not supported" + ))), }?; if let Some(f) = bit_settings { diff --git a/crates/polars-core/src/series/from.rs b/crates/polars-core/src/series/from.rs index 9a82ed0e2506..6a4c61cd7f37 100644 --- a/crates/polars-core/src/series/from.rs +++ b/crates/polars-core/src/series/from.rs @@ -102,9 +102,12 @@ impl Series { Float64 => Float64Chunked::from_chunks(name, chunks).into_series(), BinaryOffset => BinaryOffsetChunked::from_chunks(name, chunks).into_series(), #[cfg(feature = "dtype-struct")] - Struct(_) => { - Series::_try_from_arrow_unchecked(name, chunks, &dtype.to_arrow(true)).unwrap() - }, + Struct(_) => Series::_try_from_arrow_unchecked( + 
name, + chunks, + &dtype.to_arrow(CompatLevel::newest()), + ) + .unwrap(), #[cfg(feature = "object")] Object(_, _) => { assert_eq!(chunks.len(), 1); diff --git a/crates/polars-core/src/series/implementations/decimal.rs b/crates/polars-core/src/series/implementations/decimal.rs index 9f5c382d94c7..2437a43fc3f4 100644 --- a/crates/polars-core/src/series/implementations/decimal.rs +++ b/crates/polars-core/src/series/implementations/decimal.rs @@ -43,7 +43,8 @@ impl SeriesWrap { Series::from_chunks_and_dtype_unchecked("", vec![arr.values().clone()], dtype) }; let new_values = s.array_ref(0).clone(); - let data_type = ListArray::::default_datatype(dtype.to_arrow(true)); + let data_type = + ListArray::::default_datatype(dtype.to_arrow(CompatLevel::newest())); let new_arr = ListArray::::new( data_type, arr.offsets().clone(), diff --git a/crates/polars-core/src/series/into.rs b/crates/polars-core/src/series/into.rs index f1fbd6143f0a..c0ac905666cc 100644 --- a/crates/polars-core/src/series/into.rs +++ b/crates/polars-core/src/series/into.rs @@ -19,11 +19,11 @@ impl Series { /// Convert a chunk in the Series to the correct Arrow type. /// This conversion is needed because polars doesn't use a /// 1 on 1 mapping for logical/ categoricals, etc. - pub fn to_arrow(&self, chunk_idx: usize, pl_flavor: bool) -> ArrayRef { + pub fn to_arrow(&self, chunk_idx: usize, compat_level: CompatLevel) -> ArrayRef { match self.dtype() { // make sure that we recursively apply all logical types. #[cfg(feature = "dtype-struct")] - DataType::Struct(_) => self.struct_().unwrap().to_arrow(chunk_idx, pl_flavor), + DataType::Struct(_) => self.struct_().unwrap().to_arrow(chunk_idx, compat_level), // special list branch to // make sure that we recursively apply all logical types. 
DataType::List(inner) => { @@ -45,10 +45,10 @@ impl Series { .unwrap() }; - s.to_arrow(0, pl_flavor) + s.to_arrow(0, compat_level) }; - let data_type = ListArray::::default_datatype(inner.to_arrow(pl_flavor)); + let data_type = ListArray::::default_datatype(inner.to_arrow(compat_level)); let arr = ListArray::::new( data_type, arr.offsets().clone(), @@ -74,30 +74,30 @@ impl Series { ) }; - new.to_arrow(pl_flavor, false) + new.to_arrow(compat_level, false) }, #[cfg(feature = "dtype-date")] DataType::Date => cast( &*self.chunks()[chunk_idx], - &DataType::Date.to_arrow(pl_flavor), + &DataType::Date.to_arrow(compat_level), ) .unwrap(), #[cfg(feature = "dtype-datetime")] DataType::Datetime(_, _) => cast( &*self.chunks()[chunk_idx], - &self.dtype().to_arrow(pl_flavor), + &self.dtype().to_arrow(compat_level), ) .unwrap(), #[cfg(feature = "dtype-duration")] DataType::Duration(_) => cast( &*self.chunks()[chunk_idx], - &self.dtype().to_arrow(pl_flavor), + &self.dtype().to_arrow(compat_level), ) .unwrap(), #[cfg(feature = "dtype-time")] DataType::Time => cast( &*self.chunks()[chunk_idx], - &DataType::Time.to_arrow(pl_flavor), + &DataType::Time.to_arrow(compat_level), ) .unwrap(), #[cfg(feature = "object")] @@ -117,7 +117,7 @@ impl Series { } }, DataType::String => { - if pl_flavor { + if compat_level.0 >= 1 { self.array_ref(chunk_idx).clone() } else { let arr = self.array_ref(chunk_idx); @@ -125,7 +125,7 @@ impl Series { } }, DataType::Binary => { - if pl_flavor { + if compat_level.0 >= 1 { self.array_ref(chunk_idx).clone() } else { let arr = self.array_ref(chunk_idx); diff --git a/crates/polars-core/src/series/mod.rs b/crates/polars-core/src/series/mod.rs index 567f7e66f9d7..c508970faeae 100644 --- a/crates/polars-core/src/series/mod.rs +++ b/crates/polars-core/src/series/mod.rs @@ -592,8 +592,16 @@ impl Series { pub fn to_physical_repr(&self) -> Cow { use DataType::*; match self.dtype() { - Date => Cow::Owned(self.cast(&Int32).unwrap()), - Datetime(_, _) | Duration(_) | Time 
=> Cow::Owned(self.cast(&Int64).unwrap()), + // NOTE: Don't use cast here, as it might rechunk (if all nulls) + // which is not allowed in a phys repr. + #[cfg(feature = "dtype-date")] + Date => Cow::Owned(self.date().unwrap().0.clone().into_series()), + #[cfg(feature = "dtype-datetime")] + Datetime(_, _) => Cow::Owned(self.datetime().unwrap().0.clone().into_series()), + #[cfg(feature = "dtype-duration")] + Duration(_) => Cow::Owned(self.duration().unwrap().0.clone().into_series()), + #[cfg(feature = "dtype-time")] + Time => Cow::Owned(self.time().unwrap().0.clone().into_series()), #[cfg(feature = "dtype-categorical")] Categorical(_, _) | Enum(_, _) => { let ca = self.categorical().unwrap(); @@ -906,7 +914,9 @@ impl Series { let offsets = (0i64..(s.len() as i64 + 1)).collect::>(); let offsets = unsafe { Offsets::new_unchecked(offsets) }; - let data_type = LargeListArray::default_datatype(s.dtype().to_physical().to_arrow(true)); + let data_type = LargeListArray::default_datatype( + s.dtype().to_physical().to_arrow(CompatLevel::newest()), + ); let new_arr = LargeListArray::new(data_type, offsets.into(), values, None); let mut out = ListChunked::with_chunk(s.name(), new_arr); out.set_inner_dtype(s.dtype().clone()); diff --git a/crates/polars-core/src/series/ops/reshape.rs b/crates/polars-core/src/series/ops/reshape.rs index 87e4a643d8e5..ca824f6a4104 100644 --- a/crates/polars-core/src/series/ops/reshape.rs +++ b/crates/polars-core/src/series/ops/reshape.rs @@ -156,8 +156,12 @@ impl Series { while let Some(dim) = dims.pop_back() { prev_dtype = DataType::Array(Box::new(prev_dtype), dim as usize); - prev_array = - FixedSizeListArray::new(prev_dtype.to_arrow(true), prev_array, None).boxed(); + prev_array = FixedSizeListArray::new( + prev_dtype.to_arrow(CompatLevel::newest()), + prev_array, + None, + ) + .boxed(); } Ok(unsafe { Series::from_chunks_and_dtype_unchecked( diff --git a/crates/polars-core/src/utils/supertype.rs b/crates/polars-core/src/utils/supertype.rs index 
54a07b46b415..2068f228411b 100644 --- a/crates/polars-core/src/utils/supertype.rs +++ b/crates/polars-core/src/utils/supertype.rs @@ -276,11 +276,11 @@ pub fn get_supertype_with_options( }, (dt, Unknown(kind)) => { match kind { - // numeric vs float|str -> always float|str - UnknownKind::Float | UnknownKind::Int(_) if dt.is_float() | dt.is_string() => Some(dt.clone()), + // numeric vs float|str -> always float|str|decimal + UnknownKind::Float | UnknownKind::Int(_) if dt.is_float() | dt.is_string() | dt.is_decimal() => Some(dt.clone()), UnknownKind::Float if dt.is_integer() => Some(Unknown(UnknownKind::Float)), - // Materialize float - UnknownKind::Float if dt.is_float() => Some(dt.clone()), + // Materialize float to float or decimal + UnknownKind::Float if dt.is_float() | dt.is_decimal() => Some(dt.clone()), // Materialize str UnknownKind::Str if dt.is_string() | dt.is_enum() => Some(dt.clone()), // Materialize str diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index e5d455655e89..0d4634d6eeaf 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -14,6 +14,7 @@ pub struct BinaryExpr { right: Arc, expr: Expr, has_literal: bool, + allow_threading: bool, } impl BinaryExpr { @@ -23,6 +24,7 @@ impl BinaryExpr { right: Arc, expr: Expr, has_literal: bool, + allow_threading: bool, ) -> Self { Self { left, @@ -30,6 +32,7 @@ impl BinaryExpr { right, expr, has_literal, + allow_threading, } } } @@ -175,21 +178,13 @@ impl PhysicalExpr for BinaryExpr { // they also saturate the thread pool by themselves, so that's fine. let has_window = state.has_window(); - // Streaming takes care of parallelism, don't parallelize here, as it - // increases contention. 
- #[cfg(feature = "streaming")] - let in_streaming = state.in_streaming_engine(); - - #[cfg(not(feature = "streaming"))] - let in_streaming = false; - let (lhs, rhs); if has_window { let mut state = state.split(); state.remove_cache_window_flag(); lhs = self.left.evaluate(df, &state)?; rhs = self.right.evaluate(df, &state)?; - } else if in_streaming || self.has_literal { + } else if !self.allow_threading || self.has_literal { // Literals are free, don't pay par cost. lhs = self.left.evaluate(df, state)?; rhs = self.right.evaluate(df, state)?; diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index 97f25abc4196..5a71230a9d09 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -821,7 +821,7 @@ where unsafe { values.set_len(len) } let validity = Bitmap::from(validity); let arr = PrimitiveArray::new( - T::get_dtype().to_physical().to_arrow(true), + T::get_dtype().to_physical().to_arrow(CompatLevel::newest()), values.into(), Some(validity), ); diff --git a/crates/polars-expr/src/planner.rs b/crates/polars-expr/src/planner.rs index cf716477d20b..85968c74e77c 100644 --- a/crates/polars-expr/src/planner.rs +++ b/crates/polars-expr/src/planner.rs @@ -321,6 +321,7 @@ fn create_physical_expr_inner( rhs, node_to_expr(expression, expr_arena), state.local.has_lit, + state.allow_threading, ))) }, Column(column) => Ok(Arc::new(ColumnExpr::new( @@ -516,7 +517,7 @@ fn create_physical_expr_inner( function.clone(), node_to_expr(expression, expr_arena), *options, - true, + state.allow_threading, schema.cloned(), output_dtype, ))) @@ -554,7 +555,7 @@ fn create_physical_expr_inner( function.clone().into(), node_to_expr(expression, expr_arena), *options, - true, + state.allow_threading, schema.cloned(), output_dtype, ))) diff --git a/crates/polars-expr/src/state/execution_state.rs b/crates/polars-expr/src/state/execution_state.rs index 90798ef6b8ee..07c571e26653 100644 --- 
a/crates/polars-expr/src/state/execution_state.rs +++ b/crates/polars-expr/src/state/execution_state.rs @@ -229,20 +229,6 @@ impl ExecutionState { flags }); } - - #[cfg(feature = "streaming")] - pub fn set_in_streaming_engine(&mut self) { - self.set_flags(&|mut flags| { - flags.insert(StateFlags::IN_STREAMING); - flags - }); - } - - #[cfg(feature = "streaming")] - pub fn in_streaming_engine(&self) -> bool { - let flags: StateFlags = self.flags.load(Ordering::Relaxed).into(); - flags.contains(StateFlags::IN_STREAMING) - } } impl Default for ExecutionState { diff --git a/crates/polars-ffi/src/version_0.rs b/crates/polars-ffi/src/version_0.rs index 43fec994c4d4..eb24542f0733 100644 --- a/crates/polars-ffi/src/version_0.rs +++ b/crates/polars-ffi/src/version_0.rs @@ -1,3 +1,5 @@ +use polars_core::prelude::CompatLevel; + use super::*; /// An FFI exported `Series`. @@ -52,13 +54,13 @@ unsafe extern "C" fn c_release_series_export(e: *mut SeriesExport) { } pub fn export_series(s: &Series) -> SeriesExport { - let field = ArrowField::new(s.name(), s.dtype().to_arrow(true), true); + let field = ArrowField::new(s.name(), s.dtype().to_arrow(CompatLevel::newest()), true); let schema = Box::new(ffi::export_field_to_c(&field)); let mut arrays = (0..s.chunks().len()) .map(|i| { // Make sure we export the logical type. 
- let arr = s.to_arrow(i, true); + let arr = s.to_arrow(i, CompatLevel::newest()); Box::into_raw(Box::new(ffi::export_array_to_c(arr.clone()))) }) .collect::>(); diff --git a/crates/polars-io/src/avro/write.rs b/crates/polars-io/src/avro/write.rs index b12cd358da16..2954de97d964 100644 --- a/crates/polars-io/src/avro/write.rs +++ b/crates/polars-io/src/avro/write.rs @@ -64,12 +64,12 @@ where } fn finish(&mut self, df: &mut DataFrame) -> PolarsResult<()> { - let schema = schema_to_arrow_checked(&df.schema(), false, "avro")?; + let schema = schema_to_arrow_checked(&df.schema(), CompatLevel::oldest(), "avro")?; let record = write::to_record(&schema, self.name.clone())?; let mut data = vec![]; let mut compressed_block = avro_schema::file::CompressedBlock::default(); - for chunk in df.iter_chunks(false, true) { + for chunk in df.iter_chunks(CompatLevel::oldest(), true) { let mut serializers = chunk .iter() .zip(record.fields.iter()) diff --git a/crates/polars-io/src/cloud/glob.rs b/crates/polars-io/src/cloud/glob.rs index b61ac58baa0a..dc202c5459d3 100644 --- a/crates/polars-io/src/cloud/glob.rs +++ b/crates/polars-io/src/cloud/glob.rs @@ -1,9 +1,8 @@ -use futures::future::ready; -use futures::{StreamExt, TryStreamExt}; +use futures::TryStreamExt; use object_store::path::Path; use polars_core::error::to_compute_err; use polars_core::prelude::{polars_ensure, polars_err}; -use polars_error::{PolarsError, PolarsResult}; +use polars_error::PolarsResult; use regex::Regex; use url::Url; @@ -177,7 +176,7 @@ pub async fn glob(url: &str, cloud_options: Option<&CloudOptions>) -> PolarsResu }, store, ) = super::build_object_store(url, cloud_options).await?; - let matcher = Matcher::new( + let matcher = &Matcher::new( if scheme == "file" { // For local paths the returned location has the leading slash stripped. 
prefix[1..].to_string() @@ -187,14 +186,16 @@ pub async fn glob(url: &str, cloud_options: Option<&CloudOptions>) -> PolarsResu expansion.as_deref(), )?; - let list_stream = store + let mut locations = store .list(Some(&Path::from(prefix))) - .map_err(to_compute_err); - let mut locations: Vec = list_stream - .then(|entry| async { Ok::<_, PolarsError>(entry.map_err(to_compute_err)?.location) }) - .filter(|name| ready(name.as_ref().map_or(true, |name| matcher.is_matching(name)))) - .try_collect() - .await?; + .try_filter_map(|x| async move { + let out = (x.size > 0 && matcher.is_matching(&x.location)).then_some(x.location); + Ok(out) + }) + .try_collect::>() + .await + .map_err(to_compute_err)?; + locations.sort_unstable(); Ok(locations .into_iter() diff --git a/crates/polars-io/src/cloud/object_store_setup.rs b/crates/polars-io/src/cloud/object_store_setup.rs index 4d43d8184d62..7b18640db617 100644 --- a/crates/polars-io/src/cloud/object_store_setup.rs +++ b/crates/polars-io/src/cloud/object_store_setup.rs @@ -40,17 +40,9 @@ fn url_and_creds_to_key(url: &Url, options: Option<&CloudOptions>) -> String { ) } -/// Simply construct an object_store `Path` struct from a string. -pub fn object_path_from_string(path: String) -> object_store::path::Path { - // We transmute because they don't expose a way to just create it from a string - // without encoding or decoding it. If one day we can't use this transmute hack - // anymore then we'll just have to `Path::from_url_path(percent_encode(path))` - { - const _: [(); std::mem::align_of::()] = - [(); std::mem::align_of::()]; - }; - - unsafe { std::mem::transmute::(path) } +/// Construct an object_store `Path` from a string without any encoding/decoding. +pub fn object_path_from_string(path: String) -> PolarsResult { + object_store::path::Path::parse(&path).map_err(to_compute_err) } /// Build an [`ObjectStore`] based on the URL and passed in url. Return the cloud location and an implementation of the object store. 
@@ -147,7 +139,7 @@ mod test { use super::object_path_from_string; let path = "%25"; - let out = object_path_from_string(path.to_string()); + let out = object_path_from_string(path.to_string()).unwrap(); assert_eq!(out.as_ref(), path); } diff --git a/crates/polars-io/src/file_cache/utils.rs b/crates/polars-io/src/file_cache/utils.rs index 6262cfe772aa..b239c2792e54 100644 --- a/crates/polars-io/src/file_cache/utils.rs +++ b/crates/polars-io/src/file_cache/utils.rs @@ -85,7 +85,7 @@ pub fn init_entries_from_uri_list]>>( let cloud_path = { assert!(expansion.is_none(), "path should not contain wildcards"); - object_path_from_string(prefix) + object_path_from_string(prefix)? }; let object_store = object_store.clone(); diff --git a/crates/polars-io/src/hive.rs b/crates/polars-io/src/hive.rs new file mode 100644 index 000000000000..ddf1d8973b3e --- /dev/null +++ b/crates/polars-io/src/hive.rs @@ -0,0 +1,42 @@ +use polars_core::frame::DataFrame; +use polars_core::schema::IndexOfSchema; +use polars_core::series::Series; + +/// Materializes hive partitions. +/// We have a special num_rows arg, as df can be empty when a projection contains +/// only hive partition columns. +/// +/// # Safety +/// +/// num_rows equals the height of the df when the df height is non-zero. +pub(crate) fn materialize_hive_partitions( + df: &mut DataFrame, + reader_schema: &S, + hive_partition_columns: Option<&[Series]>, + num_rows: usize, +) { + if let Some(hive_columns) = hive_partition_columns { + let Some(first) = hive_columns.first() else { + return; + }; + + if reader_schema.index_of(first.name()).is_some() { + // Insert these hive columns in the order they are stored in the file. 
+ for s in hive_columns { + let i = match df.get_columns().binary_search_by_key( + &reader_schema.index_of(s.name()).unwrap_or(usize::MAX), + |s| reader_schema.index_of(s.name()).unwrap_or(usize::MIN), + ) { + Ok(i) => i, + Err(i) => i, + }; + + df.insert_column(i, s.new_from_index(0, num_rows)).unwrap(); + } + } else { + for s in hive_columns { + unsafe { df.with_column_unchecked(s.new_from_index(0, num_rows)) }; + } + } + } +} diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index 198e75ab3afe..12cbf9dddf73 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -36,12 +36,13 @@ use std::io::{Read, Seek}; use std::path::PathBuf; use arrow::datatypes::ArrowSchemaRef; -use arrow::io::ipc::read; +use arrow::io::ipc::read::{self, get_row_count}; use arrow::record_batch::RecordBatch; use polars_core::prelude::*; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +use crate::hive::materialize_hive_partitions; use crate::mmap::MmapBytesReader; use crate::predicates::PhysicalIoExpr; use crate::prelude::*; @@ -79,6 +80,7 @@ pub struct IpcReader { pub(super) n_rows: Option, pub(super) projection: Option>, pub(crate) columns: Option>, + hive_partition_columns: Option>, pub(super) row_index: Option, // Stores the as key semaphore to make sure we don't write to the memory mapped file. pub(super) memory_map: Option, @@ -126,6 +128,11 @@ impl IpcReader { self } + pub fn with_hive_partition_columns(mut self, columns: Option>) -> Self { + self.hive_partition_columns = columns; + self + } + /// Add a row index column. 
pub fn with_row_index(mut self, row_index: Option) -> Self { self.row_index = row_index; @@ -200,6 +207,7 @@ impl SerReader for IpcReader { rechunk: true, n_rows: None, columns: None, + hive_partition_columns: None, projection: None, row_index: None, memory_map: None, @@ -214,29 +222,71 @@ impl SerReader for IpcReader { } fn finish(mut self) -> PolarsResult { - if self.memory_map.is_some() && self.reader.to_file().is_some() { - match self.finish_memmapped(None) { - Ok(df) => return Ok(df), - Err(err) => check_mmap_err(err)?, + let reader_schema = if let Some(ref schema) = self.schema { + schema.clone() + } else { + self.get_metadata()?.schema.clone() + }; + let reader_schema = reader_schema.as_ref(); + + let hive_partition_columns = self.hive_partition_columns.take(); + + // In case only hive columns are projected, the df would be empty, but we need the row count + // of the file in order to project the correct number of rows for the hive columns. + let (mut df, row_count) = (|| { + if self + .projection + .as_ref() + .map(|x| x.is_empty()) + .unwrap_or(false) + { + return PolarsResult::Ok(( + Default::default(), + get_row_count(&mut self.reader)? 
as usize, + )); } - } - let rechunk = self.rechunk; - let metadata = read::read_file_metadata(&mut self.reader)?; - let schema = &metadata.schema; - if let Some(columns) = &self.columns { - let prj = columns_to_projection(columns, schema)?; - self.projection = Some(prj); - } + if self.memory_map.is_some() && self.reader.to_file().is_some() { + match self.finish_memmapped(None) { + Ok(df) => { + let n = df.height(); + return Ok((df, n)); + }, + Err(err) => check_mmap_err(err)?, + } + } + let rechunk = self.rechunk; + let schema = self.get_metadata()?.schema.clone(); - let schema = if let Some(projection) = &self.projection { - Arc::new(apply_projection(&metadata.schema, projection)) - } else { - metadata.schema.clone() + if let Some(columns) = &self.columns { + let prj = columns_to_projection(columns, schema.as_ref())?; + self.projection = Some(prj); + } + + let schema = if let Some(projection) = &self.projection { + Arc::new(apply_projection(schema.as_ref(), projection)) + } else { + schema + }; + + let metadata = self.get_metadata()?.clone(); + + let ipc_reader = + read::FileReader::new(self.reader, metadata, self.projection, self.n_rows); + let df = finish_reader(ipc_reader, rechunk, None, None, &schema, self.row_index)?; + let n = df.height(); + Ok((df, n)) + })()?; + + if let Some(hive_cols) = hive_partition_columns { + materialize_hive_partitions( + &mut df, + reader_schema, + Some(hive_cols.as_slice()), + row_count, + ); }; - let ipc_reader = - read::FileReader::new(self.reader, metadata.clone(), self.projection, self.n_rows); - finish_reader(ipc_reader, rechunk, None, None, &schema, self.row_index) + Ok(df) } } diff --git a/crates/polars-io/src/ipc/ipc_reader_async.rs b/crates/polars-io/src/ipc/ipc_reader_async.rs index 2a78f4bfe3b6..4501898b50ad 100644 --- a/crates/polars-io/src/ipc/ipc_reader_async.rs +++ b/crates/polars-io/src/ipc/ipc_reader_async.rs @@ -78,7 +78,7 @@ impl IpcReaderAsync { // Any wildcards should already have been resolved here. 
Without this assertion they would // be ignored. debug_assert!(expansion.is_none(), "path should not contain wildcards"); - object_path_from_string(prefix) + object_path_from_string(prefix)? }; Ok(Self { diff --git a/crates/polars-io/src/ipc/ipc_stream.rs b/crates/polars-io/src/ipc/ipc_stream.rs index ed0872dc1ea0..35bd22dab5c3 100644 --- a/crates/polars-io/src/ipc/ipc_stream.rs +++ b/crates/polars-io/src/ipc/ipc_stream.rs @@ -207,7 +207,7 @@ where pub struct IpcStreamWriter { writer: W, compression: Option, - pl_flavor: bool, + compat_level: CompatLevel, } use arrow::record_batch::RecordBatch; @@ -221,8 +221,8 @@ impl IpcStreamWriter { self } - pub fn with_pl_flavor(mut self, pl_flavor: bool) -> Self { - self.pl_flavor = pl_flavor; + pub fn with_compat_level(mut self, compat_level: CompatLevel) -> Self { + self.compat_level = compat_level; self } } @@ -235,7 +235,7 @@ where IpcStreamWriter { writer, compression: None, - pl_flavor: false, + compat_level: CompatLevel::oldest(), } } @@ -247,9 +247,9 @@ where }, ); - ipc_stream_writer.start(&df.schema().to_arrow(self.pl_flavor), None)?; + ipc_stream_writer.start(&df.schema().to_arrow(self.compat_level), None)?; let df = chunk_df_for_writing(df, 512 * 512)?; - let iter = df.iter_chunks(self.pl_flavor, true); + let iter = df.iter_chunks(self.compat_level, true); for batch in iter { ipc_stream_writer.write(&batch, None)? diff --git a/crates/polars-io/src/ipc/write.rs b/crates/polars-io/src/ipc/write.rs index b2c08846dbbc..b187ff8edc07 100644 --- a/crates/polars-io/src/ipc/write.rs +++ b/crates/polars-io/src/ipc/write.rs @@ -42,7 +42,7 @@ pub struct IpcWriter { pub(super) writer: W, pub(super) compression: Option, /// Polars' flavor of arrow. This might be temporary. 
- pub(super) pl_flavor: bool, + pub(super) compat_level: CompatLevel, } impl IpcWriter { @@ -52,13 +52,13 @@ impl IpcWriter { self } - pub fn with_pl_flavor(mut self, pl_flavor: bool) -> Self { - self.pl_flavor = pl_flavor; + pub fn with_compat_level(mut self, compat_level: CompatLevel) -> Self { + self.compat_level = compat_level; self } pub fn batched(self, schema: &Schema) -> PolarsResult> { - let schema = schema_to_arrow_checked(schema, self.pl_flavor, "ipc")?; + let schema = schema_to_arrow_checked(schema, self.compat_level, "ipc")?; let mut writer = write::FileWriter::new( self.writer, Arc::new(schema), @@ -71,7 +71,7 @@ impl IpcWriter { Ok(BatchedWriter { writer, - pl_flavor: self.pl_flavor, + compat_level: self.compat_level, }) } } @@ -84,12 +84,12 @@ where IpcWriter { writer, compression: None, - pl_flavor: true, + compat_level: CompatLevel::newest(), } } fn finish(&mut self, df: &mut DataFrame) -> PolarsResult<()> { - let schema = schema_to_arrow_checked(&df.schema(), self.pl_flavor, "ipc")?; + let schema = schema_to_arrow_checked(&df.schema(), self.compat_level, "ipc")?; let mut ipc_writer = write::FileWriter::try_new( &mut self.writer, Arc::new(schema), @@ -99,7 +99,7 @@ where }, )?; df.align_chunks(); - let iter = df.iter_chunks(self.pl_flavor, true); + let iter = df.iter_chunks(self.compat_level, true); for batch in iter { ipc_writer.write(&batch, None)? @@ -111,7 +111,7 @@ where pub struct BatchedWriter { writer: write::FileWriter, - pl_flavor: bool, + compat_level: CompatLevel, } impl BatchedWriter { @@ -120,7 +120,7 @@ impl BatchedWriter { /// # Panics /// The caller must ensure the chunks in the given [`DataFrame`] are aligned. pub fn write_batch(&mut self, df: &DataFrame) -> PolarsResult<()> { - let iter = df.iter_chunks(self.pl_flavor, true); + let iter = df.iter_chunks(self.compat_level, true); for batch in iter { self.writer.write(&batch, None)? 
} diff --git a/crates/polars-io/src/ipc/write_async.rs b/crates/polars-io/src/ipc/write_async.rs index 7a5b3240cbb5..5ed459a715d2 100644 --- a/crates/polars-io/src/ipc/write_async.rs +++ b/crates/polars-io/src/ipc/write_async.rs @@ -10,14 +10,14 @@ impl IpcWriter { IpcWriter { writer, compression: None, - pl_flavor: false, + compat_level: CompatLevel::oldest(), } } pub fn batched_async(self, schema: &Schema) -> PolarsResult> { let writer = FileSink::new( self.writer, - schema.to_arrow(false), + schema.to_arrow(CompatLevel::oldest()), None, WriteOptions { compression: self.compression.map(|c| c.into()), @@ -44,7 +44,7 @@ where /// # Panics /// The caller must ensure the chunks in the given [`DataFrame`] are aligned. pub async fn write_batch(&mut self, df: &DataFrame) -> PolarsResult<()> { - let iter = df.iter_chunks(false, true); + let iter = df.iter_chunks(CompatLevel::oldest(), true); for batch in iter { self.writer.feed(batch.into()).await?; } diff --git a/crates/polars-io/src/json/mod.rs b/crates/polars-io/src/json/mod.rs index c51e238da2f9..99dbd53ffa5d 100644 --- a/crates/polars-io/src/json/mod.rs +++ b/crates/polars-io/src/json/mod.rs @@ -146,11 +146,11 @@ where .map(|s| { #[cfg(feature = "object")] polars_ensure!(!matches!(s.dtype(), DataType::Object(_, _)), ComputeError: "cannot write 'Object' datatype to json"); - Ok(s.field().to_arrow(true)) + Ok(s.field().to_arrow(CompatLevel::newest())) }) .collect::>>()?; let batches = df - .iter_chunks(true, false) + .iter_chunks(CompatLevel::newest(), false) .map(|chunk| Ok(Box::new(chunk_to_struct(chunk, fields.clone())) as ArrayRef)); match self.json_format { @@ -191,10 +191,10 @@ where .map(|s| { #[cfg(feature = "object")] polars_ensure!(!matches!(s.dtype(), DataType::Object(_, _)), ComputeError: "cannot write 'Object' datatype to json"); - Ok(s.field().to_arrow(true)) + Ok(s.field().to_arrow(CompatLevel::newest())) }) .collect::>>()?; - let chunks = df.iter_chunks(true, false); + let chunks = 
df.iter_chunks(CompatLevel::newest(), false); let batches = chunks.map(|chunk| Ok(Box::new(chunk_to_struct(chunk, fields.clone())) as ArrayRef)); let mut serializer = polars_json::ndjson::write::Serializer::new(batches, vec![]); @@ -267,7 +267,7 @@ where overwrite_schema(mut_schema, overwrite)?; } - DataType::Struct(schema.iter_fields().collect()).to_arrow(true) + DataType::Struct(schema.iter_fields().collect()).to_arrow(CompatLevel::newest()) } else { // infer let inner_dtype = if let BorrowedValue::Array(values) = &json_value { @@ -276,7 +276,7 @@ where self.infer_schema_len .unwrap_or(NonZeroUsize::new(usize::MAX).unwrap()), )? - .to_arrow(true) + .to_arrow(CompatLevel::newest()) } else { polars_json::json::infer(&json_value)? }; @@ -295,7 +295,7 @@ where .map(|(name, dt)| Field::new(&name, dt)) .collect(), ) - .to_arrow(true) + .to_arrow(CompatLevel::newest()) } else { inner_dtype } diff --git a/crates/polars-io/src/lib.rs b/crates/polars-io/src/lib.rs index d5fc527b822b..f32fa4b1408f 100644 --- a/crates/polars-io/src/lib.rs +++ b/crates/polars-io/src/lib.rs @@ -19,7 +19,7 @@ pub mod ndjson; mod options; #[cfg(feature = "parquet")] pub mod parquet; -#[cfg(feature = "partition")] +#[cfg(feature = "parquet")] pub mod partition; #[cfg(feature = "async")] pub mod pl_async; @@ -32,3 +32,5 @@ pub mod utils; pub use cloud::glob as async_glob; pub use options::*; pub use shared::*; + +pub mod hive; diff --git a/crates/polars-io/src/ndjson/mod.rs b/crates/polars-io/src/ndjson/mod.rs index 8d6ad6c2d680..4ec6ffa7a1da 100644 --- a/crates/polars-io/src/ndjson/mod.rs +++ b/crates/polars-io/src/ndjson/mod.rs @@ -13,7 +13,7 @@ pub fn infer_schema( let data_types = polars_json::ndjson::iter_unique_dtypes(reader, infer_schema_len)?; let data_type = crate::json::infer::data_types_to_supertype(data_types.map(|dt| DataType::from(&dt)))?; - let schema = StructArray::get_fields(&data_type.to_arrow(true)) + let schema = 
StructArray::get_fields(&data_type.to_arrow(CompatLevel::newest())) .iter() .collect(); Ok(schema) diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs index e53cd8922d71..5ec9632871d5 100644 --- a/crates/polars-io/src/parquet/read/async_impl.rs +++ b/crates/polars-io/src/parquet/read/async_impl.rs @@ -49,7 +49,7 @@ impl ParquetObjectStore { // Any wildcards should already have been resolved here. Without this assertion they would // be ignored. debug_assert!(expansion.is_none(), "path should not contain wildcards"); - let path = object_path_from_string(prefix); + let path = object_path_from_string(prefix)?; Ok(ParquetObjectStore { store: PolarsObjectStore::new(store), diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index cd869b96638c..f272daaedc63 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -17,6 +17,7 @@ use super::predicates::read_this_row_group; use super::to_metadata::ToMetadata; use super::utils::materialize_empty_df; use super::{mmap, ParallelStrategy}; +use crate::hive::materialize_hive_partitions; use crate::mmap::{MmapBytesReader, ReaderBytes}; use crate::parquet::metadata::FileMetaDataRef; use crate::predicates::{apply_predicate, PhysicalIoExpr}; @@ -149,45 +150,6 @@ pub(super) fn array_iter_to_series( } } -/// Materializes hive partitions. -/// We have a special num_rows arg, as df can be empty when a projection contains -/// only hive partition columns. -/// -/// # Safety -/// -/// num_rows equals the height of the df when the df height is non-zero. 
-pub(crate) fn materialize_hive_partitions( - df: &mut DataFrame, - reader_schema: &ArrowSchema, - hive_partition_columns: Option<&[Series]>, - num_rows: usize, -) { - if let Some(hive_columns) = hive_partition_columns { - let Some(first) = hive_columns.first() else { - return; - }; - - if reader_schema.index_of(first.name()).is_some() { - // Insert these hive columns in the order they are stored in the file. - for s in hive_columns { - let i = match df.get_columns().binary_search_by_key( - &reader_schema.index_of(s.name()).unwrap_or(usize::MAX), - |s| reader_schema.index_of(s.name()).unwrap_or(usize::MIN), - ) { - Ok(i) => i, - Err(i) => i, - }; - - df.insert_column(i, s.new_from_index(0, num_rows)).unwrap(); - } - } else { - for s in hive_columns { - unsafe { df.with_column_unchecked(s.new_from_index(0, num_rows)) }; - } - } - } -} - #[allow(clippy::too_many_arguments)] fn rg_to_dfs( store: &mmap::ColumnStore, diff --git a/crates/polars-io/src/parquet/read/utils.rs b/crates/polars-io/src/parquet/read/utils.rs index 78a7c4f5b50e..bb476a5fad08 100644 --- a/crates/polars-io/src/parquet/read/utils.rs +++ b/crates/polars-io/src/parquet/read/utils.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use polars_core::prelude::{ArrowSchema, DataFrame, Series, IDX_DTYPE}; -use super::read_impl::materialize_hive_partitions; +use crate::hive::materialize_hive_partitions; use crate::utils::apply_projection; use crate::RowIndex; diff --git a/crates/polars-io/src/parquet/write/batched_writer.rs b/crates/polars-io/src/parquet/write/batched_writer.rs index 5f363e0eb1a3..86b95bc36f85 100644 --- a/crates/polars-io/src/parquet/write/batched_writer.rs +++ b/crates/polars-io/src/parquet/write/batched_writer.rs @@ -27,7 +27,7 @@ impl BatchedWriter { &'a self, df: &'a DataFrame, ) -> impl Iterator>> + 'a { - let rb_iter = df.iter_chunks(true, false); + let rb_iter = df.iter_chunks(CompatLevel::newest(), false); rb_iter.filter_map(move |batch| match batch.len() { 0 => None, _ => { @@ -95,7 +95,7 @@ 
fn prepare_rg_iter<'a>( options: WriteOptions, parallel: bool, ) -> impl Iterator>> + 'a { - let rb_iter = df.iter_chunks(true, false); + let rb_iter = df.iter_chunks(CompatLevel::newest(), false); rb_iter.filter_map(move |batch| match batch.len() { 0 => None, _ => { diff --git a/crates/polars-io/src/parquet/write/options.rs b/crates/polars-io/src/parquet/write/options.rs index d65a32552233..4e4bfa9e1edf 100644 --- a/crates/polars-io/src/parquet/write/options.rs +++ b/crates/polars-io/src/parquet/write/options.rs @@ -16,7 +16,7 @@ pub struct ParquetWriteOptions { /// If `None` will be all written to a single row group. pub row_group_size: Option, /// if `None` will be 1024^2 bytes - pub data_pagesize_limit: Option, + pub data_page_size: Option, /// maintain the order the data was processed pub maintain_order: bool, } diff --git a/crates/polars-io/src/parquet/write/writer.rs b/crates/polars-io/src/parquet/write/writer.rs index cd03a77cfed8..32b104306aa6 100644 --- a/crates/polars-io/src/parquet/write/writer.rs +++ b/crates/polars-io/src/parquet/write/writer.rs @@ -10,9 +10,23 @@ use polars_parquet::write::{ use super::batched_writer::BatchedWriter; use super::options::ParquetCompression; +use super::ParquetWriteOptions; use crate::prelude::chunk_df_for_writing; use crate::shared::schema_to_arrow_checked; +impl ParquetWriteOptions { + pub fn to_writer(&self, f: F) -> ParquetWriter + where + F: Write, + { + ParquetWriter::new(f) + .with_compression(self.compression) + .with_statistics(self.statistics) + .with_row_group_size(self.row_group_size) + .with_data_page_size(self.data_page_size) + } +} + /// Write a DataFrame to Parquet format. 
#[must_use] pub struct ParquetWriter { @@ -83,7 +97,7 @@ where } pub fn batched(self, schema: &Schema) -> PolarsResult> { - let schema = schema_to_arrow_checked(schema, true, "parquet")?; + let schema = schema_to_arrow_checked(schema, CompatLevel::newest(), "parquet")?; let parquet_schema = to_parquet_schema(&schema)?; let encodings = get_encodings(&schema); let options = self.materialize_options(); @@ -103,7 +117,7 @@ where statistics: self.statistics, compression: self.compression, version: Version::V1, - data_pagesize_limit: self.data_page_size, + data_page_size: self.data_page_size, } } diff --git a/crates/polars-io/src/partition.rs b/crates/polars-io/src/partition.rs index b25f14189817..cdc1768226ea 100644 --- a/crates/polars-io/src/partition.rs +++ b/crates/polars-io/src/partition.rs @@ -9,6 +9,7 @@ use polars_core::series::IsSorted; use polars_core::POOL; use rayon::prelude::*; +use crate::parquet::write::ParquetWriteOptions; use crate::utils::resolve_homedir; use crate::WriterFactory; @@ -127,3 +128,111 @@ where } path } + +pub fn write_partitioned_dataset( + df: &DataFrame, + path: &Path, + partition_by: &[S], + file_write_options: &ParquetWriteOptions, + chunk_size: usize, +) -> PolarsResult<()> +where + S: AsRef, +{ + let base_path = path; + + for (path_part, part_df) in get_hive_partitions_iter(df, partition_by)? { + let dir = base_path.join(path_part); + std::fs::create_dir_all(&dir)?; + + let n_files = (part_df.estimated_size() / chunk_size).clamp(1, 0xf_ffff_ffff_ffff); + let rows_per_file = (df.height() / n_files).saturating_add(1); + + fn get_path_for_index(i: usize) -> String { + // Use a fixed-width file name so that it sorts properly. 
+ format!("{:013x}.parquet", i) + } + + for (i, slice_start) in (0..part_df.height()).step_by(rows_per_file).enumerate() { + let f = std::fs::File::create(dir.join(get_path_for_index(i)))?; + + file_write_options + .to_writer(f) + .finish(&mut part_df.slice(slice_start as i64, rows_per_file))?; + } + } + + Ok(()) +} + +/// Creates an iterator of (hive partition path, DataFrame) pairs, e.g.: +/// ("a=1/b=1", DataFrame) +fn get_hive_partitions_iter<'a, S>( + df: &'a DataFrame, + partition_by: &'a [S], +) -> PolarsResult + 'a>> +where + S: AsRef, +{ + let schema = df.schema(); + + let partition_by_col_idx = partition_by + .iter() + .map(|x| { + let Some(i) = schema.index_of(x.as_ref()) else { + polars_bail!(ColumnNotFound: "{}", x.as_ref()) + }; + Ok(i) + }) + .collect::>>()?; + + let get_hive_path_part = move |df: &DataFrame| { + const CHAR_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS + .add(b'/') + .add(b'=') + .add(b':') + .add(b' '); + + let cols = df.get_columns(); + + partition_by_col_idx + .iter() + .map(|&i| { + let s = &cols[i].slice(0, 1).cast(&DataType::String).unwrap(); + + format!( + "{}={}", + s.name(), + percent_encoding::percent_encode( + s.str() + .unwrap() + .get(0) + .unwrap_or("__HIVE_DEFAULT_PARTITION__") + .as_bytes(), + CHAR_SET + ) + ) + }) + .collect::>() + .join("/") + }; + + let groups = df.group_by(partition_by)?; + let groups = groups.take_groups(); + + let out: Box> = match groups { + GroupsProxy::Idx(idx) => Box::new(idx.into_iter().map(move |(_, group)| { + let part_df = + unsafe { df._take_unchecked_slice_sorted(&group, false, IsSorted::Ascending) }; + (get_hive_path_part(&part_df), part_df) + })), + GroupsProxy::Slice { groups, .. 
} => { + Box::new(groups.into_iter().map(move |[offset, len]| { + let part_df = df.slice(offset as i64, len as usize); + (get_hive_path_part(&part_df), part_df) + })) + }, + }; + + Ok(out) +} diff --git a/crates/polars-io/src/shared.rs b/crates/polars-io/src/shared.rs index bb18f8cba8d2..2788d10a54fd 100644 --- a/crates/polars-io/src/shared.rs +++ b/crates/polars-io/src/shared.rs @@ -120,13 +120,13 @@ pub(crate) fn finish_reader( pub(crate) fn schema_to_arrow_checked( schema: &Schema, - pl_flavor: bool, + compat_level: CompatLevel, _file_name: &str, ) -> PolarsResult { let fields = schema.iter_fields().map(|field| { #[cfg(feature = "object")] polars_ensure!(!matches!(field.data_type(), DataType::Object(_, _)), ComputeError: "cannot write 'Object' datatype to {}", _file_name); - Ok(field.data_type().to_arrow_field(field.name().as_str(), pl_flavor)) + Ok(field.data_type().to_arrow_field(field.name().as_str(), compat_level)) }).collect::>>()?; Ok(ArrowSchema::from(fields)) } diff --git a/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs index 2f9183f18cd6..19ce681d0523 100644 --- a/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs +++ b/crates/polars-lazy/src/physical_plan/streaming/construct_pipeline.rs @@ -248,12 +248,11 @@ fn get_pipeline_node( IR::MapFunction { function: FunctionNode::Pipeline { function: Arc::new(Mutex::new(move |_df: DataFrame| { - let mut state = ExecutionState::new(); + let state = ExecutionState::new(); if state.verbose() { eprintln!("RUN STREAMING PIPELINE"); eprintln!("{:?}", &pipelines) } - state.set_in_streaming_engine(); execute_pipeline(state, std::mem::take(&mut pipelines)) })), schema, diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 5ffb79fd1c63..e4adb35fcffe 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -216,7 +216,7 @@ impl LazyCsvReader 
{ where F: Fn(Schema) -> PolarsResult, { - let paths = self.expand_paths(false)?.0; + let paths = self.expand_paths_default()?; let Some(path) = paths.first() else { polars_bail!(ComputeError: "no paths specified for this reader"); }; @@ -262,7 +262,7 @@ impl LazyFileListReader for LazyCsvReader { /// Get the final [LazyFrame]. fn finish(self) -> PolarsResult { // `expand_paths` respects globs - let paths = self.expand_paths(false)?.0; + let paths = self.expand_paths_default()?; let mut lf: LazyFrame = DslBuilder::scan_csv(paths, self.read_options, self.cache, self.cloud_options)? diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index eaec83618732..86a70de80b81 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -17,6 +17,27 @@ pub(super) fn get_glob_start_idx(path: &[u8]) -> Option { memchr::memchr3(b'*', b'?', b'[', path) } +/// Checks if `expanded_paths` were expanded from a single directory +pub(super) fn expanded_from_single_directory>( + paths: &[P], + expanded_paths: &[P], +) -> bool { + // Single input that isn't a glob + paths.len() == 1 && get_glob_start_idx(paths[0].as_ref().to_str().unwrap().as_bytes()).is_none() + // And isn't a file + && { + ( + // For local paths, we can just use `is_dir` + !is_cloud_url(paths[0].as_ref()) && paths[0].as_ref().is_dir() + ) + || ( + // Otherwise we check the output path is different from the input path, so that we also + // handle the case of a directory containing a single file. + !expanded_paths.is_empty() && (paths[0].as_ref() != expanded_paths[0].as_ref()) + ) + } +} + /// Recursively traverses directories and expands globs if `glob` is `true`. /// Returns the expanded paths and the index at which to start parsing hive /// partitions from the path. 
@@ -72,7 +93,7 @@ fn expand_paths( let (cloud_location, store) = polars_io::cloud::build_object_store(path, cloud_options).await?; - let prefix = object_path_from_string(cloud_location.prefix.clone()); + let prefix = object_path_from_string(cloud_location.prefix.clone())?; let out = if !path.ends_with("/") && cloud_location.expansion.is_none() @@ -87,7 +108,7 @@ fn expand_paths( ))], ) } else { - use futures::{StreamExt, TryStreamExt}; + use futures::TryStreamExt; if !is_cloud { // FORCE_ASYNC in the test suite wants us to raise a proper error message @@ -107,10 +128,12 @@ fn expand_paths( } } + let cloud_location = &cloud_location; + let mut paths = store .list(Some(&prefix)) - .map(|x| { - x.map(|x| { + .try_filter_map(|x| async move { + let out = (x.size > 0).then(|| { PathBuf::from({ format_path( &cloud_location.scheme, @@ -118,7 +141,8 @@ fn expand_paths( x.location.as_ref(), ) }) - }) + }); + Ok(out) }) .try_collect::>() .await @@ -194,7 +218,7 @@ fn expand_paths( for path in paths { if path.is_dir() { stack.push_back(path); - } else { + } else if path.metadata()?.len() > 0 { out_paths.push(path); } } @@ -214,7 +238,7 @@ fn expand_paths( for path in paths { let path = path.map_err(to_compute_err)?; - if !path.is_dir() { + if !path.is_dir() && path.metadata()?.len() > 0 { out_paths.push(path); } } @@ -225,10 +249,31 @@ fn expand_paths( } } - Ok(( - out_paths.into_iter().collect::>(), - *expand_start_idx, - )) + let out_paths = if expanded_from_single_directory(paths, out_paths.as_ref()) { + // Require all file extensions to be the same when expanding a single directory. + let ext = out_paths[0].extension(); + + (0..out_paths.len()) + .map(|i| { + let path = out_paths[i].clone(); + + if path.extension() != ext { + polars_bail!( + InvalidOperation: r#"directory contained paths with different file extensions: \ + first path: {}, second path: {}. Please use a glob pattern to explicitly specify + which files to read (e.g. 
"dir/**/*", "dir/**/*.parquet")"#, + out_paths[i - 1].to_str().unwrap(), path.to_str().unwrap() + ); + }; + + Ok(path) + }) + .collect::>>()? + } else { + Arc::<[_]>::from(out_paths) + }; + + Ok((out_paths, *expand_start_idx)) } /// Reads [LazyFrame] from a filesystem or a cloud storage. @@ -242,7 +287,7 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let paths = self.expand_paths(false)?.0; + let paths = self.expand_paths_default()?; let lfs = paths .iter() @@ -345,4 +390,10 @@ pub trait LazyFileListReader: Clone { check_directory_level, ) } + + /// Expand paths without performing any directory level or file extension + /// checks. + fn expand_paths_default(&self) -> PolarsResult> { + self.expand_paths(false).map(|x| x.0) + } } diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 41a5b7b066de..a83a3c31b386 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -1,9 +1,10 @@ use std::path::{Path, PathBuf}; +use file_list_reader::expanded_from_single_directory; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; use polars_io::ipc::IpcScanOptions; -use polars_io::RowIndex; +use polars_io::{HiveOptions, RowIndex}; use crate::prelude::*; @@ -15,6 +16,7 @@ pub struct ScanArgsIpc { pub row_index: Option, pub memory_map: bool, pub cloud_options: Option, + pub hive_options: HiveOptions, } impl Default for ScanArgsIpc { @@ -26,6 +28,7 @@ impl Default for ScanArgsIpc { row_index: None, memory_map: true, cloud_options: Default::default(), + hive_options: Default::default(), } } } @@ -46,8 +49,14 @@ impl LazyIpcReader { } impl LazyFileListReader for LazyIpcReader { - fn finish(self) -> PolarsResult { - let paths = self.expand_paths(false)?.0; + fn finish(mut self) -> PolarsResult { + let (paths, hive_start_idx) = + self.expand_paths(self.args.hive_options.enabled.unwrap_or(false))?; + self.args.hive_options.enabled = + 
Some(self.args.hive_options.enabled.unwrap_or_else(|| { + expanded_from_single_directory(self.paths.as_ref(), paths.as_ref()) + })); + self.args.hive_options.hive_start_idx = hive_start_idx; let args = self.args; let options = IpcScanOptions { @@ -62,6 +71,7 @@ impl LazyFileListReader for LazyIpcReader { args.row_index, args.rechunk, args.cloud_options, + args.hive_options, )? .build() .into(); diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 15deb86b5e29..8f74c2da395f 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -96,7 +96,7 @@ impl LazyFileListReader for LazyJsonLineReader { return self.finish_no_glob(); } - let paths = self.expand_paths(false)?.0; + let paths = self.expand_paths_default()?; let file_options = FileScanOptions { n_rows: self.n_rows, diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index c4c503539784..fe4e1da1f43a 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -1,13 +1,12 @@ use std::path::{Path, PathBuf}; +use file_list_reader::expanded_from_single_directory; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; use polars_io::parquet::read::ParallelStrategy; -use polars_io::utils::is_cloud_url; use polars_io::{HiveOptions, RowIndex}; use crate::prelude::*; -use crate::scan::file_list_reader::get_glob_start_idx; #[derive(Clone)] pub struct ScanArgsParquet { @@ -63,13 +62,7 @@ impl LazyFileListReader for LazyParquetReader { self.expand_paths(self.args.hive_options.enabled.unwrap_or(false))?; self.args.hive_options.enabled = Some(self.args.hive_options.enabled.unwrap_or_else(|| { - self.paths.len() == 1 - && get_glob_start_idx(self.paths[0].to_str().unwrap().as_bytes()).is_none() - && !paths.is_empty() - && { - (!is_cloud_url(&paths[0]) && paths[0].is_dir()) - || (paths[0] != self.paths[0]) - } + 
expanded_from_single_directory(self.paths.as_ref(), paths.as_ref()) })); self.args.hive_options.hive_start_idx = hive_start_idx; diff --git a/crates/polars-lazy/src/tests/io.rs b/crates/polars-lazy/src/tests/io.rs index a81b59dcef61..29dd7c02695a 100644 --- a/crates/polars-lazy/src/tests/io.rs +++ b/crates/polars-lazy/src/tests/io.rs @@ -419,6 +419,7 @@ fn test_ipc_globbing() -> PolarsResult<()> { row_index: None, memory_map: true, cloud_options: None, + hive_options: Default::default(), }, )? .collect()?; diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index a1a449afdefc..1ff50b037218 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,29 +1,27 @@ use std::path::PathBuf; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::RwLock; +use hive::HivePartitions; use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; use polars_io::cloud::CloudOptions; use polars_io::predicates::apply_predicate; use polars_io::utils::is_cloud_url; -use polars_io::RowIndex; use rayon::prelude::*; use super::*; pub struct IpcExec { pub(crate) paths: Arc<[PathBuf]>, - pub(crate) schema: SchemaRef, + pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, pub(crate) options: IpcScanOptions, pub(crate) file_options: FileScanOptions, + pub(crate) hive_parts: Option>, pub(crate) cloud_options: Option, - pub(crate) metadata: Option, } impl IpcExec { - fn read(&mut self, verbose: bool) -> PolarsResult { + fn read(&mut self) -> PolarsResult { let is_cloud = self.paths.iter().any(is_cloud_url); let force_async = config::force_async(); @@ -35,12 +33,11 @@ impl IpcExec { #[cfg(feature = "cloud")] { - if force_async && verbose { + if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); } - polars_io::pl_async::get_runtime() - .block_on_potential_spawn(self.read_async(verbose))? 
+ polars_io::pl_async::get_runtime().block_on_potential_spawn(self.read_async())? } } else { self.read_sync()? @@ -53,7 +50,10 @@ impl IpcExec { Ok(out) } - fn read_sync(&mut self) -> PolarsResult { + fn read_impl PolarsResult + Send + Sync>( + &mut self, + path_idx_to_file: F, + ) -> PolarsResult { if config::verbose() { eprintln!("executing ipc read sync with row_index = {:?}, n_rows = {:?}, predicate = {:?} for paths {:?}", self.file_options.row_index.as_ref(), @@ -65,211 +65,111 @@ impl IpcExec { let projection = materialize_projection( self.file_options.with_columns.as_deref(), - &self.schema, + &self.file_info.schema, None, self.file_options.row_index.is_some(), ); - let n_rows = self - .file_options - .n_rows - .map(|n| IdxSize::try_from(n).unwrap()); - - let row_limit = n_rows.unwrap_or(IdxSize::MAX); + let read_path = |path_index: usize, n_rows: Option| { + IpcReader::new(path_idx_to_file(path_index)?) + .with_n_rows(n_rows) + .with_row_index(self.file_options.row_index.clone()) + .with_projection(projection.clone()) + .with_hive_partition_columns( + self.hive_parts + .as_ref() + .map(|x| x[path_index].materialize_partition_columns()), + ) + .memory_mapped( + self.options + .memory_map + .then(|| self.paths[path_index].clone()), + ) + .finish() + }; - // Used to determine the next file to open. This guarantees the order. 
- let path_index = AtomicUsize::new(0); - let row_counter = RwLock::new(ConsecutiveCountState::new(self.paths.len())); + let mut dfs = if let Some(mut n_rows) = self.file_options.n_rows { + let mut out = Vec::with_capacity(self.paths.len()); - let index_and_dfs = (0..self.paths.len()) - .into_par_iter() - .map(|_| -> PolarsResult<(usize, DataFrame)> { - let index = path_index.fetch_add(1, Ordering::Relaxed); - let path = &self.paths[index]; + for i in 0..self.paths.len() { + let df = read_path(i, Some(n_rows))?; + let df_height = df.height(); + out.push(df); - let already_read_in_sequence = row_counter.read().unwrap().sum(); - if already_read_in_sequence >= row_limit { - return Ok((index, Default::default())); + assert!( + df_height <= n_rows, + "impl error: got more rows than expected" + ); + if df_height == n_rows { + break; } + n_rows -= df_height; + } - let file = std::fs::File::open(path)?; + out + } else { + POOL.install(|| { + (0..self.paths.len()) + .into_par_iter() + .map(|i| read_path(i, None)) + .collect::>>() + })? + }; - let memory_mapped = if self.options.memory_map { - Some(path.clone()) - } else { - None - }; + if let Some(ref row_index) = self.file_options.row_index { + let mut offset = 0; + for df in &mut dfs { + df.apply(&row_index.name, |series| series.idx().unwrap() + offset) + .unwrap(); + offset += df.height(); + } + }; - let df = IpcReader::new(file) - .with_n_rows( - // NOTE: If there is any file that by itself exceeds the - // row limit, passing the total row limit to each - // individual reader helps. 
- n_rows.map(|n| { - n.saturating_sub(already_read_in_sequence) - .try_into() - .unwrap() - }), - ) - .with_row_index(self.file_options.row_index.clone()) - .with_projection(projection.clone()) - .memory_mapped(memory_mapped) - .finish()?; + let dfs = if let Some(predicate) = self.predicate.clone() { + let predicate = phys_expr_to_io_expr(predicate); + let predicate = Some(predicate.as_ref()); - row_counter - .write() - .unwrap() - .write(index, df.height().try_into().unwrap()); + POOL.install(|| { + dfs.into_par_iter() + .map(|mut df| { + apply_predicate(&mut df, predicate, true)?; + Ok(df) + }) + .collect::>>() + })? + } else { + dfs + }; - Ok((index, df)) - }) - .collect::>>()?; + accumulate_dataframes_vertical(dfs) + } - finish_index_and_dfs( - index_and_dfs, - row_counter.into_inner().unwrap(), - self.file_options.row_index.as_ref(), - row_limit, - self.predicate.as_ref(), - ) + fn read_sync(&mut self) -> PolarsResult { + let paths = self.paths.clone(); + self.read_impl(move |i| std::fs::File::open(&paths[i]).map_err(Into::into)) } #[cfg(feature = "cloud")] - async fn read_async(&mut self, verbose: bool) -> PolarsResult { - use futures::stream::{self, StreamExt}; - use futures::TryStreamExt; - - /// See https://users.rust-lang.org/t/implementation-of-fnonce-is-not-general-enough-with-async-block/83427/3. - trait AssertSend { - fn assert_send(self) -> impl Send + stream::Stream - where - Self: Send + stream::Stream + Sized, - { - self - } - } - - impl AssertSend for T {} - - let n_rows = self - .file_options - .n_rows - .map(|limit| limit.try_into().unwrap()); - - let row_limit = n_rows.unwrap_or(IdxSize::MAX); - - let row_counter = RwLock::new(ConsecutiveCountState::new(self.paths.len())); + async fn read_async(&mut self) -> PolarsResult { + // TODO: Better async impl that can download only the parts of the file it needs, and do it + // concurrently. 
+ use polars_io::file_cache::init_entries_from_uri_list; - let index_and_dfs = stream::iter(&*self.paths) - .enumerate() - .map(|(index, path)| { - let this = &*self; - let row_counter = &row_counter; - async move { - let already_read_in_sequence = row_counter.read().unwrap().sum(); - if already_read_in_sequence >= row_limit { - return Ok((index, Default::default())); - } - - let reader = IpcReaderAsync::from_uri( - path.to_str().unwrap(), - this.cloud_options.as_ref(), - ) - .await?; - let df = reader - .data( - this.metadata.as_ref(), - IpcReadOptions::default() - .with_row_limit( - // NOTE: If there is any file that by itself - // exceeds the row limit, passing the total - // row limit to each individual reader - // helps. - n_rows.map(|n| { - n.saturating_sub(already_read_in_sequence) - .try_into() - .unwrap() - }), - ) - .with_row_index(this.file_options.row_index.clone()) - .with_projection(this.file_options.with_columns.as_ref().cloned()), - verbose, - ) - .await?; - - row_counter - .write() - .unwrap() - .write(index, df.height().try_into().unwrap()); - - PolarsResult::Ok((index, df)) - } - }) - .assert_send() - .buffer_unordered(config::get_file_prefetch_size()) - .try_collect::>() - .await?; - - finish_index_and_dfs( - index_and_dfs, - row_counter.into_inner().unwrap(), - self.file_options.row_index.as_ref(), - row_limit, - self.predicate.as_ref(), - ) + tokio::task::block_in_place(|| { + let cache_entries = init_entries_from_uri_list( + self.paths + .iter() + .map(|x| Arc::from(x.to_str().unwrap())) + .collect::>() + .as_slice(), + self.cloud_options.as_ref(), + )?; + + self.read_impl(move |i| cache_entries[i].try_open_check_latest()) + }) } } -fn finish_index_and_dfs( - mut index_and_dfs: Vec<(usize, DataFrame)>, - row_counter: ConsecutiveCountState, - row_index: Option<&RowIndex>, - row_limit: IdxSize, - predicate: Option<&Arc>, -) -> PolarsResult { - index_and_dfs.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); - - debug_assert!( - 
index_and_dfs.iter().enumerate().all(|(a, &(b, _))| a == b), - "expected dataframe indices in order from 0 to len" - ); - - debug_assert_eq!(index_and_dfs.len(), row_counter.len()); - let mut offset = 0; - let mut df = accumulate_dataframes_vertical( - index_and_dfs - .into_iter() - .zip(row_counter.counts()) - .filter_map(|((_, mut df), count)| { - let count = count?; - - let remaining = row_limit.checked_sub(offset)?; - - // If necessary, correct having read too much from a single file. - if remaining < count { - df = df.slice(0, remaining.try_into().unwrap()); - } - - // If necessary, correct row indices now that we know the offset. - if let Some(row_index) = row_index { - df.apply(&row_index.name, |series| { - series.idx().expect("index column should be of index type") + offset - }) - .expect("index column should exist"); - } - - offset += count; - - Some(df) - }), - )?; - - let predicate = predicate.cloned().map(phys_expr_to_io_expr); - apply_predicate(&mut df, predicate.as_deref(), true)?; - - Ok(df) -} - impl Executor for IpcExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { @@ -283,6 +183,6 @@ impl Executor for IpcExec { Cow::Borrowed("") }; - state.record(|| self.read(state.verbose()), profile_name) + state.record(|| self.read(), profile_name) } } diff --git a/crates/polars-mem-engine/src/executors/scan/mod.rs b/crates/polars-mem-engine/src/executors/scan/mod.rs index cd7131f27632..1f50268db23f 100644 --- a/crates/polars-mem-engine/src/executors/scan/mod.rs +++ b/crates/polars-mem-engine/src/executors/scan/mod.rs @@ -24,8 +24,6 @@ use polars_io::predicates::PhysicalIoExpr; #[cfg(any(feature = "parquet", feature = "csv", feature = "ipc", feature = "cse"))] use polars_io::prelude::*; use polars_plan::global::_set_n_rows_for_scan; -#[cfg(feature = "ipc")] -pub(crate) use support::ConsecutiveCountState; use super::*; use crate::prelude::*; diff --git 
a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index ba3189ba2e0d..4b99f8f6a2a7 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -76,32 +76,33 @@ impl ParquetExec { // First initialize the readers, predicates and metadata. // This will be used to determine the slices. That way we can actually read all the // files in parallel even if we add row index columns or slices. - let readers_and_metadata = (0..paths.len()) - .map(|i| { - let path = &paths[i]; - let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); - - let file = std::fs::File::open(path)?; - let (projection, predicate) = prepare_scan_args( - self.predicate.clone(), - &mut self.file_options.with_columns.clone(), - &mut self.file_info.schema.clone(), - base_row_index.is_some(), - hive_partitions.as_deref(), - ); - - let mut reader = ParquetReader::new(file) - .read_parallel(parallel) - .set_low_memory(self.options.low_memory) - .use_statistics(self.options.use_statistics) - .set_rechunk(false) - .with_hive_partition_columns(hive_partitions); - - reader - .num_rows() - .map(|num_rows| (reader, num_rows, predicate, projection)) - }) - .collect::>>()?; + let iter = (0..paths.len()).into_par_iter().map(|i| { + let path = &paths[i]; + let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); + + let file = std::fs::File::open(path)?; + let (projection, predicate) = prepare_scan_args( + self.predicate.clone(), + &mut self.file_options.with_columns.clone(), + &mut self.file_info.schema.clone(), + base_row_index.is_some(), + hive_partitions.as_deref(), + ); + + let mut reader = ParquetReader::new(file) + .read_parallel(parallel) + .set_low_memory(self.options.low_memory) + .use_statistics(self.options.use_statistics) + .set_rechunk(false) + .with_hive_partition_columns(hive_partitions); + + reader + .num_rows() + 
.map(|num_rows| (reader, num_rows, predicate, projection)) + }); + + // We do this in parallel because wide tables can take a long time deserializing metadata. + let readers_and_metadata = POOL.install(|| iter.collect::>>())?; let iter = readers_and_metadata .iter() diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index f7f7bd46463a..b88ecaa1d7fe 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -269,12 +269,12 @@ fn create_physical_plan_impl( metadata, } => Ok(Box::new(executors::IpcExec { paths, - schema: file_info.schema, + file_info, predicate, options, file_options, + hive_parts, cloud_options, - metadata, })), #[cfg(feature = "parquet")] FileScan::Parquet { diff --git a/crates/polars-ops/src/chunked_array/gather/chunked.rs b/crates/polars-ops/src/chunked_array/gather/chunked.rs index 44a7ac951ba7..4b4aed6f7f87 100644 --- a/crates/polars-ops/src/chunked_array/gather/chunked.rs +++ b/crates/polars-ops/src/chunked_array/gather/chunked.rs @@ -208,7 +208,7 @@ where T::Array: Debug, { unsafe fn take_chunked_unchecked(&self, by: &[ChunkId], sorted: IsSorted) -> Self { - let arrow_dtype = self.dtype().to_arrow(true); + let arrow_dtype = self.dtype().to_arrow(CompatLevel::newest()); let mut out = if let Some(iter) = self.downcast_slices() { let targets = iter.collect::>(); @@ -245,7 +245,7 @@ where // Take function that checks of null state in `ChunkIdx`. 
unsafe fn take_opt_chunked_unchecked(&self, by: &[NullableChunkId]) -> Self { - let arrow_dtype = self.dtype().to_arrow(true); + let arrow_dtype = self.dtype().to_arrow(CompatLevel::newest()); if let Some(iter) = self.downcast_slices() { let targets = iter.collect::>(); diff --git a/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs b/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs index ba1427e6f6fd..ff52a6601589 100644 --- a/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs +++ b/crates/polars-ops/src/chunked_array/gather_skip_nulls.rs @@ -94,7 +94,8 @@ where .collect(); let gathered = unsafe { gather_skip_nulls_idx_pairs_unchecked(self, index_pairs, indices.len()) }; - let arr = T::Array::from_zeroable_vec(gathered, self.dtype().to_arrow(true)); + let arr = + T::Array::from_zeroable_vec(gathered, self.dtype().to_arrow(CompatLevel::newest())); Ok(ChunkedArray::from_chunk_iter_like(self, [arr])) } } @@ -140,7 +141,8 @@ where gather_skip_nulls_idx_pairs_unchecked(self, index_pairs, indices.as_ref().len()) }; - let mut arr = T::Array::from_zeroable_vec(gathered, self.dtype().to_arrow(true)); + let mut arr = + T::Array::from_zeroable_vec(gathered, self.dtype().to_arrow(CompatLevel::newest())); if indices.null_count() > 0 { let array_refs: Vec<&dyn Array> = indices.chunks().iter().map(|x| &**x).collect(); arr = arr.with_validity_typed(concatenate_validities(&array_refs)); diff --git a/crates/polars-ops/src/chunked_array/repeat_by.rs b/crates/polars-ops/src/chunked_array/repeat_by.rs index bdba858d5719..8ccf9ae58141 100644 --- a/crates/polars-ops/src/chunked_array/repeat_by.rs +++ b/crates/polars-ops/src/chunked_array/repeat_by.rs @@ -39,7 +39,7 @@ where unsafe { LargeListArray::from_iter_primitive_trusted_len( iter, - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), ) } })) diff --git a/crates/polars-ops/src/chunked_array/strings/extract.rs b/crates/polars-ops/src/chunked_array/strings/extract.rs index 
b56e1251c840..4c65b2ce8c3d 100644 --- a/crates/polars-ops/src/chunked_array/strings/extract.rs +++ b/crates/polars-ops/src/chunked_array/strings/extract.rs @@ -52,7 +52,7 @@ pub(super) fn extract_groups( .map(|ca| ca.into_series()); } - let data_type = dtype.try_to_arrow(true)?; + let data_type = dtype.try_to_arrow(CompatLevel::newest())?; let DataType::Struct(fields) = dtype else { unreachable!() // Implementation error if it isn't a struct. }; diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index 6c54de338676..a585e9837e39 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -107,7 +107,7 @@ pub trait Utf8JsonPathImpl: AsString { let array = polars_json::ndjson::deserialize::deserialize_iter( iter, - dtype.to_arrow(true), + dtype.to_arrow(CompatLevel::newest()), buf_size, ca.len(), ) diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index e253618d5335..a5f0b0197e9f 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -49,7 +49,7 @@ impl JoinCoalesce { use JoinCoalesce::*; use JoinType::*; match join_type { - Left | Inner => { + Left | Inner | Right => { matches!(self, JoinSpecific | CoalesceColumns) }, Full { .. } => { @@ -109,6 +109,7 @@ impl JoinArgs { pub enum JoinType { Inner, Left, + Right, Full, #[cfg(feature = "asof_join")] AsOf(AsOfOptions), @@ -130,6 +131,7 @@ impl Display for JoinType { use JoinType::*; let val = match self { Left => "LEFT", + Right => "RIGHT", Inner => "INNER", Full { .. 
} => "FULL", #[cfg(feature = "asof_join")] diff --git a/crates/polars-ops/src/frame/join/dispatch_left_right.rs b/crates/polars-ops/src/frame/join/dispatch_left_right.rs new file mode 100644 index 000000000000..d8dd5396b1e2 --- /dev/null +++ b/crates/polars-ops/src/frame/join/dispatch_left_right.rs @@ -0,0 +1,147 @@ +use super::*; +use crate::prelude::*; + +pub(super) fn left_join_from_series( + left: DataFrame, + right: &DataFrame, + s_left: &Series, + s_right: &Series, + args: JoinArgs, + verbose: bool, + drop_names: Option<&[&str]>, +) -> PolarsResult { + let (df_left, df_right) = materialize_left_join_from_series( + left, right, s_left, s_right, &args, verbose, drop_names, + )?; + _finish_join(df_left, df_right, args.suffix.as_deref()) +} + +pub(super) fn right_join_from_series( + left: &DataFrame, + right: DataFrame, + s_left: &Series, + s_right: &Series, + args: JoinArgs, + verbose: bool, + drop_names: Option<&[&str]>, +) -> PolarsResult { + // Swap the order of tables to do a right join. + let (df_right, df_left) = materialize_left_join_from_series( + right, left, s_right, s_left, &args, verbose, drop_names, + )?; + _finish_join(df_left, df_right, args.suffix.as_deref()) +} + +pub fn materialize_left_join_from_series( + mut left: DataFrame, + right_: &DataFrame, + s_left: &Series, + s_right: &Series, + args: &JoinArgs, + verbose: bool, + drop_names: Option<&[&str]>, +) -> PolarsResult<(DataFrame, DataFrame)> { + #[cfg(feature = "dtype-categorical")] + _check_categorical_src(s_left.dtype(), s_right.dtype())?; + + let mut s_left = s_left.clone(); + // Eagerly limit left if possible. + if let Some((offset, len)) = args.slice { + if offset == 0 { + left = left.slice(0, len); + s_left = s_left.slice(0, len); + } + } + + // Ensure that the chunks are aligned otherwise we go OOB. 
+ let mut right = Cow::Borrowed(right_); + let mut s_right = s_right.clone(); + if left.should_rechunk() { + left.as_single_chunk_par(); + s_left = s_left.rechunk(); + } + if right.should_rechunk() { + let mut other = right_.clone(); + other.as_single_chunk_par(); + right = Cow::Owned(other); + s_right = s_right.rechunk(); + } + + let ids = sort_or_hash_left(&s_left, &s_right, verbose, args.validation, args.join_nulls)?; + let right = if let Some(drop_names) = drop_names { + right.drop_many(drop_names) + } else { + right.drop(s_right.name()).unwrap() + }; + Ok(materialize_left_join(&left, &right, ids, args)) +} + +#[cfg(feature = "chunked_ids")] +fn materialize_left_join( + left: &DataFrame, + other: &DataFrame, + ids: LeftJoinIds, + args: &JoinArgs, +) -> (DataFrame, DataFrame) { + let (left_idx, right_idx) = ids; + let materialize_left = || match left_idx { + ChunkJoinIds::Left(left_idx) => unsafe { + let mut left_idx = &*left_idx; + if let Some((offset, len)) = args.slice { + left_idx = slice_slice(left_idx, offset, len); + } + left._create_left_df_from_slice(left_idx, true, true) + }, + ChunkJoinIds::Right(left_idx) => unsafe { + let mut left_idx = &*left_idx; + if let Some((offset, len)) = args.slice { + left_idx = slice_slice(left_idx, offset, len); + } + left.create_left_df_chunked(left_idx, true) + }, + }; + + let materialize_right = || match right_idx { + ChunkJoinOptIds::Left(right_idx) => unsafe { + let mut right_idx = &*right_idx; + if let Some((offset, len)) = args.slice { + right_idx = slice_slice(right_idx, offset, len); + } + IdxCa::with_nullable_idx(right_idx, |idx| other.take_unchecked(idx)) + }, + ChunkJoinOptIds::Right(right_idx) => unsafe { + let mut right_idx = &*right_idx; + if let Some((offset, len)) = args.slice { + right_idx = slice_slice(right_idx, offset, len); + } + other._take_opt_chunked_unchecked(right_idx) + }, + }; + POOL.join(materialize_left, materialize_right) +} + +#[cfg(not(feature = "chunked_ids"))] +fn materialize_left_join( 
+ left: &DataFrame, + other: &DataFrame, + ids: LeftJoinIds, + args: &JoinArgs, +) -> (DataFrame, DataFrame) { + let (left_idx, right_idx) = ids; + + let mut left_idx = &*left_idx; + if let Some((offset, len)) = args.slice { + left_idx = slice_slice(left_idx, offset, len); + } + let materialize_left = || unsafe { left._create_left_df_from_slice(&left_idx, true, true) }; + + let mut right_idx = &*right_idx; + if let Some((offset, len)) = args.slice { + right_idx = slice_slice(right_idx, offset, len); + } + let materialize_right = || { + let right_idx = &*right_idx; + unsafe { IdxCa::with_nullable_idx(right_idx, |idx| other.take_unchecked(idx)) } + }; + POOL.join(materialize_left, materialize_right) +} diff --git a/crates/polars-ops/src/frame/join/hash_join/mod.rs b/crates/polars-ops/src/frame/join/hash_join/mod.rs index dd970d523757..65e6d0a56dce 100644 --- a/crates/polars-ops/src/frame/join/hash_join/mod.rs +++ b/crates/polars-ops/src/frame/join/hash_join/mod.rs @@ -93,121 +93,6 @@ pub trait JoinDispatch: IntoDf { } } - #[cfg(not(feature = "chunked_ids"))] - fn _finish_left_join( - &self, - ids: LeftJoinIds, - other: &DataFrame, - args: JoinArgs, - ) -> PolarsResult { - let ca_self = self.to_df(); - let (left_idx, right_idx) = ids; - let materialize_left = - || unsafe { ca_self._create_left_df_from_slice(&left_idx, true, true) }; - - let materialize_right = || { - let right_idx = &*right_idx; - unsafe { IdxCa::with_nullable_idx(right_idx, |idx| other.take_unchecked(idx)) } - }; - let (df_left, df_right) = POOL.join(materialize_left, materialize_right); - - _finish_join(df_left, df_right, args.suffix.as_deref()) - } - - #[cfg(feature = "chunked_ids")] - fn _finish_left_join( - &self, - ids: LeftJoinIds, - other: &DataFrame, - args: JoinArgs, - ) -> PolarsResult { - let ca_self = self.to_df(); - let suffix = &args.suffix; - let (left_idx, right_idx) = ids; - let materialize_left = || match left_idx { - ChunkJoinIds::Left(left_idx) => unsafe { - let mut left_idx = 
&*left_idx; - if let Some((offset, len)) = args.slice { - left_idx = slice_slice(left_idx, offset, len); - } - ca_self._create_left_df_from_slice(left_idx, true, true) - }, - ChunkJoinIds::Right(left_idx) => unsafe { - let mut left_idx = &*left_idx; - if let Some((offset, len)) = args.slice { - left_idx = slice_slice(left_idx, offset, len); - } - ca_self.create_left_df_chunked(left_idx, true) - }, - }; - - let materialize_right = || match right_idx { - ChunkJoinOptIds::Left(right_idx) => unsafe { - let mut right_idx = &*right_idx; - if let Some((offset, len)) = args.slice { - right_idx = slice_slice(right_idx, offset, len); - } - IdxCa::with_nullable_idx(right_idx, |idx| other.take_unchecked(idx)) - }, - ChunkJoinOptIds::Right(right_idx) => unsafe { - let mut right_idx = &*right_idx; - if let Some((offset, len)) = args.slice { - right_idx = slice_slice(right_idx, offset, len); - } - other._take_opt_chunked_unchecked(right_idx) - }, - }; - let (df_left, df_right) = POOL.join(materialize_left, materialize_right); - - _finish_join(df_left, df_right, suffix.as_deref()) - } - - fn _left_join_from_series( - &self, - other: &DataFrame, - s_left: &Series, - s_right: &Series, - args: JoinArgs, - verbose: bool, - drop_names: Option<&[&str]>, - ) -> PolarsResult { - let df_self = self.to_df(); - #[cfg(feature = "dtype-categorical")] - _check_categorical_src(s_left.dtype(), s_right.dtype())?; - - let mut left = df_self.clone(); - let mut s_left = s_left.clone(); - // Eagerly limit left if possible. - if let Some((offset, len)) = args.slice { - if offset == 0 { - left = left.slice(0, len); - s_left = s_left.slice(0, len); - } - } - - // Ensure that the chunks are aligned otherwise we go OOB. 
- let mut right = Cow::Borrowed(other); - let mut s_right = s_right.clone(); - if left.should_rechunk() { - left.as_single_chunk_par(); - s_left = s_left.rechunk(); - } - if right.should_rechunk() { - let mut other = other.clone(); - other.as_single_chunk_par(); - right = Cow::Owned(other); - s_right = s_right.rechunk(); - } - - let ids = sort_or_hash_left(&s_left, &s_right, verbose, args.validation, args.join_nulls)?; - let right = if let Some(drop_names) = drop_names { - right.drop_many(drop_names) - } else { - right.drop(s_right.name()).unwrap() - }; - left._finish_left_join(ids, &right, args) - } - #[cfg(feature = "semi_anti_join")] /// # Safety /// `idx` must be in bounds diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 53c7dba6c0aa..229e13457f81 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -4,6 +4,7 @@ mod asof; #[cfg(feature = "dtype-categorical")] mod checks; mod cross_join; +mod dispatch_left_right; mod general; mod hash_join; #[cfg(feature = "merge_sorted")] @@ -209,8 +210,24 @@ pub trait DataFrameJoinOps: IntoDf { return match args.how { JoinType::Inner => left_df ._inner_join_from_series(other, s_left, s_right, args, _verbose, drop_names), - JoinType::Left => left_df - ._left_join_from_series(other, s_left, s_right, args, _verbose, drop_names), + JoinType::Left => dispatch_left_right::left_join_from_series( + self.to_df().clone(), + other, + s_left, + s_right, + args, + _verbose, + drop_names, + ), + JoinType::Right => dispatch_left_right::right_join_from_series( + self.to_df(), + other.clone(), + s_left, + s_right, + args, + _verbose, + drop_names, + ), JoinType::Full => left_df._full_join_from_series(other, s_left, s_right, args), #[cfg(feature = "semi_anti_join")] JoinType::Anti => left_df._semi_anti_join_from_series( @@ -306,7 +323,8 @@ pub trait DataFrameJoinOps: IntoDf { _verbose, drop_names.as_deref(), ), - JoinType::Left => 
left_df._left_join_from_series( + JoinType::Left => dispatch_left_right::left_join_from_series( + left_df.clone(), other, &lhs_keys, &rhs_keys, @@ -314,6 +332,15 @@ pub trait DataFrameJoinOps: IntoDf { _verbose, drop_names.as_deref(), ), + JoinType::Right => dispatch_left_right::right_join_from_series( + left_df, + other.clone(), + &lhs_keys, + &rhs_keys, + args, + _verbose, + drop_names.as_deref(), + ), #[cfg(feature = "semi_anti_join")] JoinType::Anti | JoinType::Semi => self._join_impl( other, diff --git a/crates/polars-ops/src/series/ops/ewm_by.rs b/crates/polars-ops/src/series/ops/ewm_by.rs index 1bc3630d6604..9ae0db056ae5 100644 --- a/crates/polars-ops/src/series/ops/ewm_by.rs +++ b/crates/polars-ops/src/series/ops/ewm_by.rs @@ -130,7 +130,7 @@ where } }; }); - let mut arr = T::Array::from_zeroable_vec(out, values.dtype().to_arrow(true)); + let mut arr = T::Array::from_zeroable_vec(out, values.dtype().to_arrow(CompatLevel::newest())); if (times.null_count() > 0) || (values.null_count() > 0) { let validity = binary_concatenate_validities(times, values); arr = arr.with_validity_typed(validity); @@ -179,7 +179,7 @@ where } }; }); - let mut arr = T::Array::from_zeroable_vec(out, values.dtype().to_arrow(true)); + let mut arr = T::Array::from_zeroable_vec(out, values.dtype().to_arrow(CompatLevel::newest())); if (times.null_count() > 0) || (values.null_count() > 0) { let validity = binary_concatenate_validities(times, values); arr = arr.with_validity_typed(validity); diff --git a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs index 0263b506920d..cffbe59f5f05 100644 --- a/crates/polars-ops/src/series/ops/interpolation/interpolate.rs +++ b/crates/polars-ops/src/series/ops/interpolation/interpolate.rs @@ -99,7 +99,7 @@ where } let array = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), out.into(), Some(validity.into()), ); diff --git 
a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs index f425ffaac7e7..7d76f7073cd5 100644 --- a/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs +++ b/crates/polars-ops/src/series/ops/interpolation/interpolate_by.rs @@ -151,7 +151,7 @@ where } let array = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), out.into(), Some(validity.into()), ); @@ -253,7 +253,7 @@ where } let array = PrimitiveArray::new( - T::get_dtype().to_arrow(true), + T::get_dtype().to_arrow(CompatLevel::newest()), out.into(), Some(validity.into()), ); diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs index aa74d1cc9b4a..0262798cc7db 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs @@ -70,7 +70,7 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { Some(additional), values, page_values, - ), + )?, BinaryState::Required(page) => { for x in page.values.by_ref().take(additional) { values.push(x) @@ -92,7 +92,7 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { Some(additional), offsets, page_values.lengths.by_ref(), - ); + )?; let length = *offsets.last() - last_offset; @@ -123,7 +123,7 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { .values .by_ref() .map(|index| page_dict.value(index as usize)), - ); + )?; page_values.values.get_result()?; }, BinaryState::RequiredDictionary(page) => { @@ -148,7 +148,7 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { Some(additional), values, page_values.by_ref(), - ); + )?; }, BinaryState::FilteredOptionalDelta(page_validity, page_values) => { extend_from_decoder( @@ -157,7 +157,7 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { Some(additional), values, 
page_values.by_ref(), - ); + )?; }, BinaryState::FilteredRequiredDictionary(page) => { // Already done on the dict. @@ -186,7 +186,7 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { .values .by_ref() .map(|index| page_dict.value(index as usize)), - ); + )?; page_values.values.get_result()?; }, BinaryState::OptionalDeltaByteArray(page_validity, page_values) => extend_from_decoder( @@ -195,7 +195,7 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { Some(additional), values, page_values, - ), + )?, BinaryState::DeltaByteArray(page_values) => { for x in page_values.take(additional) { values.push(x) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs index 4593fe16eea8..97c1548b7df3 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs @@ -179,13 +179,13 @@ impl<'a> FilteredDelta<'a> { #[derive(Debug)] pub(crate) struct RequiredDictionary<'a> { - pub values: hybrid_rle::BufferedHybridRleDecoderIter<'a>, + pub values: hybrid_rle::HybridRleDecoder<'a>, pub dict: &'a BinaryDict, } impl<'a> RequiredDictionary<'a> { pub fn try_new(page: &'a DataPage, dict: &'a BinaryDict) -> PolarsResult { - let values = utils::dict_indices_decoder(page)?.into_iter(); + let values = utils::dict_indices_decoder(page)?; Ok(Self { dict, values }) } @@ -198,13 +198,13 @@ impl<'a> RequiredDictionary<'a> { #[derive(Debug)] pub(crate) struct FilteredRequiredDictionary<'a> { - pub values: SliceFilteredIter>, + pub values: SliceFilteredIter>, pub dict: &'a BinaryDict, } impl<'a> FilteredRequiredDictionary<'a> { pub fn try_new(page: &'a DataPage, dict: &'a BinaryDict) -> PolarsResult { - let values = utils::dict_indices_decoder(page)?.into_iter(); + let values = utils::dict_indices_decoder(page)?; let rows = get_selected_rows(page); let values = SliceFilteredIter::new(values, 
rows); @@ -220,13 +220,13 @@ impl<'a> FilteredRequiredDictionary<'a> { #[derive(Debug)] pub(crate) struct ValuesDictionary<'a> { - pub values: hybrid_rle::BufferedHybridRleDecoderIter<'a>, + pub values: hybrid_rle::HybridRleDecoder<'a>, pub dict: &'a BinaryDict, } impl<'a> ValuesDictionary<'a> { pub fn try_new(page: &'a DataPage, dict: &'a BinaryDict) -> PolarsResult { - let values = utils::dict_indices_decoder(page)?.into_iter(); + let values = utils::dict_indices_decoder(page)?; Ok(Self { dict, values }) } @@ -283,6 +283,7 @@ pub(crate) fn deserialize_plain(values: &[u8], num_values: usize) -> BinaryDict for v in all { dict_values.push(v) } + dict_values.into() } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs index 1b3d65799293..3e0587d2ee17 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binview/basic.rs @@ -1,16 +1,19 @@ use std::cell::Cell; use std::collections::VecDeque; -use arrow::array::{Array, ArrayRef, BinaryViewArray, MutableBinaryViewArray, Utf8ViewArray}; +use arrow::array::{Array, ArrayRef, BinaryViewArray, MutableBinaryViewArray, Utf8ViewArray, View}; use arrow::bitmap::{Bitmap, MutableBitmap}; use arrow::datatypes::{ArrowDataType, PhysicalType}; use polars_error::PolarsResult; use polars_utils::iter::FallibleIterator; use super::super::binary::decoders::*; +use crate::parquet::encoding::hybrid_rle::BinaryDictionaryTranslator; +use crate::parquet::error::ParquetError; use crate::parquet::page::{DataPage, DictPage}; -use crate::read::deserialize::utils; -use crate::read::deserialize::utils::{extend_from_decoder, next, DecodedState, MaybeNext}; +use crate::read::deserialize::utils::{ + self, extend_from_decoder, next, DecodedState, MaybeNext, TranslatedHybridRle, +}; use crate::read::{PagesIter, PrimitiveLogicalType}; type DecodedStateTuple = 
(MutableBinaryViewArray<[u8]>, MutableBitmap); @@ -69,7 +72,7 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { Some(additional), values, page_values, - ), + )?, BinaryState::Required(page) => { for x in page.values.by_ref().take(additional) { values.push_value_ignore_validity(x) @@ -87,7 +90,7 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { Some(additional), values, page_values, - ); + )?; }, BinaryState::FilteredRequired(page) => { for x in page.values.by_ref().take(additional) { @@ -102,33 +105,78 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { BinaryState::OptionalDictionary(page_validity, page_values) => { // Already done on the dict. validate_utf8 = false; + let page_dict = &page_values.dict; + let offsets = page_dict.offsets(); + + // @NOTE: If there is no lengths (i.e. 0-1 offset), then we will have only nulls. + let max_length = offsets.lengths().max().unwrap_or(0); + + // We do not have to push the buffer if all elements fit as inline views. + let buffer_idx = if max_length <= View::MAX_INLINE_SIZE as usize { + 0 + } else { + values.push_buffer(page_dict.values().clone()) + }; + + // @NOTE: we could potentially use the View::new_inline function here, but that + // would require two collectors & two translators. So I don't think it is worth + // it. + let translator = BinaryDictionaryTranslator { + dictionary: page_dict, + buffer_idx, + }; + let collector = TranslatedHybridRle::new(&mut page_values.values, &translator); + utils::extend_from_decoder( validity, page_validity, Some(additional), values, - &mut page_values - .values - .by_ref() - .map(|index| page_dict.value(index as usize)), - ); - page_values.values.get_result()?; + collector, + )?; }, BinaryState::RequiredDictionary(page) => { // Already done on the dict. 
validate_utf8 = false; + let page_dict = &page.dict; + let offsets = page_dict.offsets(); - for x in page - .values - .by_ref() - .map(|index| page_dict.value(index as usize)) - .take(additional) - { - values.push_value_ignore_validity(x) + if let Some(max_length) = offsets.lengths().max() { + // We do not have to push the buffer if all elements fit as inline views. + let buffer_idx = if max_length <= View::MAX_INLINE_SIZE as usize { + 0 + } else { + values.push_buffer(page_dict.values().clone()) + }; + + // @NOTE: we could potentially use the View::new_inline function here, but that + // would require two collectors & two translators. So I don't think it is worth + // it. + let translator = BinaryDictionaryTranslator { + dictionary: page_dict, + buffer_idx, + }; + + page.values.translate_and_collect_n_into( + values.views_mut(), + additional, + &translator, + )?; + if let Some(validity) = values.validity() { + validity.extend_constant(additional, true); + } + } else { + // @NOTE: If there are no dictionary items, there is no way we can look up + // items. + if additional != 0 { + return Err(ParquetError::oos( + "Attempt to search items with empty dictionary", + ) + .into()); + } } - page.values.get_result()?; }, BinaryState::FilteredOptional(page_validity, page_values) => { extend_from_decoder( @@ -137,7 +185,7 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { Some(additional), values, page_values.by_ref(), - ); + )?; }, BinaryState::FilteredOptionalDelta(page_validity, page_values) => { extend_from_decoder( @@ -146,7 +194,7 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { Some(additional), values, page_values.by_ref(), - ); + )?; }, BinaryState::FilteredRequiredDictionary(page) => { // TODO! directly set the dict as buffers and only insert the proper views. 
@@ -179,7 +227,7 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { .values .by_ref() .map(|index| page_dict.value(index as usize)), - ); + )?; page_values.values.get_result()?; }, BinaryState::OptionalDeltaByteArray(page_validity, page_values) => extend_from_decoder( @@ -188,7 +236,7 @@ impl<'a> utils::Decoder<'a> for BinViewDecoder { Some(additional), values, page_values, - ), + )?, BinaryState::DeltaByteArray(page_values) => { for x in page_values.take(additional) { values.push_value_ignore_validity(x) @@ -273,17 +321,7 @@ pub(super) fn finish( } match data_type.to_physical_type() { - PhysicalType::BinaryView => unsafe { - Ok(BinaryViewArray::new_unchecked( - data_type.clone(), - array.views().clone(), - array.data_buffers().clone(), - array.validity().cloned(), - array.total_bytes_len(), - array.total_buffer_len(), - ) - .boxed()) - }, + PhysicalType::BinaryView => Ok(array.boxed()), PhysicalType::Utf8View => { // SAFETY: we already checked utf8 unsafe { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs index 6db66a0fba31..0e24c1bb318e 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs @@ -171,7 +171,7 @@ impl<'a> Decoder<'a> for BooleanDecoder { Some(remaining), values, &mut page_values.0, - ), + )?, State::Required(page) => { let remaining = remaining.min(page.length - page.offset); values.extend_from_slice(page.values, page.offset, remaining); @@ -190,7 +190,7 @@ impl<'a> Decoder<'a> for BooleanDecoder { Some(remaining), values, page_values.0.by_ref(), - ); + )?; }, State::RleOptional(page_validity, page_values) => { utils::extend_from_decoder( @@ -199,7 +199,7 @@ impl<'a> Decoder<'a> for BooleanDecoder { Some(remaining), values, &mut *page_values, - ); + )?; }, } Ok(()) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs 
b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs index 710b30fe0593..03d2a8476714 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs @@ -15,7 +15,7 @@ use super::utils::{ }; use super::PagesIter; use crate::parquet::deserialize::SliceFilteredIter; -use crate::parquet::encoding::hybrid_rle::BufferedHybridRleDecoderIter; +use crate::parquet::encoding::hybrid_rle::HybridRleDecoder; use crate::parquet::encoding::Encoding; use crate::parquet::page::{DataPage, DictPage, Page}; use crate::parquet::schema::Repetition; @@ -26,32 +26,29 @@ pub enum State<'a> { Optional(Optional<'a>), Required(Required<'a>), FilteredRequired(FilteredRequired<'a>), - FilteredOptional( - FilteredOptionalPageValidity<'a>, - BufferedHybridRleDecoderIter<'a>, - ), + FilteredOptional(FilteredOptionalPageValidity<'a>, HybridRleDecoder<'a>), } #[derive(Debug)] pub struct Required<'a> { - values: BufferedHybridRleDecoderIter<'a>, + values: HybridRleDecoder<'a>, } impl<'a> Required<'a> { fn try_new(page: &'a DataPage) -> PolarsResult { - let values = dict_indices_decoder(page)?.into_iter(); + let values = dict_indices_decoder(page)?; Ok(Self { values }) } } #[derive(Debug)] pub struct FilteredRequired<'a> { - values: SliceFilteredIter>, + values: SliceFilteredIter>, } impl<'a> FilteredRequired<'a> { fn try_new(page: &'a DataPage) -> PolarsResult { - let values = dict_indices_decoder(page)?.into_iter(); + let values = dict_indices_decoder(page)?; let rows = get_selected_rows(page); let values = SliceFilteredIter::new(values, rows); @@ -62,13 +59,13 @@ impl<'a> FilteredRequired<'a> { #[derive(Debug)] pub struct Optional<'a> { - values: BufferedHybridRleDecoderIter<'a>, + values: HybridRleDecoder<'a>, validity: OptionalPageValidity<'a>, } impl<'a> Optional<'a> { fn try_new(page: &'a DataPage) -> PolarsResult { - let values = dict_indices_decoder(page)?.into_iter(); + let values = 
dict_indices_decoder(page)?; Ok(Self { values, @@ -138,7 +135,7 @@ where (Encoding::PlainDictionary | Encoding::RleDictionary, true, true) => { Ok(State::FilteredOptional( FilteredOptionalPageValidity::try_new(page)?, - dict_indices_decoder(page)?.into_iter(), + dict_indices_decoder(page)?, )) }, _ => Err(utils::not_implemented(page)), @@ -173,7 +170,7 @@ where Err(_) => panic!("The maximum key is too small"), } }), - ); + )?; page.values.get_result()?; }, State::Required(page) => { @@ -210,7 +207,7 @@ where }; x }), - ); + )?; page_values.get_result()?; }, State::FilteredRequired(page) => { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs index f6ec4093c174..4ad39b8ea695 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs @@ -10,7 +10,7 @@ use super::super::super::PagesIter; use super::super::nested_utils::*; use super::super::utils::{dict_indices_decoder, not_implemented, MaybeNext, PageState}; use super::finish_key; -use crate::parquet::encoding::hybrid_rle::BufferedHybridRleDecoderIter; +use crate::parquet::encoding::hybrid_rle::HybridRleDecoder; use crate::parquet::encoding::Encoding; use crate::parquet::page::{DataPage, DictPage, Page}; use crate::parquet::schema::Repetition; @@ -18,13 +18,13 @@ use crate::parquet::schema::Repetition; // The state of a required DataPage with a boolean physical type #[derive(Debug)] pub struct Required<'a> { - values: BufferedHybridRleDecoderIter<'a>, + values: HybridRleDecoder<'a>, length: usize, } impl<'a> Required<'a> { fn try_new(page: &'a DataPage) -> PolarsResult { - let values = dict_indices_decoder(page)?.into_iter(); + let values = dict_indices_decoder(page)?; let length = page.num_values(); Ok(Self { values, length }) } @@ -34,7 +34,7 @@ impl<'a> Required<'a> { #[allow(clippy::large_enum_variant)] 
#[derive(Debug)] pub enum State<'a> { - Optional(BufferedHybridRleDecoderIter<'a>), + Optional(HybridRleDecoder<'a>), Required(Required<'a>), } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs index f09fb2011be0..41d6f4f5e9e0 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs @@ -84,13 +84,13 @@ impl<'a> FilteredRequired<'a> { #[derive(Debug)] pub(super) struct RequiredDictionary<'a> { - pub values: hybrid_rle::BufferedHybridRleDecoderIter<'a>, + pub values: hybrid_rle::HybridRleDecoder<'a>, pub dict: &'a Dict, } impl<'a> RequiredDictionary<'a> { pub(super) fn try_new(page: &'a DataPage, dict: &'a Dict) -> PolarsResult { - let values = dict_indices_decoder(page)?.into_iter(); + let values = dict_indices_decoder(page)?; Ok(Self { dict, values }) } @@ -103,14 +103,14 @@ impl<'a> RequiredDictionary<'a> { #[derive(Debug)] pub(super) struct OptionalDictionary<'a> { - pub(super) values: hybrid_rle::BufferedHybridRleDecoderIter<'a>, + pub(super) values: hybrid_rle::HybridRleDecoder<'a>, pub(super) validity: OptionalPageValidity<'a>, pub(super) dict: &'a Dict, } impl<'a> OptionalDictionary<'a> { pub(super) fn try_new(page: &'a DataPage, dict: &'a Dict) -> PolarsResult { - let values = dict_indices_decoder(page)?.into_iter(); + let values = dict_indices_decoder(page)?; Ok(Self { values, @@ -219,7 +219,7 @@ impl<'a> Decoder<'a> for BinaryDecoder { Some(remaining), values, &mut page.values, - ), + )?, State::Required(page) => { for x in page.values.by_ref().take(remaining) { values.push(x) @@ -236,11 +236,11 @@ impl<'a> Decoder<'a> for BinaryDecoder { &mut page.validity, Some(remaining), values, - page.values.by_ref().map(|index| { + &mut page.values.by_ref().map(|index| { let index = index as usize; &page.dict[index * self.size..(index + 1) 
* self.size] }), - ); + )?; page.values.get_result()?; }, State::RequiredDictionary(page) => { @@ -264,7 +264,7 @@ impl<'a> Decoder<'a> for BinaryDecoder { Some(remaining), values, page_values.by_ref(), - ); + )?; }, } Ok(()) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs index 395ea7f8e253..0aa505e65a84 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs @@ -8,7 +8,7 @@ use polars_utils::slice::GetSaferUnchecked; use super::super::PagesIter; use super::utils::{DecodedState, MaybeNext, PageState}; -use crate::parquet::encoding::hybrid_rle::{BufferedHybridRleDecoderIter, HybridRleDecoder}; +use crate::parquet::encoding::hybrid_rle::HybridRleDecoder; use crate::parquet::page::{split_buffer, DataPage, DictPage, Page}; use crate::parquet::read::levels::get_bit_width; @@ -239,7 +239,7 @@ pub fn init_nested(init: &[InitNested], capacity: usize) -> NestedState { } pub struct NestedPage<'a> { - iter: Peekable, BufferedHybridRleDecoderIter<'a>>>, + iter: Peekable, HybridRleDecoder<'a>>>, } impl<'a> NestedPage<'a> { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs index 4c3655e0a01a..ca24ff535f42 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs @@ -5,16 +5,17 @@ use arrow::bitmap::MutableBitmap; use arrow::datatypes::ArrowDataType; use arrow::types::NativeType; use polars_error::PolarsResult; -use polars_utils::iter::FallibleIterator; use super::super::utils::{ get_selected_rows, FilteredOptionalPageValidity, MaybeNext, OptionalPageValidity, }; use super::super::{utils, PagesIter}; use crate::parquet::deserialize::SliceFilteredIter; +use 
crate::parquet::encoding::hybrid_rle::DictionaryTranslator; use crate::parquet::encoding::{byte_stream_split, hybrid_rle, Encoding}; use crate::parquet::page::{split_buffer, DataPage, DictPage}; use crate::parquet::types::{decode, NativeType as ParquetNativeType}; +use crate::read::deserialize::utils::TranslatedHybridRle; #[derive(Debug)] pub(super) struct FilteredRequiredValues<'a> { @@ -65,23 +66,23 @@ pub(super) struct ValuesDictionary<'a, T> where T: NativeType, { - pub values: hybrid_rle::BufferedHybridRleDecoderIter<'a>, - pub dict: &'a Vec, + pub values: hybrid_rle::HybridRleDecoder<'a>, + pub dict: &'a [T], } impl<'a, T> ValuesDictionary<'a, T> where T: NativeType, { - pub fn try_new(page: &'a DataPage, dict: &'a Vec) -> PolarsResult { - let values = utils::dict_indices_decoder(page)?.into_iter(); + pub fn try_new(page: &'a DataPage, dict: &'a [T]) -> PolarsResult { + let values = utils::dict_indices_decoder(page)?; Ok(Self { dict, values }) } #[inline] pub fn len(&self) -> usize { - self.values.size_hint().0 + self.values.len() } } @@ -233,8 +234,8 @@ where page_validity, Some(remaining), values, - page_values.values.by_ref().map(decode).map(self.op), - ), + &mut page_values.values.by_ref().map(decode).map(self.op), + )?, State::Required(page) => { values.extend( page.values @@ -245,20 +246,22 @@ where ); }, State::OptionalDictionary(page_validity, page_values) => { - let op1 = |index: u32| page_values.dict[index as usize]; + let translator = DictionaryTranslator(page_values.dict); + let translated_hybridrle = + TranslatedHybridRle::new(&mut page_values.values, &translator); + utils::extend_from_decoder( validity, page_validity, Some(remaining), values, - &mut page_values.values.by_ref().map(op1), - ); - page_values.values.get_result()?; + translated_hybridrle, + )?; }, State::RequiredDictionary(page) => { - let op1 = |index: u32| page.dict[index as usize]; - values.extend(page.values.by_ref().map(op1).take(remaining)); - page.values.get_result()?; + let 
translator = DictionaryTranslator(page.dict); + page.values + .translate_and_collect_n_into(values, remaining, &translator)?; }, State::FilteredRequired(page) => { values.extend( @@ -275,8 +278,8 @@ where page_validity, Some(remaining), values, - page_values.values.by_ref().map(decode).map(self.op), - ); + &mut page_values.values.by_ref().map(decode).map(self.op), + )?; }, State::RequiredByteStreamSplit(decoder) => { values.extend(decoder.iter_converted(decode).map(self.op).take(remaining)); @@ -286,8 +289,8 @@ where page_validity, Some(remaining), values, - decoder.iter_converted(decode).map(self.op), - ), + &mut decoder.iter_converted(decode).map(self.op), + )?, } Ok(()) } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs index c23e6822567f..6d74f6417a2a 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs @@ -155,11 +155,11 @@ where page_validity, Some(remaining), values, - page_values + &mut page_values .by_ref() .map(|x| x.unwrap().as_()) .map(self.0.op), - ) + )? 
}, State::FilteredDeltaBinaryPackedRequired(page) => { values.extend( @@ -175,11 +175,11 @@ where page_validity, Some(remaining), values, - page_values + &mut page_values .by_ref() .map(|x| x.unwrap().as_()) .map(self.0.op), - ); + )?; }, } Ok(()) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs index 23e45bece122..994e31571111 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs @@ -1,5 +1,6 @@ use std::collections::VecDeque; +use arrow::array::{MutableBinaryViewArray, View}; use arrow::bitmap::utils::BitmapIter; use arrow::bitmap::MutableBitmap; use arrow::pushable::Pushable; @@ -9,7 +10,8 @@ use super::super::PagesIter; use crate::parquet::deserialize::{ FilteredHybridEncoded, FilteredHybridRleDecoderIter, HybridDecoderBitmapIter, HybridEncoded, }; -use crate::parquet::encoding::hybrid_rle; +use crate::parquet::encoding::hybrid_rle::{self, HybridRleDecoder, Translator}; +use crate::parquet::error::ParquetResult; use crate::parquet::indexes::Interval; use crate::parquet::page::{split_buffer, DataPage, DictPage, Page}; use crate::parquet::schema::Repetition; @@ -33,6 +35,91 @@ pub(super) trait PageValidity<'a> { fn next_limited(&mut self, limit: usize) -> Option>; } +pub trait BatchableCollector { + fn reserve(target: &mut T, n: usize); + fn push_n(&mut self, target: &mut T, n: usize) -> ParquetResult<()>; + fn push_n_nulls(&mut self, target: &mut T, n: usize) -> ParquetResult<()>; + fn skip_n(&mut self, n: usize) -> ParquetResult<()>; +} + +/// This batches sequential collect operations to try and prevent unnecessary buffering and +/// `Iterator::next` polling. 
+#[must_use] +pub struct BatchedCollector<'a, I, T, C: BatchableCollector> { + num_waiting_valids: usize, + num_waiting_invalids: usize, + + target: &'a mut T, + collector: C, + _pd: std::marker::PhantomData, +} + +impl<'a, I, T, C: BatchableCollector> BatchedCollector<'a, I, T, C> { + pub fn new(collector: C, target: &'a mut T) -> Self { + Self { + num_waiting_valids: 0, + num_waiting_invalids: 0, + target, + collector, + _pd: Default::default(), + } + } + + #[inline] + pub fn push_n_valids(&mut self, n: usize) -> ParquetResult<()> { + if self.num_waiting_invalids == 0 { + self.num_waiting_valids += n; + return Ok(()); + } + + self.collector + .push_n(self.target, self.num_waiting_valids)?; + self.collector + .push_n_nulls(self.target, self.num_waiting_invalids)?; + + self.num_waiting_valids = n; + self.num_waiting_invalids = 0; + + Ok(()) + } + + #[inline] + pub fn push_n_invalids(&mut self, n: usize) { + self.num_waiting_invalids += n; + } + + #[inline] + pub fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + if n == 0 { + return Ok(()); + } + + if self.num_waiting_valids > 0 { + self.collector + .push_n(self.target, self.num_waiting_valids)?; + } + if self.num_waiting_invalids > 0 { + self.collector + .push_n_nulls(self.target, self.num_waiting_invalids)?; + } + self.collector.skip_n(n)?; + + self.num_waiting_valids = 0; + self.num_waiting_invalids = 0; + + Ok(()) + } + + #[inline] + pub fn finalize(mut self) -> ParquetResult<()> { + self.collector + .push_n(self.target, self.num_waiting_valids)?; + self.collector + .push_n_nulls(self.target, self.num_waiting_invalids)?; + Ok(()) + } +} + #[derive(Debug, Clone)] pub struct FilteredOptionalPageValidity<'a> { iter: FilteredHybridRleDecoderIter<'a>, @@ -121,33 +208,6 @@ impl<'a> PageValidity<'a> for FilteredOptionalPageValidity<'a> { } } -pub struct Zip { - validity: V, - values: I, -} - -impl Zip { - pub fn new(validity: V, values: I) -> Self { - Self { validity, values } - } -} - -impl, I: Iterator> 
Iterator for Zip { - type Item = Option; - - #[inline] - fn next(&mut self) -> Option { - self.validity - .next() - .map(|x| if x { self.values.next() } else { None }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.validity.size_hint() - } -} - #[derive(Debug, Clone)] pub struct OptionalPageValidity<'a> { iter: HybridDecoderBitmapIter<'a>, @@ -227,11 +287,11 @@ impl<'a> PageValidity<'a> for OptionalPageValidity<'a> { } } -fn reserve_pushable_and_validity<'a, T, P: Pushable>( +fn reserve_pushable_and_validity<'a, I, T, C: BatchableCollector>( validity: &mut MutableBitmap, page_validity: &'a mut dyn PageValidity, limit: Option, - pushable: &mut P, + target: &mut T, ) -> Vec> { let limit = limit.unwrap_or(usize::MAX); @@ -257,20 +317,22 @@ fn reserve_pushable_and_validity<'a, T, P: Pushable>( }; runs.push(run) } - pushable.reserve(reserve_pushable); + C::reserve(target, reserve_pushable); validity.reserve(reserve_pushable); runs } /// Extends a [`Pushable`] from an iterator of non-null values and an hybrid-rle decoder -pub(super) fn extend_from_decoder, I: Iterator>( +pub(super) fn extend_from_decoder>( validity: &mut MutableBitmap, page_validity: &mut dyn PageValidity, limit: Option, - pushable: &mut P, - mut values_iter: I, -) { - let runs = reserve_pushable_and_validity(validity, page_validity, limit, pushable); + target: &mut T, + collector: C, +) -> ParquetResult<()> { + let runs = reserve_pushable_and_validity::(validity, page_validity, limit, target); + + let mut batched_collector = BatchedCollector::new(collector, target); // then a second loop to really fill the buffers for run in runs { @@ -281,31 +343,160 @@ pub(super) fn extend_from_decoder, I: Iterator>( length, } => { // consume `length` items - let iter = BitmapIter::new(values, offset, length); - let iter = Zip::new(iter, &mut values_iter); - - for item in iter { - if let Some(item) = item { - pushable.push(item) - } else { - pushable.push_null() - } + let mut validity_iter = 
BitmapIter::new(values, offset, length); + + let mut bit_sum = 0; + while validity_iter.num_remaining() != 0 { + let num_valid = validity_iter.take_leading_ones(); + bit_sum += num_valid; + batched_collector.push_n_valids(num_valid)?; + + let num_invalid = validity_iter.take_leading_zeros(); + bit_sum += num_invalid; + batched_collector.push_n_invalids(num_invalid); } + + debug_assert_eq!(bit_sum, length); + validity.extend_from_slice(values, offset, length); }, FilteredHybridEncoded::Repeated { is_set, length } => { validity.extend_constant(length, is_set); if is_set { - for v in (&mut values_iter).take(length) { - pushable.push(v) - } + batched_collector.push_n_valids(length)?; } else { - pushable.extend_null_constant(length); + batched_collector.push_n_invalids(length); } }, - FilteredHybridEncoded::Skipped(valids) => for _ in values_iter.by_ref().take(valids) {}, + FilteredHybridEncoded::Skipped(valids) => batched_collector.skip_in_place(valids)?, }; } + + batched_collector.finalize()?; + + Ok(()) +} + +/// This translates and collects items from a [`HybridRleDecoder`] into a target [`Vec`]. +/// +/// This batches sequential collect operations to try and prevent unnecessary buffering. 
+pub struct TranslatedHybridRle<'a, 'b, 'c, O, T> +where + O: Clone + Default, + T: Translator, +{ + decoder: &'a mut HybridRleDecoder<'b>, + translator: &'c T, + _pd: std::marker::PhantomData, +} + +impl<'a, 'b, 'c, O, T> TranslatedHybridRle<'a, 'b, 'c, O, T> +where + O: Clone + Default, + T: Translator, +{ + pub fn new(decoder: &'a mut HybridRleDecoder<'b>, translator: &'c T) -> Self { + Self { + decoder, + translator, + _pd: Default::default(), + } + } +} + +impl<'a, 'b, 'c, O, T> BatchableCollector> for TranslatedHybridRle<'a, 'b, 'c, O, T> +where + O: Clone + Default, + T: Translator, +{ + #[inline] + fn reserve(target: &mut Vec, n: usize) { + target.reserve(n); + } + + #[inline] + fn push_n(&mut self, target: &mut Vec, n: usize) -> ParquetResult<()> { + self.decoder + .translate_and_collect_n_into(target, n, self.translator) + } + + #[inline] + fn push_n_nulls(&mut self, target: &mut Vec, n: usize) -> ParquetResult<()> { + target.resize(target.len() + n, O::default()); + Ok(()) + } + + #[inline] + fn skip_n(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } +} + +impl<'a, 'b, 'c, T> BatchableCollector> + for TranslatedHybridRle<'a, 'b, 'c, View, T> +where + T: Translator, +{ + #[inline] + fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { + target.reserve(n); + } + + #[inline] + fn push_n(&mut self, target: &mut MutableBinaryViewArray<[u8]>, n: usize) -> ParquetResult<()> { + self.decoder + .translate_and_collect_n_into(target.views_mut(), n, self.translator)?; + + if let Some(validity) = target.validity() { + validity.extend_constant(n, true); + } + + Ok(()) + } + + #[inline] + fn push_n_nulls( + &mut self, + target: &mut MutableBinaryViewArray<[u8]>, + n: usize, + ) -> ParquetResult<()> { + target.extend_null(n); + Ok(()) + } + + #[inline] + fn skip_n(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } +} + +impl, I: Iterator> BatchableCollector for I { + #[inline] + fn reserve(target: 
&mut P, n: usize) { + target.reserve(n); + } + + #[inline] + fn push_n(&mut self, target: &mut P, n: usize) -> ParquetResult<()> { + target.extend_n(n, self); + Ok(()) + } + + #[inline] + fn push_n_nulls(&mut self, target: &mut P, n: usize) -> ParquetResult<()> { + target.extend_null_constant(n); + Ok(()) + } + + #[inline] + fn skip_n(&mut self, n: usize) -> ParquetResult<()> { + if n == 0 { + return Ok(()); + } + + _ = self.nth(n); + Ok(()) + } } /// The state of a partially deserialized page diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index 90ed2269c1a5..9022bab0e2c9 100644 --- a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -81,7 +81,7 @@ pub struct WriteOptions { /// The compression to apply to every page pub compression: CompressionOptions, /// The size to flush a page, defaults to 1024 * 1024 if None - pub data_pagesize_limit: Option, + pub data_page_size: Option, } use arrow::compute::aggregate::estimated_bytes_size; @@ -298,7 +298,7 @@ pub fn array_to_pages( let byte_size = estimated_bytes_size(primitive_array); const DEFAULT_PAGE_SIZE: usize = 1024 * 1024; - let max_page_size = options.data_pagesize_limit.unwrap_or(DEFAULT_PAGE_SIZE); + let max_page_size = options.data_page_size.unwrap_or(DEFAULT_PAGE_SIZE); let max_page_size = max_page_size.min(2usize.pow(31) - 2usize.pow(25)); // allowed maximum page size let bytes_per_row = if number_of_rows == 0 { 0 diff --git a/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs index d3361f0f44c0..a85616a6c300 100644 --- a/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs +++ b/crates/polars-parquet/src/parquet/encoding/bitpacked/decode.rs @@ -89,7 +89,97 @@ impl<'a, T: Unpackable> Decoder<'a, T> { } } +/// A iterator over the exact chunks in a [`Decoder`]. 
+/// +/// The remainder can be accessed using `remainder` or `next_inexact`. +pub struct ChunkedDecoder<'a, 'b, T: Unpackable> { + pub(crate) decoder: &'b mut Decoder<'a, T>, +} + +impl<'a, 'b, T: Unpackable> Iterator for ChunkedDecoder<'a, 'b, T> { + type Item = T::Unpacked; + + #[inline] + fn next(&mut self) -> Option { + if self.decoder.len() < T::Unpacked::LENGTH { + return None; + } + + let mut unpacked = T::Unpacked::zero(); + let packed = self.decoder.packed.next()?; + decode_pack::(packed, self.decoder.num_bits, &mut unpacked); + self.decoder.length -= T::Unpacked::LENGTH; + Some(unpacked) + } + + fn size_hint(&self) -> (usize, Option) { + let is_exact = self.decoder.len() % T::Unpacked::LENGTH == 0; + let (low, high) = self.decoder.packed.size_hint(); + + let delta = usize::from(!is_exact); + + (low - delta, high.map(|h| h - delta)) + } +} + +impl<'a, 'b, T: Unpackable> ExactSizeIterator for ChunkedDecoder<'a, 'b, T> {} + +impl<'a, 'b, T: Unpackable> ChunkedDecoder<'a, 'b, T> { + /// Get and consume the remainder chunk if it exists + pub fn remainder(&mut self) -> Option<(T::Unpacked, usize)> { + let remainder_len = self.decoder.len() % T::Unpacked::LENGTH; + + if remainder_len > 0 { + let mut unpacked = T::Unpacked::zero(); + let packed = self.decoder.packed.next_back().unwrap(); + decode_pack::(packed, self.decoder.num_bits, &mut unpacked); + self.decoder.length -= remainder_len; + return Some((unpacked, remainder_len)); + } + + None + } + + /// Get the next (possibly partial) chunk and its filled length + pub fn next_inexact(&mut self) -> Option<(T::Unpacked, usize)> { + if self.decoder.len() >= T::Unpacked::LENGTH { + Some((self.next().unwrap(), T::Unpacked::LENGTH)) + } else { + self.remainder() + } + } +} + impl<'a, T: Unpackable> Decoder<'a, T> { + pub fn chunked<'b>(&'b mut self) -> ChunkedDecoder<'a, 'b, T> { + ChunkedDecoder { decoder: self } + } + + pub fn len(&self) -> usize { + self.length + } + + pub fn skip_chunks(&mut self, n: usize) { + 
for _ in (&mut self.packed).take(n) {} + } + + pub fn take(&mut self) -> Self { + let block_size = std::mem::size_of::() * self.num_bits; + let packed = std::mem::replace(&mut self.packed, [].chunks(block_size)); + let length = self.length; + self.length = 0; + + debug_assert_eq!(self.len(), 0); + + Self { + packed, + num_bits: self.num_bits, + length, + _pd: Default::default(), + } + } + + #[inline] pub fn collect_into(mut self, vec: &mut Vec) { // @NOTE: // When microbenchmarking changing this from a element-wise iterator to a collect into diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/buffered.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/buffered.rs new file mode 100644 index 000000000000..6c40d4c27720 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/buffered.rs @@ -0,0 +1,267 @@ +use super::Translator; +use crate::parquet::encoding::bitpacked::{self, Unpackable, Unpacked}; +use crate::parquet::error::ParquetResult; + +#[derive(Debug, Clone)] +pub struct BufferedBitpacked<'a> { + pub unpacked: [u32; 32], + pub unpacked_start: usize, + pub unpacked_end: usize, + + pub decoder: bitpacked::Decoder<'a, u32>, +} + +#[derive(Debug, Clone)] +pub struct BufferedRle { + pub value: u32, + pub length: usize, +} + +/// A buffered set of items for the [`HybridRleDecoder`]. This can be iterated over and stopped at +/// any time. 
+#[derive(Debug, Clone)] +pub enum HybridRleBuffered<'a> { + Bitpacked(BufferedBitpacked<'a>), + Rle(BufferedRle), +} + +impl Iterator for BufferedRle { + type Item = u32; + + fn next(&mut self) -> Option { + if self.length > 0 { + self.length -= 1; + Some(self.value) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.length, Some(self.length)) + } +} + +impl ExactSizeIterator for BufferedRle {} + +impl<'a> Iterator for BufferedBitpacked<'a> { + type Item = u32; + + fn next(&mut self) -> Option { + if self.unpacked_start < self.unpacked_end { + let value = self.unpacked[self.unpacked_start]; + self.unpacked_start += 1; + return Some(value); + } + + self.decoder + .chunked() + .next_inexact() + .map(|(unpacked, unpacked_length)| { + debug_assert!(unpacked_length > 0); + let value = unpacked[0]; + self.unpacked = unpacked; + self.unpacked_end = unpacked_length; + self.unpacked_start = 1; + value + }) + } + + fn size_hint(&self) -> (usize, Option) { + let unpacked_num_elements = self.unpacked_end - self.unpacked_start; + let exact = unpacked_num_elements + self.decoder.len(); + (exact, Some(exact)) + } +} + +impl<'a> ExactSizeIterator for BufferedBitpacked<'a> {} + +impl<'a> Iterator for HybridRleBuffered<'a> { + type Item = u32; + + fn next(&mut self) -> Option { + match self { + HybridRleBuffered::Bitpacked(b) => b.next(), + HybridRleBuffered::Rle(b) => b.next(), + } + } + + fn size_hint(&self) -> (usize, Option) { + match self { + HybridRleBuffered::Bitpacked(b) => b.size_hint(), + HybridRleBuffered::Rle(b) => b.size_hint(), + } + } +} + +impl<'a> ExactSizeIterator for HybridRleBuffered<'a> {} + +impl<'a> BufferedBitpacked<'a> { + fn translate_and_collect_limited_into( + &mut self, + target: &mut Vec, + limit: usize, + translator: &impl Translator, + ) -> ParquetResult { + let unpacked_num_elements = self.unpacked_end - self.unpacked_start; + if limit <= unpacked_num_elements { + translator.translate_slice( + target, + 
&self.unpacked[self.unpacked_start..self.unpacked_start + limit], + )?; + self.unpacked_start += limit; + return Ok(limit); + } + + translator.translate_slice( + target, + &self.unpacked[self.unpacked_start..self.unpacked_end], + )?; + self.unpacked_end = 0; + self.unpacked_start = 0; + let limit = limit - unpacked_num_elements; + + let decoder = self.decoder.take(); + let decoder_len = decoder.len(); + if limit >= decoder_len { + translator.translate_bitpacked_all(target, decoder)?; + Ok(unpacked_num_elements + decoder_len) + } else { + let buffered = translator.translate_bitpacked_limited(target, limit, decoder)?; + *self = buffered; + Ok(unpacked_num_elements + limit) + } + } + + pub fn translate_and_collect_into( + self, + target: &mut Vec, + translator: &impl Translator, + ) -> ParquetResult { + let unpacked_num_elements = self.unpacked_end - self.unpacked_start; + translator.translate_slice( + target, + &self.unpacked[self.unpacked_start..self.unpacked_end], + )?; + let decoder_len = self.decoder.len(); + translator.translate_bitpacked_all(target, self.decoder)?; + Ok(unpacked_num_elements + decoder_len) + } + + pub fn skip_in_place(&mut self, n: usize) -> usize { + let unpacked_num_elements = self.unpacked_end - self.unpacked_start; + + if n < unpacked_num_elements { + self.unpacked_start += n; + return n; + } + + let n = n - unpacked_num_elements; + + if self.decoder.len() > n { + let num_chunks = n / ::Unpacked::LENGTH; + let unpacked_offset = n % ::Unpacked::LENGTH; + self.decoder.skip_chunks(num_chunks); + let (unpacked, unpacked_length) = self.decoder.chunked().next_inexact().unwrap(); + + self.unpacked = unpacked; + self.unpacked_start = unpacked_offset; + self.unpacked_end = unpacked_length; + + return unpacked_num_elements + n; + } + + self.decoder.len() + unpacked_num_elements + } +} + +impl BufferedRle { + pub fn translate_and_collect_limited_into( + &mut self, + target: &mut Vec, + limit: usize, + translator: &impl Translator, + ) -> ParquetResult 
{ + let value = translator.translate(self.value)?; + let num_elements = usize::min(self.length, limit); + self.length -= num_elements; + target.resize(target.len() + num_elements, value); + Ok(num_elements) + } + + pub fn translate_and_collect_into( + self, + target: &mut Vec, + translator: &impl Translator, + ) -> ParquetResult { + let value = translator.translate(self.value)?; + target.resize(target.len() + self.length, value); + Ok(self.length) + } + + pub fn skip_in_place(&mut self, n: usize) -> usize { + let num_elements = usize::min(self.length, n); + self.length -= num_elements; + num_elements + } +} + +impl<'a> HybridRleBuffered<'a> { + pub fn translate_and_collect_limited_into( + &mut self, + target: &mut Vec, + limit: usize, + translator: &impl Translator, + ) -> ParquetResult { + let start_target_length = target.len(); + let start_length = self.len(); + + let num_processed = match self { + HybridRleBuffered::Bitpacked(b) => { + b.translate_and_collect_limited_into(target, limit, translator) + }, + HybridRleBuffered::Rle(b) => { + b.translate_and_collect_limited_into(target, limit, translator) + }, + }?; + + debug_assert!(num_processed <= limit); + debug_assert_eq!(num_processed, target.len() - start_target_length); + debug_assert_eq!(num_processed, start_length - self.len()); + + Ok(num_processed) + } + + pub fn translate_and_collect_into( + self, + target: &mut Vec, + translator: &impl Translator, + ) -> ParquetResult { + let start_target_length = target.len(); + let start_length = self.len(); + + let num_processed = match self { + HybridRleBuffered::Bitpacked(b) => b.translate_and_collect_into(target, translator), + HybridRleBuffered::Rle(b) => b.translate_and_collect_into(target, translator), + }?; + + debug_assert_eq!(num_processed, target.len() - start_target_length); + debug_assert_eq!(num_processed, start_length); + + Ok(num_processed) + } + + pub fn skip_in_place(&mut self, n: usize) -> usize { + let start_length = self.len(); + + let num_skipped 
= match self { + HybridRleBuffered::Bitpacked(b) => b.skip_in_place(n), + HybridRleBuffered::Rle(b) => b.skip_in_place(n), + }; + + debug_assert!(num_skipped <= n); + debug_assert_eq!(num_skipped, start_length - self.len()); + + num_skipped + } +} diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/fuzz.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/fuzz.rs new file mode 100644 index 000000000000..1e2c42d985f5 --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/fuzz.rs @@ -0,0 +1,359 @@ +/// Since the HybridRle decoder is very widely used within the Parquet reader and the code is quite +/// complex to facilitate performance. We create this small fuzzer +use std::collections::VecDeque; + +use rand::Rng; + +use super::*; + +fn run_iteration( + bs: &[u32], + collects: impl Iterator, + encoded: &mut Vec, + decoded: &mut Vec, + num_bits: u32, +) -> ParquetResult<()> { + encoded.clear(); + decoded.clear(); + + encoder::encode(encoded, bs.iter().copied(), num_bits).unwrap(); + + let mut decoder = HybridRleDecoder::new(&encoded[..], num_bits, bs.len()); + + for c in collects { + decoder.collect_n_into(decoded, c)?; + } + + Ok(()) +} + +/// Minimizes a failing case +fn minimize_failing_case( + bs: &mut Vec, + collects: &mut VecDeque, + encoded: &mut Vec, + decoded: &mut Vec, + num_bits: u32, +) -> ParquetResult<()> { + loop { + let initial_bs_len = bs.len(); + let initial_collects_len = collects.len(); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + assert_ne!(&bs, &decoded); + + while collects.len() > 2 { + let last = collects.pop_back().unwrap(); + + *collects.back_mut().unwrap() += last; + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + *collects.back_mut().unwrap() -= last; + collects.push_back(last); + break; + } + } + + while collects.len() > 2 { + let first = collects.pop_front().unwrap(); + + *collects.front_mut().unwrap() += first; 
+ + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + *collects.front_mut().unwrap() -= first; + collects.push_front(first); + break; + } + } + + while bs.len() > 1 { + let last = bs.pop().unwrap(); + *collects.back_mut().unwrap() -= 1; + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + bs.push(last); + *collects.back_mut().unwrap() += 1; + break; + } + + if *collects.back().unwrap() == 0 { + collects.pop_back().unwrap(); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + collects.push_back(0); + break; + } + } + } + + while bs.len() > 1 { + let last = bs.pop().unwrap(); + *collects.front_mut().unwrap() -= 1; + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + bs.push(last); + *collects.front_mut().unwrap() += 1; + break; + } + + if *collects.front().unwrap() == 0 { + collects.pop_front().unwrap(); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + collects.push_front(0); + break; + } + } + } + + while bs.len() > 1 { + let first = bs.remove(0); + *collects.back_mut().unwrap() -= 1; + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + bs.insert(0, first); + *collects.back_mut().unwrap() += 1; + break; + } + + if *collects.back().unwrap() == 0 { + collects.pop_back().unwrap(); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + collects.push_back(0); + break; + } + } + } + + while bs.len() > 1 { + let first = bs.remove(0); + *collects.front_mut().unwrap() -= 1; + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + bs.insert(0, first); + *collects.front_mut().unwrap() += 1; + break; + } + + if *collects.front().unwrap() == 0 { + collects.pop_front().unwrap(); + + 
run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + collects.push_front(0); + break; + } + } + } + + let mut start_offset = collects[0]; + for i in 1..collects.len() - 1 { + loop { + let start_length = collects[i]; + + while collects[i] > 0 { + collects[i] -= 1; + let item = bs.remove(start_offset); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + bs.insert(start_offset, item); + collects[i] += 1; + break; + } + + if collects[i] == 0 { + collects.remove(i); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + collects.insert(i, 0); + break; + } + } + } + + while collects[i] > 0 { + collects[i] -= 1; + let end_offset = start_offset + collects[i] - 1; + let item = bs.remove(end_offset); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + bs.insert(end_offset, item); + collects[i] += 1; + break; + } + + if collects[i] == 0 { + collects.remove(i); + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + if bs == decoded { + collects.insert(i, 0); + break; + } + } + } + + if collects[i] == start_length { + break; + } + } + + start_offset += collects[i]; + } + + let now_bs_len = bs.len(); + let now_collects_len = collects.len(); + + if initial_bs_len == now_bs_len && initial_collects_len == now_collects_len { + break; + } + } + + run_iteration(bs, collects.iter().copied(), encoded, decoded, num_bits)?; + + Ok(()) +} + +fn fuzz_loops(num_loops: usize) -> ParquetResult<()> { + let mut rng = rand::thread_rng(); + + const MAX_LENGTH: usize = 10_000; + + let mut encoded = Vec::with_capacity(1024); + let mut decoded = Vec::with_capacity(1024); + + let mut bs = Vec::with_capacity(MAX_LENGTH); + let mut collects: VecDeque = VecDeque::with_capacity(2000); + + for i in 0..num_loops { + collects.clear(); + bs.clear(); + + let num_bits = rng.gen_range(0..=32); 
+ let mask = 1u32.wrapping_shl(num_bits).wrapping_sub(1); + + let length = rng.gen_range(1..=MAX_LENGTH); + + unsafe { bs.set_len(length) }; + rng.fill(&mut bs[..]); + + let mut filled = 0; + while filled < bs.len() { + if rng.gen() { + let num_repeats = rng.gen_range(0..=(bs.len() - filled)); + let value = bs[filled] & mask; + for j in 0..num_repeats { + bs[filled + j] = value; + } + filled += num_repeats; + } else { + bs[filled] &= mask; + filled += 1; + } + } + + if rng.gen() { + let mut num_values = bs.len(); + while num_values > 0 { + let n = rng.gen_range(0..=num_values); + collects.push_back(n); + num_values -= n; + } + } else { + collects.resize(1, bs.len()); + } + + run_iteration( + &bs, + collects.iter().copied(), + &mut encoded, + &mut decoded, + num_bits, + )?; + + if decoded != bs { + minimize_failing_case(&mut bs, &mut collects, &mut encoded, &mut decoded, num_bits)?; + + eprintln!("Minimized case:"); + eprintln!("Expected: {bs:?}"); + eprintln!("Found: {decoded:?}"); + eprintln!("Collects: {collects:?}"); + eprintln!(); + + panic!("Found a failing case..."); + } + + if i % 512 == 0 { + eprintln!("{i} iterations done."); + } + } + + Ok(()) +} + +#[test] +fn small_fuzz() -> ParquetResult<()> { + fuzz_loops(2048) +} + +#[test] +#[ignore = "Large fuzz test. 
Too slow"] +fn large_fuzz() -> ParquetResult<()> { + fuzz_loops(1_000_000) +} + +#[test] +fn found_cases() -> ParquetResult<()> { + let mut encoded = Vec::with_capacity(1024); + let mut decoded = Vec::with_capacity(1024); + + let num_bits = 7; + + let bs: [u32; 1024] = std::array::from_fn(|i| (i / 10) as u32); + + encoder::encode(&mut encoded, bs.iter().copied(), num_bits).unwrap(); + let mut decoder = HybridRleDecoder::new(&encoded[..], num_bits, bs.len()); + + while decoder.len() != 0 { + let n = decoder.next().unwrap(); + decoded.push(n); + } + + for _ in 0..1 { + _ = decoder.next(); + } + + decoder.get_result()?; + + assert_eq!(&decoded, &bs); + + Ok(()) +} diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs index 149fa3ceaa4e..079f600fd45a 100644 --- a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/mod.rs @@ -1,15 +1,27 @@ // See https://github.com/apache/parquet-format/blob/master/Encodings.md#run-length-encoding--bit-packing-hybrid-rle--3 mod bitmap; +mod buffered; mod decoder; mod encoder; +mod translator; + +#[cfg(test)] +mod fuzz; pub use bitmap::{encode_bool as bitpacked_encode, BitmapIter}; +pub use buffered::BufferedBitpacked; pub use decoder::Decoder; pub use encoder::encode; use polars_utils::iter::FallibleIterator; use polars_utils::slice::GetSaferUnchecked; +pub use translator::{ + BinaryDictionaryTranslator, DictionaryTranslator, FnTranslator, Translator, UnitTranslator, +}; +use self::buffered::HybridRleBuffered; use super::{bitpacked, ceil8, uleb128}; +use crate::parquet::encoding::bitpacked::{Unpackable, Unpacked}; +use crate::parquet::encoding::hybrid_rle::buffered::BufferedRle; use crate::parquet::error::{ParquetError, ParquetResult}; /// The two possible states of an RLE-encoded run. 
@@ -22,95 +34,80 @@ pub enum HybridEncoded<'a> { Rle(&'a [u8], usize), } -/// A decoder for Hybrid-RLE encoded values +/// A [`Iterator`] for Hybrid Run-Length Encoding +/// +/// The hybrid here means that each second is prepended by a bit that differentiates between two +/// modes. +/// +/// 1. Run-Length Encoding in the shape of `[Number of Values, Value]` +/// 2. Bitpacking in the shape of `[Value 1 in n bits, Value 2 in n bits, ...]` +/// +/// Note, that this can iterate, but the set of `collect_*` and `translate_and_collect_*` methods +/// should be highly preferred as they are way more efficient and have better error handling. #[derive(Debug, Clone)] pub struct HybridRleDecoder<'a> { data: &'a [u8], num_bits: usize, num_values: usize, -} -/// A buffered [`Iterator`] of Hybrid-RLE encoded values -#[derive(Debug, Clone)] -pub struct BufferedHybridRleDecoderIter<'a> { - decoder: HybridRleDecoder<'a>, - - buffer: Vec, - buffer_index: usize, + buffered: Option>, + /// The result after iterating. + /// + /// This is only needed because we iterate over individual elements. result: Option, } -impl<'a> BufferedHybridRleDecoderIter<'a> { - // @NOTE: - // These were not taken with too much thought to be honest. It might be better to increase - // these because it allows for more buffering at the cost of utilizing more memory. 
- const BASE_CAPACITY: usize = 128; - const STOP_AT_SIZE: usize = 64; +impl<'a> FallibleIterator for HybridRleDecoder<'a> { + fn get_result(&mut self) -> Result<(), ParquetError> { + match self.result.take() { + None => Ok(()), + Some(err) => Err(err), + } + } } -impl<'a> Iterator for BufferedHybridRleDecoderIter<'a> { +impl<'a> Iterator for HybridRleDecoder<'a> { type Item = u32; fn next(&mut self) -> Option { - if self.buffer_index < self.buffer.len() { - let value = self.buffer[self.buffer_index]; - self.buffer_index += 1; - return Some(value); - } - - if self.decoder.num_values == 0 { + if self.num_values == 0 { return None; } - if self.decoder.num_bits == 0 { - self.decoder.num_values -= 1; + if self.num_bits == 0 { + self.num_values -= 1; return Some(0); } - self.buffer.clear(); - self.buffer_index = 1; - while self.buffer.len() < Self::STOP_AT_SIZE && self.decoder.num_values > 0 { - let result = self.decoder.collect_once(&mut self.buffer); - if let Err(err) = result { - self.result = Some(err); - return None; + if let Some(buffered) = self.buffered.as_mut() { + match buffered.next() { + None => self.buffered = None, + Some(value) => { + self.num_values -= 1; + return Some(value); + }, } } - self.buffer.first().copied() - } - - fn size_hint(&self) -> (usize, Option) { - let size = self.decoder.num_values + self.buffer.len() - self.buffer_index; - (size, Some(size)) - } -} + let mut buffer = Vec::with_capacity(1); + let result = self.translate_and_collect_limited_once(&mut buffer, Some(1), &UnitTranslator); -impl<'a> FallibleIterator for BufferedHybridRleDecoderIter<'a> { - fn get_result(&mut self) -> Result<(), ParquetError> { - match self.result.take() { - None => Ok(()), - Some(err) => Err(err), + match result { + Ok(_) => Some(buffer[0]), + Err(err) => { + self.result = Some(err); + None + }, } } -} -impl<'a> ExactSizeIterator for BufferedHybridRleDecoderIter<'a> {} - -impl<'a> IntoIterator for HybridRleDecoder<'a> { - type Item = u32; - type IntoIter = 
BufferedHybridRleDecoderIter<'a>; - - fn into_iter(self) -> Self::IntoIter { - BufferedHybridRleDecoderIter { - decoder: self, - buffer: Vec::with_capacity(BufferedHybridRleDecoderIter::BASE_CAPACITY), - buffer_index: 0, - result: None, - } + fn size_hint(&self) -> (usize, Option) { + (self.num_values, Some(self.num_values)) } } +impl<'a> ExactSizeIterator for HybridRleDecoder<'a> {} + impl<'a> HybridRleDecoder<'a> { /// Returns a new [`HybridRleDecoder`] pub fn new(data: &'a [u8], num_bits: u32, num_values: usize) -> Self { @@ -118,47 +115,49 @@ impl<'a> HybridRleDecoder<'a> { data, num_bits: num_bits as usize, num_values, - } - } - pub fn iter(&self) -> BufferedHybridRleDecoderIter<'a> { - BufferedHybridRleDecoderIter { - decoder: self.clone(), - buffer: Vec::with_capacity(BufferedHybridRleDecoderIter::BASE_CAPACITY), - buffer_index: 0, + buffered: None, result: None, } } - fn collect_once(&mut self, vec: &mut Vec) -> ParquetResult<()> { + /// Translate and collect at most `limit` items into `target`. + /// + /// This function expects `num_values > 0` and `num_bits > 0`. + fn translate_and_collect_limited_once( + &mut self, + target: &mut Vec, + limit: Option, + translator: &impl Translator, + ) -> ParquetResult { + if limit == Some(0) { + return Ok(0); + } + + let start_target_length = target.len(); + let start_num_values = self.num_values; + // @NOTE: // This is basically a collapsed version of the `decoder::Decoder`. Any change here // probably also applies there. In a microbenchmark this collapse did around 3x for this // specific piece of code, but I think this actually also makes the code more readable. - debug_assert!(self.num_values > 0); + debug_assert!(self.num_values > 0, "{:?}", target.len()); debug_assert!(self.num_bits > 0); let (indicator, consumed) = uleb128::decode(self.data); self.data = unsafe { self.data.get_unchecked_release(consumed..) }; if consumed == 0 { - // We don't step everything at once because that might allocate a lot at once. 
So, we - // do it in steps. This reasoning might not hold up 100% for just HybridRleDecoder but - // it does for BufferedHybridRleDecoderIter. - // - // @TODO: There might be a better solution for this. - - const MAX_STEP_SIZE: usize = 64; - - let step_size = usize::min(self.num_values, MAX_STEP_SIZE); - vec.resize(vec.len() + step_size, 0); + let step_size = + limit.map_or(self.num_values, |limit| usize::min(self.num_values, limit)); + target.resize(target.len() + step_size, translator.translate(0)?); self.num_values -= step_size; - return Ok(()); + return Ok(step_size); } - if indicator & 1 == 1 { + let num_processed = if indicator & 1 == 1 { // is bitpacking let bytes = (indicator as usize >> 1) * self.num_bits; let bytes = std::cmp::min(bytes, self.data.len()); @@ -167,8 +166,13 @@ impl<'a> HybridRleDecoder<'a> { let length = std::cmp::min(packed.len() * 8 / self.num_bits, self.num_values); let decoder = bitpacked::Decoder::::try_new(packed, self.num_bits, length)?; - decoder.collect_into(vec); - self.num_values -= length; + + let (num_processed, buffered) = + translator.translate_bitpacked_decoder(decoder, target, limit)?; + debug_assert!(limit.map_or(true, |limit| limit >= num_processed)); + self.buffered = buffered; + + num_processed } else { // is rle let run_length = indicator as usize >> 1; @@ -177,51 +181,248 @@ impl<'a> HybridRleDecoder<'a> { let (pack, remaining) = self.data.split_at(rle_bytes); self.data = remaining; - let mut bytes = [0u8; std::mem::size_of::()]; - pack.iter().zip(bytes.iter_mut()).for_each(|(src, dst)| { - *dst = *src; - }); - let value = u32::from_le_bytes(bytes); - vec.resize(vec.len() + run_length, value); - self.num_values -= run_length; - } + if run_length == 0 { + 0 + } else { + let mut bytes = [0u8; std::mem::size_of::()]; + pack.iter().zip(bytes.iter_mut()).for_each(|(src, dst)| { + *dst = *src; + }); + let value = u32::from_le_bytes(bytes); + + let num_elements = limit.map_or(run_length, |limit| usize::min(run_length, 
limit)); + + // Only translate once. Then, just do a memset. + let translated = translator.translate(value)?; + target.resize(target.len() + num_elements, translated); + + if let Some(limit) = limit { + if run_length > limit { + self.buffered = (run_length != limit).then_some({ + HybridRleBuffered::Rle(BufferedRle { + value, + length: run_length - num_elements, + }) + }); + } + } + + num_elements + } + }; - Ok(()) + self.num_values -= num_processed; + + debug_assert_eq!(num_processed, start_num_values - self.num_values); + debug_assert_eq!(num_processed, target.len() - start_target_length); + debug_assert!(limit.map_or(true, |limit| num_processed <= limit)); + + Ok(num_processed) } - #[inline] - pub fn collect_into(mut self, vec: &mut Vec) -> ParquetResult<()> { - // @NOTE: - // When microbenchmarking, this performs around 2x better than using an element-wise - // iterator. + pub fn translate_and_collect_into( + mut self, + target: &mut Vec, + translator: &impl Translator, + ) -> Result<(), ParquetError> { if self.num_values == 0 { return Ok(()); } if self.num_bits == 0 { - vec.resize(vec.len() + self.num_values, 0); + target.resize(target.len() + self.num_values, translator.translate(0)?); return Ok(()); } - vec.reserve(self.num_values); + target.reserve(self.num_values); + if let Some(buffered) = self.buffered.take() { + let num_buffered = buffered.translate_and_collect_into(target, translator)?; + self.num_values -= num_buffered; + } while self.num_values > 0 { - self.collect_once(vec)?; + self.translate_and_collect_limited_once(target, None, translator)?; } Ok(()) } + pub fn translate_and_collect_n_into( + &mut self, + target: &mut Vec, + n: usize, + translator: &impl Translator, + ) -> ParquetResult<()> { + if self.num_values == 0 || n == 0 { + return Ok(()); + } + + if self.num_bits == 0 { + target.resize(target.len() + n, translator.translate(0)?); + self.num_values -= n; + return Ok(()); + } + + let target_length = target.len() + n; + target.reserve(n); + + 
if let Some(buffered) = self.buffered.as_mut() { + let num_buffered = + buffered.translate_and_collect_limited_into(target, n, translator)?; + debug_assert!(num_buffered <= n); + self.num_values -= num_buffered; + + if num_buffered < n { + self.buffered = None; + } + } + + while target.len() < target_length && self.num_values > 0 { + self.translate_and_collect_limited_once( + target, + Some(target_length - target.len()), + translator, + )?; + } + + Ok(()) + } + + pub fn translate_and_collect( + self, + translator: &impl Translator, + ) -> ParquetResult> { + let mut vec = Vec::new(); + self.translate_and_collect_into(&mut vec, translator)?; + Ok(vec) + } + + pub fn collect_into(self, target: &mut Vec) -> Result<(), ParquetError> { + self.translate_and_collect_into(target, &UnitTranslator) + } + + pub fn collect_n_into(&mut self, target: &mut Vec, n: usize) -> ParquetResult<()> { + self.translate_and_collect_n_into(target, n, &UnitTranslator) + } + pub fn collect(self) -> ParquetResult> { let mut vec = Vec::new(); self.collect_into(&mut vec)?; Ok(vec) } + + pub fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + if self.num_values == 0 || n == 0 { + return Ok(()); + } + + if n >= self.num_values { + self.data = &[]; + self.num_values = 0; + self.buffered = None; + return Ok(()); + } + + if self.num_bits == 0 { + self.num_values -= n; + return Ok(()); + } + + let mut n = n; + if let Some(buffered) = self.buffered.as_mut() { + let num_skipped = buffered.skip_in_place(n); + + if num_skipped < n { + self.buffered = None; + } + + self.num_values -= num_skipped; + n -= num_skipped; + } + + while n > 0 && self.num_values > 0 { + let start_num_values = self.num_values; + + let (indicator, consumed) = uleb128::decode(self.data); + self.data = unsafe { self.data.get_unchecked_release(consumed..) 
}; + + let num_skipped = if consumed == 0 { + n + } else if indicator & 1 == 1 { + // is bitpacking + let bytes = (indicator as usize >> 1) * self.num_bits; + let bytes = std::cmp::min(bytes, self.data.len()); + let (packed, remaining) = self.data.split_at(bytes); + self.data = remaining; + + let length = std::cmp::min(packed.len() * 8 / self.num_bits, self.num_values); + let mut decoder = + bitpacked::Decoder::::try_new(packed, self.num_bits, length)?; + + // Skip the whole decoder if it is possible + if decoder.len() <= n { + decoder.len() + } else { + const CHUNK_SIZE: usize = ::Unpacked::LENGTH; + + let num_full_chunks = n / CHUNK_SIZE; + decoder.skip_chunks(num_full_chunks); + + let (unpacked, unpacked_length) = decoder.chunked().next_inexact().unwrap(); + let unpacked_offset = n % CHUNK_SIZE; + debug_assert!(unpacked_offset < unpacked_length); + + self.buffered = Some(HybridRleBuffered::Bitpacked(BufferedBitpacked { + unpacked, + + unpacked_start: unpacked_offset, + unpacked_end: unpacked_length, + decoder, + })); + + n + } + } else { + // is rle + let run_length = indicator as usize >> 1; + // repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width) + let rle_bytes = ceil8(self.num_bits); + let (pack, remaining) = self.data.split_at(rle_bytes); + self.data = remaining; + + // Skip the whole run-length encoded value if it is possible + if run_length <= n { + run_length + } else { + let mut bytes = [0u8; std::mem::size_of::()]; + pack.iter().zip(bytes.iter_mut()).for_each(|(src, dst)| { + *dst = *src; + }); + let value = u32::from_le_bytes(bytes); + + self.buffered = Some(HybridRleBuffered::Rle(BufferedRle { + value, + length: run_length - n, + })); + + n + } + }; + + n -= num_skipped; + self.num_values -= num_skipped; + + debug_assert_eq!(num_skipped, start_num_values - self.num_values); + debug_assert!(num_skipped <= n); + debug_assert!(num_skipped > 0); + } + + Ok(()) + } } #[cfg(test)] mod tests { - use super::*; 
#[test] @@ -318,10 +519,13 @@ mod tests { let num_bits = 10; let decoder = HybridRleDecoder::new(&data, num_bits, 1000); - let result = decoder.collect()?; + let mut decoder = HybridRleDecoder::new(&data, num_bits, 1000); + let iterator_result: Vec<_> = Iterator::collect(&mut decoder); + assert_eq!(result, (0..1000).collect::>()); + assert_eq!(iterator_result, (0..1000).collect::>()); Ok(()) } diff --git a/crates/polars-parquet/src/parquet/encoding/hybrid_rle/translator.rs b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/translator.rs new file mode 100644 index 000000000000..b36df14978cc --- /dev/null +++ b/crates/polars-parquet/src/parquet/encoding/hybrid_rle/translator.rs @@ -0,0 +1,297 @@ +use arrow::array::{BinaryArray, View}; + +use crate::parquet::encoding::bitpacked::{Decoder, Unpackable, Unpacked}; +use crate::parquet::encoding::hybrid_rle::{BufferedBitpacked, HybridRleBuffered}; +use crate::parquet::error::{ParquetError, ParquetResult}; + +/// A trait to describe a translation from a HybridRLE encoding to an another format. +/// +/// In essence, this is one method ([`Translator::translate`]) that maps an `u32` to the desired +/// output type `O`. There are several other methods that may provide optimized routines +/// for slices, chunks and decoders. +/// +/// # Motivation +/// +/// The [`HybridRleDecoder`] is used extensively during Parquet decoding because it is used for +/// Dremel decoding and dictionary decoding. We want to perform a transformation from this +/// space-efficient encoding to a buffer. Here, items might be skipped, might be mapped and only a +/// few items might be needed. There are 3 main ways to do this. +/// +/// 1. Element-by-element translation using iterator `map`, `filter`, `skip`, etc. This suffers +/// from the problem that is difficult to SIMD the translation and that a `collect` might need +/// to constantly poll the `next` function. Next to that monomorphization might need to generate +/// many, many variants. +/// 2. 
Buffer most everything, filter and translate later. This has high memory-consumption and +/// might suffer from cache-eviction problems. This is computationally the most efficient, but +/// probably still has a high runtime. Also, this fails to utilize run-length information and +/// needs to retranslate all repeated elements. +/// 3. Batched operations. Here, we try to utilize the run-length information and utilize SIMD to +/// process many bitpacked items. This can provide the best of both worlds. +/// +/// The [`HybridRleDecoder`][super::HybridRleDecoder] decoders utilizing both run-length encoding +/// and bitpacking. In both processes, this [`Translator`] trait allows for translation with (i) no +/// heap allocations and (ii) cheap buffering and can stop and start at any point. Consequently, +/// the memory consumption while doing these translations can be relatively low while still +/// processing items in batches. +/// +/// [`HybridRleDecoder`]: super::HybridRleDecoder +pub trait Translator { + /// Translate from a decoded value to the output format + fn translate(&self, value: u32) -> ParquetResult; + + /// Translate from a slice of decoded values to the output format and write them to a `target`. + /// + /// This can overwritten to be more optimized. + fn translate_slice(&self, target: &mut Vec, source: &[u32]) -> ParquetResult<()> { + target.reserve(source.len()); + for v in source { + target.push(self.translate(*v)?); + } + Ok(()) + } + + /// Translate from a chunk of unpacked items to the output format and write them to a `target`. + /// + /// This is the same as [`Translator::translate_slice`] but with a known slice size. This can + /// allow SIMD routines to better optimize the procedure. + /// + /// This can overwritten to be more optimized. 
+ fn translate_chunk( + &self, + target: &mut Vec, + source: &::Unpacked, + ) -> ParquetResult<()> { + self.translate_slice(target, &source[..]) + } + + /// Translate and collect all the items in a [`Decoder`] to a `target`. + /// + /// This can overwritten to be more optimized. + fn translate_bitpacked_all( + &self, + target: &mut Vec, + mut decoder: Decoder, + ) -> ParquetResult<()> { + target.reserve(decoder.len()); + + let mut chunked = decoder.chunked(); + + for unpacked in &mut chunked { + self.translate_chunk(target, &unpacked)?; + } + + if let Some((last, last_length)) = chunked.remainder() { + self.translate_slice(target, &last[..last_length])?; + } + + Ok(()) + } + + /// Translate and collect a limited number of items in a [`Decoder`] to a `target`. + /// + /// This can overwritten to be more optimized. + /// + /// # Panics + /// + /// This method panics when `limit` is larger than the `decoder` length. + fn translate_bitpacked_limited<'a>( + &self, + target: &mut Vec, + limit: usize, + mut decoder: Decoder<'a, u32>, + ) -> ParquetResult> { + assert!(limit < decoder.len()); + + const CHUNK_SIZE: usize = ::Unpacked::LENGTH; + + let mut chunked = decoder.chunked(); + + let num_full_chunks = limit / CHUNK_SIZE; + for unpacked in (&mut chunked).take(num_full_chunks) { + self.translate_chunk(target, &unpacked)?; + } + + let (unpacked, unpacked_length) = chunked.next_inexact().unwrap(); + let unpacked_offset = limit % CHUNK_SIZE; + debug_assert!(unpacked_offset < unpacked_length); + self.translate_slice(target, &unpacked[..unpacked_offset])?; + + Ok(BufferedBitpacked { + unpacked, + + unpacked_start: unpacked_offset, + unpacked_end: unpacked_length, + decoder, + }) + } + + /// Translate and collect items in a [`Decoder`] to a `target`. + /// + /// This can overwritten to be more optimized. 
+ fn translate_bitpacked_decoder<'a>( + &self, + decoder: Decoder<'a, u32>, + target: &mut Vec, + limit: Option, + ) -> ParquetResult<(usize, Option>)> { + let length = decoder.len(); + + match limit { + None => self + .translate_bitpacked_all(target, decoder) + .map(|_| (length, None)), + Some(limit) if limit >= length => self + .translate_bitpacked_all(target, decoder) + .map(|_| (length, None)), + Some(limit) => self + .translate_bitpacked_limited(target, limit, decoder) + .map(|b| (limit, Some(HybridRleBuffered::Bitpacked(b)))), + } + } +} + +/// This is a unit translation variant of [`Translator`]. This just maps all encoded values from a +/// [`HybridRleDecoder`] to themselves. +/// +/// [`HybridRleDecoder`]: super::HybridRleDecoder +pub struct UnitTranslator; + +impl Translator for UnitTranslator { + fn translate(&self, value: u32) -> ParquetResult { + Ok(value) + } + + fn translate_slice(&self, target: &mut Vec, source: &[u32]) -> ParquetResult<()> { + target.extend_from_slice(source); + Ok(()) + } + fn translate_chunk( + &self, + target: &mut Vec, + source: &::Unpacked, + ) -> ParquetResult<()> { + target.extend_from_slice(&source[..]); + Ok(()) + } + fn translate_bitpacked_all( + &self, + target: &mut Vec, + decoder: Decoder, + ) -> ParquetResult<()> { + decoder.collect_into(target); + Ok(()) + } +} + +/// This is a dictionary translation variant of [`Translator`]. +/// +/// All the [`HybridRleDecoder`] values are regarded as a offset into a dictionary. 
+/// +/// [`HybridRleDecoder`]: super::HybridRleDecoder +pub struct DictionaryTranslator<'a, T>(pub &'a [T]); + +impl<'a, T: Copy> Translator for DictionaryTranslator<'a, T> { + fn translate(&self, value: u32) -> ParquetResult { + self.0 + .get(value as usize) + .cloned() + .ok_or(ParquetError::oos("Dictionary index is out of range")) + } + + fn translate_slice(&self, target: &mut Vec, source: &[u32]) -> ParquetResult<()> { + let Some(source_max) = source.iter().copied().max() else { + return Ok(()); + }; + + if source_max as usize >= self.0.len() { + return Err(ParquetError::oos("Dictionary index is out of range")); + } + + // Safety: We have checked before that source only has indexes that are smaller than the + // dictionary length. + target.extend( + source + .iter() + .map(|&src_idx| unsafe { *self.0.get_unchecked(src_idx as usize) }), + ); + + Ok(()) + } + + fn translate_chunk( + &self, + target: &mut Vec, + source: &::Unpacked, + ) -> ParquetResult<()> { + let source_max: u32 = source.iter().copied().max().unwrap(); + + if source_max as usize >= self.0.len() { + return Err(ParquetError::oos("Dictionary index is out of range")); + } + + // Safety: We have checked before that source only has indexes that are smaller than the + // dictionary length. + target.extend( + source + .iter() + .map(|&src_idx| unsafe { *self.0.get_unchecked(src_idx as usize) }), + ); + + Ok(()) + } +} + +/// This is a binary dictionary translation variant of [`Translator`]. +/// +/// All the [`HybridRleDecoder`] values are regarded as a offset into a binary array regarded as a +/// dictionary. 
+/// +/// [`HybridRleDecoder`]: super::HybridRleDecoder +pub struct BinaryDictionaryTranslator<'a> { + pub dictionary: &'a BinaryArray, + pub buffer_idx: u32, +} + +impl<'a> Translator for BinaryDictionaryTranslator<'a> { + fn translate(&self, index: u32) -> ParquetResult { + if index as usize >= self.dictionary.len() { + return Err(ParquetError::oos("Dictionary index is out of range")); + } + + let value = self.dictionary.value(index as usize); + let (start, _) = self.dictionary.offsets().start_end(index as usize); + Ok(View::new_from_bytes(value, self.buffer_idx, start as u32)) + } + + fn translate_slice(&self, target: &mut Vec, source: &[u32]) -> ParquetResult<()> { + let Some(source_max) = source.iter().copied().max() else { + return Ok(()); + }; + + if source_max as usize >= self.dictionary.len() { + return Err(ParquetError::oos("Dictionary index is out of range")); + } + + let offsets = self.dictionary.offsets(); + + target.extend(source.iter().map(|&src_idx| { + // Safety: We have checked before that source only has indexes that are smaller than + // the dictionary length. 
+ let value = unsafe { self.dictionary.value_unchecked(src_idx as usize) }; + debug_assert!((src_idx as usize) < offsets.len_proxy()); + let (start, _) = unsafe { offsets.start_end_unchecked(src_idx as usize) }; + View::new_from_bytes(value, self.buffer_idx, start as u32) + })); + + Ok(()) + } +} + +/// A closure-based translator +pub struct FnTranslator ParquetResult>(pub F); + +impl ParquetResult> Translator for FnTranslator { + fn translate(&self, value: u32) -> ParquetResult { + (self.0)(value) + } +} diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs index 82afe6a0b40c..c82c7678ff85 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/mean.rs @@ -77,7 +77,7 @@ where let arr = values.chunks().get_unchecked(0); arr.sliced_unchecked(offset as usize, length as usize) }; - let dtype = K::PolarsType::get_dtype().to_arrow(true); + let dtype = K::PolarsType::get_dtype().to_arrow(CompatLevel::newest()); let arr = arrow::compute::cast::cast_unchecked(arr.as_ref(), &dtype).unwrap(); let arr = unsafe { arr.as_any() diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs index b256ca41720f..e58cc09d132a 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/sum.rs @@ -55,7 +55,7 @@ where let arr = values.chunks().get_unchecked(0); arr.sliced_unchecked(offset as usize, length as usize) }; - let dtype = K::PolarsType::get_dtype().to_arrow(true); + let dtype = K::PolarsType::get_dtype().to_arrow(CompatLevel::newest()); let arr = arrow::compute::cast::cast_unchecked(arr.as_ref(), &dtype).unwrap(); let arr = unsafe { arr.as_any() diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs 
b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs index ccfd390bcf62..84e504816daa 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/eval.rs @@ -82,7 +82,7 @@ impl Eval { _ => s.to_physical_repr().into_owned(), }; let s = prepare_key(&s, chunk); - keys_columns.push(s.to_arrow(0, true)); + keys_columns.push(s.to_arrow(0, CompatLevel::newest())); } polars_row::convert_columns_amortized( diff --git a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs index 3a1ca17a183a..2bb4f57b46a1 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/generic/hash_table.rs @@ -261,7 +261,7 @@ impl AggHashTable { .output_schema .iter_dtypes() .take(self.num_keys) - .map(|dtype| dtype.to_physical().to_arrow(true)) + .map(|dtype| dtype.to_physical().to_arrow(CompatLevel::newest())) .collect::>(); let fields = vec![Default::default(); self.num_keys]; let key_columns = diff --git a/crates/polars-pipe/src/executors/sinks/io.rs b/crates/polars-pipe/src/executors/sinks/io.rs index ac2b27717a75..cabf560dadb2 100644 --- a/crates/polars-pipe/src/executors/sinks/io.rs +++ b/crates/polars-pipe/src/executors/sinks/io.rs @@ -177,7 +177,7 @@ impl IOThread { path.push(format!("{count}.ipc")); let file = File::create(path).unwrap(); - let writer = IpcWriter::new(file).with_pl_flavor(true); + let writer = IpcWriter::new(file).with_compat_level(CompatLevel::newest()); let mut writer = writer.batched(&schema).unwrap(); writer.write_batch(&df).unwrap(); writer.finish().unwrap(); @@ -188,7 +188,7 @@ impl IOThread { path.push(format!("{count}_0_pass.ipc")); let file = File::create(path).unwrap(); - let writer = IpcWriter::new(file).with_pl_flavor(true); + let writer = IpcWriter::new(file).with_compat_level(CompatLevel::newest()); let 
mut writer = writer.batched(&schema).unwrap(); for mut df in iter { @@ -227,7 +227,7 @@ impl IOThread { path.push(format!("_{count}_full.ipc")); let file = File::create(path).unwrap(); - let mut writer = IpcWriter::new(file).with_pl_flavor(true); + let mut writer = IpcWriter::new(file).with_compat_level(CompatLevel::newest()); writer.finish(&mut df).unwrap(); } else { let iter = Box::new(std::iter::once(df)); @@ -260,7 +260,7 @@ impl IOThread { // duplicates path.push(format!("_{count}.ipc")); let file = File::create(path).unwrap(); - let writer = IpcWriter::new(file).with_pl_flavor(true); + let writer = IpcWriter::new(file).with_compat_level(CompatLevel::newest()); let mut writer = writer.batched(&self.schema).unwrap(); writer.write_batch(&df).unwrap(); writer.finish().unwrap(); diff --git a/crates/polars-pipe/src/executors/sinks/output/parquet.rs b/crates/polars-pipe/src/executors/sinks/output/parquet.rs index b3d64341d129..e591279bbc74 100644 --- a/crates/polars-pipe/src/executors/sinks/output/parquet.rs +++ b/crates/polars-pipe/src/executors/sinks/output/parquet.rs @@ -63,7 +63,7 @@ impl ParquetSink { let file = std::fs::File::create(path)?; let writer = ParquetWriter::new(file) .with_compression(options.compression) - .with_data_page_size(options.data_pagesize_limit) + .with_data_page_size(options.data_page_size) .with_statistics(options.statistics) .with_row_group_size(options.row_group_size) // This is important! Otherwise we will deadlock @@ -154,7 +154,7 @@ impl ParquetCloudSink { let cloud_writer = polars_io::cloud::CloudWriter::new(uri, cloud_options).await?; let writer = ParquetWriter::new(cloud_writer) .with_compression(parquet_options.compression) - .with_data_page_size(parquet_options.data_pagesize_limit) + .with_data_page_size(parquet_options.data_page_size) .with_statistics(parquet_options.statistics) .with_row_group_size(parquet_options.row_group_size) // This is important! 
Otherwise we will deadlock diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs index 1e7976afc431..c7256f084aeb 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink_multiple.rs @@ -275,7 +275,7 @@ impl Sink for SortSinkMultiple { let sort_dtypes = self.sort_dtypes.take().map(|arr| { arr.iter() - .map(|dt| dt.to_physical().to_arrow(true)) + .map(|dt| dt.to_physical().to_arrow(CompatLevel::newest())) .collect::>() }); diff --git a/crates/polars-plan/src/dsl/function_expr/plugin.rs b/crates/polars-plan/src/dsl/function_expr/plugin.rs index 83c429e738c3..5ce4875fe68d 100644 --- a/crates/polars-plan/src/dsl/function_expr/plugin.rs +++ b/crates/polars-plan/src/dsl/function_expr/plugin.rs @@ -130,7 +130,7 @@ pub(super) unsafe fn plugin_field( // we deallocate the fields buffer let ffi_fields = fields .iter() - .map(|field| arrow::ffi::export_field_to_c(&field.to_arrow(true))) + .map(|field| arrow::ffi::export_field_to_c(&field.to_arrow(CompatLevel::newest()))) .collect::>() .into_boxed_slice(); let n_args = ffi_fields.len(); diff --git a/crates/polars-plan/src/dsl/function_expr/struct_.rs b/crates/polars-plan/src/dsl/function_expr/struct_.rs index d0c6ddb0a223..22fe7092b4f9 100644 --- a/crates/polars-plan/src/dsl/function_expr/struct_.rs +++ b/crates/polars-plan/src/dsl/function_expr/struct_.rs @@ -211,7 +211,7 @@ pub(super) fn suffix_fields(s: &Series, suffix: Arc) -> PolarsResult PolarsResult { let ca = s.struct_()?; - let dtype = ca.dtype().to_arrow(true); + let dtype = ca.dtype().to_arrow(CompatLevel::newest()); let iter = ca.chunks().iter().map(|arr| { let arr = arrow::compute::cast::cast_unchecked(arr.as_ref(), &dtype).unwrap(); diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index a539f13e183e..aec6952cb057 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ 
b/crates/polars-plan/src/dsl/string.rs @@ -573,8 +573,9 @@ impl StringNameSpace { #[cfg(feature = "extract_jsonpath")] pub fn json_decode(self, dtype: Option, infer_schema_len: Option) -> Expr { + // Apply, because dtype should be inferred only once and be consistent over chunks/morsels. self.0 - .map_private(FunctionExpr::StringExpr(StringFunction::JsonDecode { + .apply_private(FunctionExpr::StringExpr(StringFunction::JsonDecode { dtype, infer_schema_len, })) diff --git a/crates/polars-plan/src/dsl/struct_.rs b/crates/polars-plan/src/dsl/struct_.rs index e5810f967298..d609547c1ff7 100644 --- a/crates/polars-plan/src/dsl/struct_.rs +++ b/crates/polars-plan/src/dsl/struct_.rs @@ -68,35 +68,41 @@ impl StructNameSpace { .map_private(FunctionExpr::StructExpr(StructFunction::JsonEncode)) } - pub fn with_fields(self, fields: Vec) -> Expr { - fn materialize_field(this: &Expr, field: Expr) -> Expr { - field.map_expr(|e| match e { + pub fn with_fields(self, fields: Vec) -> PolarsResult { + fn materialize_field(this: &Expr, field: Expr) -> PolarsResult { + field.try_map_expr(|e| match e { Expr::Field(names) => { let this = this.clone().struct_(); - if names.len() == 1 { + Ok(if names.len() == 1 { this.field_by_name(names[0].as_ref()) } else { this.field_by_names_impl(names) - } + }) }, - _ => e, + Expr::Exclude(_, _) => { + polars_bail!(InvalidOperation: "'exclude' not allowed in 'field'") + }, + _ => Ok(e), }) } let mut new_fields = Vec::with_capacity(fields.len()); new_fields.push(Default::default()); - new_fields.extend(fields.into_iter().map(|e| materialize_field(&self.0, e))); + for e in fields.into_iter().map(|e| materialize_field(&self.0, e)) { + new_fields.push(e?) 
+ } new_fields[0] = self.0; - Expr::Function { + Ok(Expr::Function { input: new_fields, function: FunctionExpr::StructExpr(StructFunction::WithFields), options: FunctionOptions { collect_groups: ApplyOptions::ElementWise, pass_name_to_apply: true, allow_group_aware: false, + input_wildcard_expansion: true, ..Default::default() }, - } + }) } } diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 297c5cc554ee..fd8a58d79e53 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -115,6 +115,7 @@ impl DslBuilder { } #[cfg(feature = "ipc")] + #[allow(clippy::too_many_arguments)] pub fn scan_ipc>>( paths: P, options: IpcScanOptions, @@ -123,6 +124,7 @@ impl DslBuilder { row_index: Option, rechunk: bool, cloud_options: Option, + hive_options: HiveOptions, ) -> PolarsResult { let paths = paths.into(); @@ -137,11 +139,7 @@ impl DslBuilder { rechunk, row_index, file_counter: Default::default(), - // TODO: Support Hive partitioning. - hive_options: HiveOptions { - enabled: Some(false), - ..Default::default() - }, + hive_options, }, predicate: None, scan_type: FileScan::Ipc { diff --git a/crates/polars-plan/src/plans/hive.rs b/crates/polars-plan/src/plans/hive.rs index adc7b2256c7f..33f2b0fff593 100644 --- a/crates/polars-plan/src/plans/hive.rs +++ b/crates/polars-plan/src/plans/hive.rs @@ -1,7 +1,5 @@ use std::path::{Path, PathBuf}; -use percent_encoding::percent_decode; -use polars_core::error::to_compute_err; use polars_core::prelude::*; use polars_io::predicates::{BatchStats, ColumnStats}; use polars_io::prelude::schema_inference::{finish_infer_field_schema, infer_field_schema}; @@ -68,19 +66,6 @@ pub fn hive_partitions_from_paths( reader_schema: &Schema, try_parse_dates: bool, ) -> PolarsResult>> { - let paths = paths - .iter() - .map(|x| { - Ok(PathBuf::from( - percent_decode(x.to_str().unwrap().as_bytes()) - .decode_utf8() - .map_err(to_compute_err)? 
- .as_ref(), - )) - }) - .collect::>>()?; - let paths = paths.as_slice(); - let Some(path) = paths.first() else { return Ok(None); }; @@ -88,6 +73,15 @@ pub fn hive_partitions_from_paths( let sep = separator(path); let path_string = path.to_str().unwrap(); + fn parse_hive_string_and_decode(part: &'_ str) -> Option<(&'_ str, std::borrow::Cow<'_, str>)> { + let (k, v) = parse_hive_string(part)?; + let v = percent_encoding::percent_decode(v.as_bytes()) + .decode_utf8() + .ok()?; + + Some((k, v)) + } + macro_rules! get_hive_parts_iter { ($e:expr) => {{ let path_parts = $e[hive_start_idx..].split(sep); @@ -97,7 +91,8 @@ pub fn hive_partitions_from_paths( if index == file_index { return None; } - parse_hive_string(part) + + parse_hive_string_and_decode(part) }) }}; } @@ -158,7 +153,7 @@ pub fn hive_partitions_from_paths( continue; } - entry.insert(infer_field_schema(value, try_parse_dates, false)); + entry.insert(infer_field_schema(value.as_ref(), try_parse_dates, false)); } } @@ -264,7 +259,7 @@ fn parse_hive_string(part: &'_ str) -> Option<(&'_ str, &'_ str)> { // Files are not Hive partitions, so globs are not valid. if value.contains('*') { return None; - } + }; Some((name, value)) } diff --git a/crates/polars-plan/src/plans/optimizer/cache_states.rs b/crates/polars-plan/src/plans/optimizer/cache_states.rs index 53c546613661..9dd51f067935 100644 --- a/crates/polars-plan/src/plans/optimizer/cache_states.rs +++ b/crates/polars-plan/src/plans/optimizer/cache_states.rs @@ -170,17 +170,37 @@ pub(super) fn set_cache_states( match lp { // don't allow parallelism as caches need each others work // also self-referencing plans can deadlock on the files they lock - Join { options, .. } if options.allow_parallel => { - if let Join { options, .. } = lp_arena.get_mut(frame.current) { - let options = Arc::make_mut(options); - options.allow_parallel = false; + Join { + options, + input_left, + input_right, + .. 
+ } if options.allow_parallel => { + let has_cache_in_children = [*input_left, *input_right].iter().any(|node| { + (&*lp_arena) + .iter(*node) + .any(|(_, ir)| matches!(ir, IR::Cache { .. })) + }); + if has_cache_in_children { + if let Join { options, .. } = lp_arena.get_mut(frame.current) { + let options = Arc::make_mut(options); + options.allow_parallel = false; + } } }, // don't allow parallelism as caches need each others work // also self-referencing plans can deadlock on the files they lock - Union { options, .. } if options.parallel => { - if let Union { options, .. } = lp_arena.get_mut(frame.current) { - options.parallel = false; + Union { options, inputs } if options.parallel => { + // Only toggle if children have a cache, otherwise we loose potential parallelism for nothing. + let has_cache_in_children = inputs.iter().any(|node| { + (&*lp_arena) + .iter(*node) + .any(|(_, ir)| matches!(ir, IR::Cache { .. })) + }); + if has_cache_in_children { + if let Union { options, .. } = lp_arena.get_mut(frame.current) { + options.parallel = false; + } } }, Cache { input, id, .. 
} => { diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs index e7bf461a76f6..6f635f1e354e 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/joins.rs @@ -143,7 +143,7 @@ pub(super) fn process_asof_join( true } else { let name = column_node_to_name(proj, expr_arena); - !local_projected_names.contains(&name) + !local_projected_names.contains(name) }; process_projection( @@ -327,7 +327,7 @@ pub(super) fn process_join( true } else { let name = column_node_to_name(proj, expr_arena); - !local_projected_names.contains(&name) + !local_projected_names.contains(name) }; process_projection( @@ -414,7 +414,7 @@ fn process_projection( // this branch tries to pushdown the column without suffix { // Column name of the projection without any alias. - let leaf_column_name = column_node_to_name(proj, expr_arena); + let leaf_column_name = column_node_to_name(proj, expr_arena).clone(); let suffix = options.args.suffix(); // If _right suffix exists we need to push a projection down without this @@ -479,14 +479,14 @@ fn resolve_join_suffixes( let projections = local_projection .iter() .map(|proj| { - let name = column_node_to_name(*proj, expr_arena); + let name = column_node_to_name(*proj, expr_arena).clone(); if name.ends_with(suffix) && schema_after_join.get(&name).is_none() { let downstream_name = &name.as_ref()[..name.len() - suffix.len()]; let col = AExpr::Column(ColumnName::from(downstream_name)); let node = expr_arena.add(col); - ExprIR::new(node, OutputName::Alias(name)) + ExprIR::new(node, OutputName::Alias(name.clone())) } else { - ExprIR::new(proj.0, OutputName::ColumnLhs(name)) + ExprIR::new(proj.0, OutputName::ColumnLhs(name.clone())) } }) .collect::>(); diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs 
b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs index a73b5abf6807..d6e1e910aa83 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs @@ -47,12 +47,12 @@ fn get_scan_columns( // we shouldn't project the row-count column, as that is generated // in the scan let push = match row_index { - Some(rc) if name != rc.name => true, + Some(rc) if (*name).as_ref() != rc.name.as_ref() => true, None => true, _ => false, }; if push { - columns.push((*name).to_owned()) + columns.push((**name).to_owned()) } } with_columns = Some(Arc::from(columns)); @@ -83,7 +83,7 @@ fn split_acc_projections( .partition(|expr| check_input_column_node(*expr, down_schema, expr_arena)); let mut names = init_set(); for proj in &acc_projections { - let name = column_node_to_name(*proj, expr_arena); + let name = column_node_to_name(*proj, expr_arena).clone(); names.insert(name); } (acc_projections, local_projections, names) @@ -98,7 +98,7 @@ fn add_expr_to_accumulated( expr_arena: &Arena, ) { for root_node in aexpr_to_column_nodes_iter(expr, expr_arena) { - let name = column_node_to_name(root_node, expr_arena); + let name = column_node_to_name(root_node, expr_arena).clone(); if projected_names.insert(name) { acc_projections.push(root_node) } @@ -128,7 +128,7 @@ fn update_scan_schema( let mut new_cols = Vec::with_capacity(acc_projections.len()); for node in acc_projections.iter() { let name = column_node_to_name(*node, expr_arena); - let item = schema.try_get_full(&name)?; + let item = schema.try_get_full(name)?; new_cols.push(item); } // make sure that the projections are sorted by the schema. 
@@ -227,8 +227,8 @@ impl ProjectionPushDown { let mut already_projected = false; let name = column_node_to_name(proj, expr_arena); - let is_in_left = names_left.contains(&name); - let is_in_right = names_right.contains(&name); + let is_in_left = names_left.contains(name); + let is_in_right = names_right.contains(name); already_projected |= is_in_left; already_projected |= is_in_right; diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs index 7b2032f4843c..3f0a39d05a7b 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/rename.rs @@ -33,46 +33,29 @@ pub(super) fn process_rename( new: &[SmartString], swapping: bool, ) -> PolarsResult<()> { - let mut processed = BTreeSet::new(); if swapping { - // We clone otherwise we update a data structure whilst we rename it. - let mut new_projected_names = projected_names.clone(); - for (existing, new) in existing.iter().zip(new.iter()) { - let has_existing = projected_names.contains(existing.as_str()); - // Only if the new column name is projected by the upper node we must update the name. 
- let has_new = projected_names.contains(new.as_str()); - let has_both = has_existing && has_new; + let reverse_map: PlHashMap<_, _> = new + .iter() + .map(|s| s.as_str()) + .zip(existing.iter().map(|s| s.as_str())) + .collect(); + let mut new_projected_names = PlHashSet::with_capacity(projected_names.len()); + + for col in acc_projections { + let name = column_node_to_name(*col, expr_arena); - if has_new { - // swapping path - // this must leave projected names intact, as we only swap - if has_both { - iter_and_update_nodes( - existing, - new, - acc_projections, - expr_arena, - &mut processed, - ); - } - // simple new name path - // this must add and remove names - else { - new_projected_names.remove(new.as_str()); - let name = ColumnName::from(existing.as_str()); - new_projected_names.insert(name); - iter_and_update_nodes( - existing, - new, - acc_projections, - expr_arena, - &mut processed, - ); - } + if let Some(previous) = reverse_map.get(name.as_ref()) { + let previous: Arc = Arc::from(*previous); + let new = expr_arena.add(AExpr::Column(previous.clone())); + *col = ColumnNode(new); + let _ = new_projected_names.insert(previous); + } else { + let _ = new_projected_names.insert(name.clone()); } } *projected_names = new_projected_names; } else { + let mut processed = BTreeSet::new(); for (existing, new) in existing.iter().zip(new.iter()) { if projected_names.remove(new.as_str()) { let name: Arc = ColumnName::from(existing.as_str()); diff --git a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs index b804e50692ea..3eaa1aa14134 100644 --- a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs @@ -206,6 +206,16 @@ impl SlicePushDown { scan_type }; + Ok(lp) + }, + (DataFrameScan {df, schema, output_schema, filter, }, Some(state)) if filter.is_none() => { + let df = df.slice(state.offset, state.len as usize); + let lp = 
DataFrameScan { + df: Arc::new(df), + schema, + output_schema, + filter + }; Ok(lp) } (Union {mut inputs, mut options }, Some(state)) => { diff --git a/crates/polars-plan/src/plans/schema.rs b/crates/polars-plan/src/plans/schema.rs index 82b3a17fd3f5..9b0d0d52eb0a 100644 --- a/crates/polars-plan/src/plans/schema.rs +++ b/crates/polars-plan/src/plans/schema.rs @@ -244,6 +244,52 @@ pub(crate) fn det_join_schema( // the schema will never change. #[cfg(feature = "semi_anti_join")] JoinType::Semi | JoinType::Anti => Ok(schema_left.clone()), + JoinType::Right => { + // Get join names. + let mut arena = Arena::with_capacity(8); + let mut join_on_left: PlHashSet<_> = PlHashSet::with_capacity(left_on.len()); + for e in left_on { + let field = e.to_field_amortized(schema_right, Context::Default, &mut arena)?; + join_on_left.insert(field.name); + } + + let mut join_on_right: PlHashSet<_> = PlHashSet::with_capacity(right_on.len()); + for e in right_on { + let field = e.to_field_amortized(schema_right, Context::Default, &mut arena)?; + join_on_right.insert(field.name); + } + + // init + let mut new_schema = Schema::with_capacity(schema_left.len() + schema_right.len()); + let should_coalesce = options.args.should_coalesce(); + + // Prepare left table schema + if !should_coalesce { + for (name, dtype) in schema_left.iter() { + new_schema.with_column(name.clone(), dtype.clone()); + } + } else { + for (name, dtype) in schema_left.iter() { + if !join_on_left.contains(name) { + new_schema.with_column(name.clone(), dtype.clone()); + } + } + } + + // Prepare right table schema + for (name, dtype) in schema_right.iter() { + { + let left_is_removed = join_on_left.contains(name.as_str()) && should_coalesce; + if schema_left.contains(name.as_str()) && !left_is_removed { + let new_name = format_smartstring!("{}{}", name, options.args.suffix()); + new_schema.with_column(new_name, dtype.clone()); + } else { + new_schema.with_column(name.clone(), dtype.clone()); + } + } + } + 
Ok(Arc::new(new_schema)) + }, _how => { let mut new_schema = Schema::with_capacity(schema_left.len() + schema_right.len()); @@ -252,17 +298,10 @@ pub(crate) fn det_join_schema( } let should_coalesce = options.args.should_coalesce(); - // make sure that expression are assigned to the schema - // an expression can have an alias, and change a dtype. - // we only do this for the left hand side as the right hand side - // is dropped. let mut arena = Arena::with_capacity(8); - for e in left_on { - let field = e.to_field_amortized(schema_left, Context::Default, &mut arena)?; - new_schema.with_column(field.name, field.dtype); - arena.clear(); - } - // Except in asof joins. Asof joins are not equi-joins + + // Handles coalescing of asof-joins. + // Asof joins are not equi-joins // so the columns that are joined on, may have different // values so if the right has a different name, it is added to the schema #[cfg(feature = "asof_join")] @@ -284,7 +323,6 @@ pub(crate) fn det_join_schema( } } } - let mut join_on_right: PlHashSet<_> = PlHashSet::with_capacity(right_on.len()); for e in right_on { let field = e.to_field_amortized(schema_right, Context::Default, &mut arena)?; diff --git a/crates/polars-plan/src/utils.rs b/crates/polars-plan/src/utils.rs index 3620efabd822..5ef5168fda18 100644 --- a/crates/polars-plan/src/utils.rs +++ b/crates/polars-plan/src/utils.rs @@ -253,9 +253,9 @@ pub(crate) fn aexpr_to_column_nodes_iter<'a>( }) } -pub fn column_node_to_name(node: ColumnNode, arena: &Arena) -> Arc { +pub fn column_node_to_name(node: ColumnNode, arena: &Arena) -> &Arc { if let AExpr::Column(name) = arena.get(node.0) { - name.clone() + name } else { unreachable!() } diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index 03b963b54559..a8741189f7dd 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -20,6 +20,7 @@ rand = { workspace = true } rayon = { workspace = true } recursive = { workspace = true } slotmap = { 
workspace = true } +tokio = { workspace = true } polars-core = { workspace = true } polars-error = { workspace = true } diff --git a/crates/polars-stream/src/async_executor/mod.rs b/crates/polars-stream/src/async_executor/mod.rs index 7eced5a76965..b32454da8813 100644 --- a/crates/polars-stream/src/async_executor/mod.rs +++ b/crates/polars-stream/src/async_executor/mod.rs @@ -34,9 +34,16 @@ slotmap::new_key_type! { struct TaskKey; } +/// High priority tasks are scheduled preferentially over low priority tasks. +#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum TaskPriority { + Low, + High, +} + /// Metadata associated with a task to help schedule it and clean it up. struct TaskMetadata { - priority: bool, + priority: TaskPriority, task_key: TaskKey, completed_tasks: Weak>>, @@ -80,7 +87,7 @@ impl Executor { // SAFETY: this slot may only be accessed from the local thread, which we are. let slot = unsafe { &mut *ttl.local_slot.get() }; - if priority { + if priority == TaskPriority::High { // Insert new task into thread local slot, taking out the old task. let Some(task) = slot.replace(task) else { // We pushed a task into our local slot which was empty. Since @@ -102,7 +109,7 @@ impl Executor { } } else { // Scheduled from an unknown thread, add to global queue. 
- if priority { + if priority == TaskPriority::High { self.global_high_prio_task_queue.push(task); } else { self.global_low_prio_task_queue.push(task); @@ -257,7 +264,7 @@ impl<'scope, 'env> TaskScope<'scope, 'env> { pub fn spawn_task( &self, - priority: bool, + priority: TaskPriority, fut: F, ) -> JoinHandle where diff --git a/crates/polars-stream/src/execute.rs b/crates/polars-stream/src/execute.rs index 22d2ddf99d15..b1d87b62c618 100644 --- a/crates/polars-stream/src/execute.rs +++ b/crates/polars-stream/src/execute.rs @@ -2,7 +2,7 @@ use polars_core::frame::DataFrame; use polars_core::POOL; use polars_error::PolarsResult; use polars_expr::state::ExecutionState; -use polars_utils::aliases::{InitHashMaps, PlHashSet}; +use polars_utils::aliases::PlHashSet; use slotmap::{SecondaryMap, SparseSecondaryMap}; use crate::async_executor; @@ -10,21 +10,26 @@ use crate::async_primitives::pipe::{pipe, Receiver, Sender}; use crate::graph::{Graph, GraphNodeKey, LogicalPipeKey, PortState}; use crate::morsel::Morsel; -/// Finds all pipeline blockers in the graph, that is, those nodes which do not -/// have ready outputs but do have ready inputs. -fn find_pipeline_blockers(graph: &Graph) -> Vec { +/// Finds all runnable pipeline blockers in the graph, that is, nodes which: +/// - Only have blocked output ports. +/// - Have at least one ready input port connected to a ready output port. +fn find_runnable_pipeline_blockers(graph: &Graph) -> Vec { let mut blockers = Vec::new(); for (node_key, node) in graph.nodes.iter() { // TODO: how does the multiplexer fit into this? 
- let no_output_ready = node + let only_has_blocked_outputs = node .outputs .iter() - .all(|o| graph.pipes[*o].send_state != PortState::Ready); - let has_input_ready = node - .inputs - .iter() - .any(|o| graph.pipes[*o].send_state == PortState::Ready); - if no_output_ready && has_input_ready { + .all(|o| graph.pipes[*o].send_state == PortState::Blocked); + if !only_has_blocked_outputs { + continue; + } + + let has_input_ready = node.inputs.iter().any(|i| { + graph.pipes[*i].send_state == PortState::Ready + && graph.pipes[*i].recv_state == PortState::Ready + }); + if has_input_ready { blockers.push(node_key); } } @@ -63,16 +68,27 @@ fn expand_ready_subgraph( fn find_runnable_subgraph(graph: &mut Graph) -> (PlHashSet, Vec) { // Find pipeline blockers, choose a subset with at most one memory intensive // pipeline blocker, and return the subgraph needed to feed them. - let blockers = find_pipeline_blockers(graph); - let (expensive, cheap): (Vec<_>, Vec<_>) = blockers.into_iter().partition(|b| { + let blockers = find_runnable_pipeline_blockers(graph); + let (mut expensive, cheap): (Vec<_>, Vec<_>) = blockers.into_iter().partition(|b| { graph.nodes[*b] .compute .is_memory_intensive_pipeline_blocker() }); + // TODO: choose which expensive pipeline blocker to run more intelligently. + expensive.sort_by_key(|node_key| { + // Prefer to run nodes whose outputs are ready to be consumed. + let outputs_ready_to_receive = graph.nodes[*node_key] + .outputs + .iter() + .filter(|o| graph.pipes[**o].recv_state == PortState::Ready) + .count(); + outputs_ready_to_receive + }); + let mut to_run = cheap; - if let Some(node) = expensive.into_iter().next() { - to_run.push(node); // TODO: choose which expensive pipeline blocker to run intelligently. 
+ if let Some(node) = expensive.pop() { + to_run.push(node); } expand_ready_subgraph(graph, to_run) } @@ -82,12 +98,8 @@ fn run_subgraph( graph: &mut Graph, nodes: &PlHashSet, pipes: &[LogicalPipeKey], - finalize_output: &mut SparseSecondaryMap, + num_pipelines: usize, ) -> PolarsResult<()> { - // Get the number of threads from the rayon thread-pool as that respects our config. - let num_pipes = POOL.current_num_threads(); - async_executor::set_num_threads(num_pipes); - // Construct pipes. let mut physical_senders = SecondaryMap::new(); let mut physical_receivers = SecondaryMap::new(); @@ -96,7 +108,7 @@ fn run_subgraph( // The first step is to create N physical pipes for every logical pipe in the graph. for pipe_key in pipes.iter().copied() { let (senders, receivers): (Vec>, Vec>) = - (0..num_pipes).map(|_| pipe()).unzip(); + (0..num_pipelines).map(|_| pipe()).unzip(); physical_senders.insert(pipe_key, senders); physical_receivers.insert(pipe_key, receivers); @@ -117,12 +129,11 @@ fn run_subgraph( if !nodes.contains(&node_key) { continue; } - node.compute.initialize(num_pipes); // Scatter inputs/outputs per pipeline. let num_inputs = node.inputs.len(); let num_outputs = node.outputs.len(); - phys_recv.resize_with(num_inputs * num_pipes, || None); + phys_recv.resize_with(num_inputs * num_pipelines, || None); for (input_idx, input) in node.inputs.iter().copied().enumerate() { if let Some(receivers) = physical_receivers.remove(input) { for (recv_idx, recv) in receivers.into_iter().enumerate() { @@ -131,7 +142,7 @@ fn run_subgraph( } } - phys_send.resize_with(num_outputs * num_pipes, || None); + phys_send.resize_with(num_outputs * num_pipelines, || None); for (output_idx, output) in node.outputs.iter().copied().enumerate() { if let Some(senders) = physical_senders.remove(output) { for (send_idx, send) in senders.into_iter().enumerate() { @@ -140,8 +151,13 @@ fn run_subgraph( } } + // Spawn the global task, if any. 
+ if let Some(handle) = node.compute.spawn_global(scope, &execution_state) { + join_handles.push(handle); + } + // Spawn a task per pipeline. - for pipeline in 0..num_pipes { + for pipeline in 0..num_pipelines { join_handles.push(node.compute.spawn( scope, pipeline, @@ -161,31 +177,52 @@ fn run_subgraph( }) })?; - // Finalize computation and get any in-memory results. - for node_key in nodes.iter().copied() { - if let Some(df) = graph.nodes[node_key].compute.finalize()? { - finalize_output.insert(node_key, df); - } - } - Ok(()) } pub fn execute_graph( graph: &mut Graph, ) -> PolarsResult> { - let mut out = SparseSecondaryMap::new(); + // Get the number of threads from the rayon thread-pool as that respects our config. + let num_pipelines = POOL.current_num_threads(); + async_executor::set_num_threads(num_pipelines); + + for node in graph.nodes.values_mut() { + node.compute.initialize(num_pipelines); + } + loop { - // println!("updating state"); + if polars_core::config::verbose() { + eprintln!("polars-stream: updating graph state"); + } graph.update_all_states(); let (nodes, pipes) = find_runnable_subgraph(graph); - // for node in &nodes { - // println!("running {}", graph.nodes[*node].compute.name()); - // } + if polars_core::config::verbose() { + for node in &nodes { + eprintln!( + "polars-stream: running {} in subgraph", + graph.nodes[*node].compute.name() + ); + } + } if nodes.is_empty() { break; } - run_subgraph(graph, &nodes, &pipes, &mut out)?; + run_subgraph(graph, &nodes, &pipes, num_pipelines)?; } + + // Ensure everything is done. + for pipe in graph.pipes.values() { + assert!(pipe.send_state == PortState::Done && pipe.recv_state == PortState::Done); + } + + // Extract output from in-memory nodes. + let mut out = SparseSecondaryMap::new(); + for (node_key, node) in graph.nodes.iter_mut() { + if let Some(df) = node.compute.get_output()? 
{ + out.insert(node_key, df); + } + } + Ok(out) } diff --git a/crates/polars-stream/src/graph.rs b/crates/polars-stream/src/graph.rs index c9ab0c78cf46..055d8df4a5ae 100644 --- a/crates/polars-stream/src/graph.rs +++ b/crates/polars-stream/src/graph.rs @@ -82,14 +82,16 @@ impl Graph { send_state.extend(node.outputs.iter().map(|o| self.pipes[*o].recv_state)); // Compute the new state of this node given its environment. - // println!("updating {}, before: {recv_state:?} {send_state:?}", node.compute.name()); + // eprintln!("updating {}, before: {recv_state:?} {send_state:?}", node.compute.name()); node.compute.update_state(&mut recv_state, &mut send_state); - // println!("updating {}, after: {recv_state:?} {send_state:?}", node.compute.name()); + // eprintln!("updating {}, after: {recv_state:?} {send_state:?}", node.compute.name()); // Propagate information. for (input, state) in node.inputs.iter().zip(recv_state.iter()) { let pipe = &mut self.pipes[*input]; if pipe.recv_state != *state { + // eprintln!("transitioning input pipe from {:?} to {state:?}", pipe.recv_state); + assert!(pipe.recv_state != PortState::Done, "implementation error: state transition from Done to Blocked/Ready attempted"); pipe.recv_state = *state; if scheduled_for_update.insert(pipe.sender, ()).is_none() { to_update.push(pipe.sender); @@ -100,6 +102,8 @@ impl Graph { for (output, state) in node.outputs.iter().zip(send_state.iter()) { let pipe = &mut self.pipes[*output]; if pipe.send_state != *state { + // eprintln!("transitioning output pipe from {:?} to {state:?}", pipe.send_state); + assert!(pipe.send_state != PortState::Done, "implementation error: state transition from Done to Blocked/Ready attempted"); pipe.send_state = *state; if scheduled_for_update.insert(pipe.receiver, ()).is_none() { to_update.push(pipe.receiver); @@ -136,7 +140,7 @@ pub struct LogicalPipe { pub recv_state: PortState, } -#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[derive(Copy, Clone, PartialEq, Eq, Debug, 
PartialOrd, Ord)] pub enum PortState { Blocked, Ready, diff --git a/crates/polars-stream/src/lib.rs b/crates/polars-stream/src/lib.rs index f3443e876ba3..768848b761b6 100644 --- a/crates/polars-stream/src/lib.rs +++ b/crates/polars-stream/src/lib.rs @@ -13,3 +13,7 @@ mod morsel; mod nodes; mod physical_plan; mod utils; + +// TODO: experiment with these, and make them configurable through environment variables. +const DEFAULT_LINEARIZER_BUFFER_SIZE: usize = 4; +const DEFAULT_DISTRIBUTOR_BUFFER_SIZE: usize = 4; diff --git a/crates/polars-stream/src/morsel.rs b/crates/polars-stream/src/morsel.rs index 6e0570128894..ced521435d95 100644 --- a/crates/polars-stream/src/morsel.rs +++ b/crates/polars-stream/src/morsel.rs @@ -14,7 +14,7 @@ pub fn get_ideal_morsel_size() -> usize { }) } -#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug, Default)] pub struct MorselSeq(u64); impl MorselSeq { @@ -31,6 +31,11 @@ impl MorselSeq { Self(self.0.checked_add(2).unwrap()) } + // Ensures this morsel sequence comes after the offset. 
+ pub fn offset_by(self, offset: Self) -> Self { + Self(self.0 + offset.0) + } + pub fn to_u64(self) -> u64 { self.0 } @@ -69,6 +74,10 @@ impl Morsel { self.seq } + pub fn set_seq(&mut self, seq: MorselSeq) { + self.seq = seq; + } + pub fn map DataFrame>(mut self, f: F) -> Self { self.df = f(self.df); self diff --git a/crates/polars-stream/src/nodes/filter.rs b/crates/polars-stream/src/nodes/filter.rs index 396babb8af97..60a06e8fd782 100644 --- a/crates/polars-stream/src/nodes/filter.rs +++ b/crates/polars-stream/src/nodes/filter.rs @@ -1,13 +1,9 @@ use std::sync::Arc; -use polars_error::{polars_err, PolarsResult}; +use polars_error::polars_err; use polars_expr::prelude::PhysicalExpr; -use polars_expr::state::ExecutionState; -use super::{ComputeNode, PortState}; -use crate::async_executor::{JoinHandle, TaskScope}; -use crate::async_primitives::pipe::{Receiver, Sender}; -use crate::morsel::Morsel; +use super::compute_node_prelude::*; pub struct FilterNode { predicate: Arc, @@ -20,7 +16,7 @@ impl FilterNode { } impl ComputeNode for FilterNode { - fn name(&self) -> &'static str { + fn name(&self) -> &str { "filter" } @@ -41,7 +37,7 @@ impl ComputeNode for FilterNode { let mut recv = recv[0].take().unwrap(); let mut send = send[0].take().unwrap(); - scope.spawn_task(true, async move { + scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = recv.recv().await { let morsel = morsel.try_map(|df| { let mask = self.predicate.evaluate(&df, state)?; diff --git a/crates/polars-stream/src/nodes/in_memory_map.rs b/crates/polars-stream/src/nodes/in_memory_map.rs index 11982d9b0852..d316492e75d6 100644 --- a/crates/polars-stream/src/nodes/in_memory_map.rs +++ b/crates/polars-stream/src/nodes/in_memory_map.rs @@ -1,91 +1,100 @@ use std::sync::Arc; -use polars_core::frame::DataFrame; use polars_core::schema::Schema; -use polars_error::PolarsResult; -use polars_expr::state::ExecutionState; use polars_plan::plans::DataFrameUdf; +use super::compute_node_prelude::*; 
use super::in_memory_sink::InMemorySinkNode; use super::in_memory_source::InMemorySourceNode; -use super::ComputeNode; -use crate::async_executor::JoinHandle; -use crate::async_primitives::pipe::{Receiver, Sender}; -use crate::graph::PortState; -use crate::morsel::Morsel; pub enum InMemoryMapNode { - Sink(InMemorySinkNode, Arc), + Sink { + sink_node: InMemorySinkNode, + num_pipelines: usize, + map: Arc, + }, Source(InMemorySourceNode), + Done, } impl InMemoryMapNode { pub fn new(input_schema: Arc, map: Arc) -> Self { - Self::Sink(InMemorySinkNode::new(input_schema), map) + Self::Sink { + sink_node: InMemorySinkNode::new(input_schema), + num_pipelines: 0, + map, + } } } impl ComputeNode for InMemoryMapNode { - fn name(&self) -> &'static str { + fn name(&self) -> &str { "in_memory_map" } + fn initialize(&mut self, num_pipelines_: usize) { + match self { + Self::Sink { num_pipelines, .. } => *num_pipelines = num_pipelines_, + _ => unreachable!(), + } + } + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { assert!(recv.len() == 1 && send.len() == 1); - // If the output doesn't want any more data, we are always done. - if send[0] == PortState::Done { - recv[0] = PortState::Done; - return; + // If the output doesn't want any more data, transition to being done. + if send[0] == PortState::Done && !matches!(self, Self::Done) { + *self = Self::Done; + } + + // If the input is done, transition to being a source. + if let Self::Sink { + sink_node, + num_pipelines, + map, + } = self + { + if recv[0] == PortState::Done { + let df = sink_node.get_output().unwrap(); + let mut source_node = + InMemorySourceNode::new(Arc::new(map.call_udf(df.unwrap()).unwrap())); + source_node.initialize(*num_pipelines); + *self = Self::Source(source_node); + } } match self { - Self::Sink(sink, _) => { - sink.update_state(recv, &mut []); + Self::Sink { sink_node, .. 
} => { + sink_node.update_state(recv, &mut []); send[0] = PortState::Blocked; }, - Self::Source(source) => { - source.update_state(&mut [], send); + Self::Source(source_node) => { + recv[0] = PortState::Done; + source_node.update_state(&mut [], send); + }, + Self::Done => { recv[0] = PortState::Done; + send[0] = PortState::Done; }, } } fn is_memory_intensive_pipeline_blocker(&self) -> bool { - matches!(self, Self::Sink(_, _)) - } - - fn initialize(&mut self, num_pipelines: usize) { - match self { - Self::Sink(sink, _) => sink.initialize(num_pipelines), - Self::Source(source) => source.initialize(num_pipelines), - } + matches!(self, Self::Sink { .. }) } fn spawn<'env, 's>( &'env self, - scope: &'s crate::async_executor::TaskScope<'s, 'env>, + scope: &'s TaskScope<'s, 'env>, pipeline: usize, recv: &mut [Option>], send: &mut [Option>], state: &'s ExecutionState, ) -> JoinHandle> { match self { - Self::Sink(sink, _) => sink.spawn(scope, pipeline, recv, &mut [], state), + Self::Sink { sink_node, .. 
} => sink_node.spawn(scope, pipeline, recv, &mut [], state), Self::Source(source) => source.spawn(scope, pipeline, &mut [], send, state), + Self::Done => unreachable!(), } } - - fn finalize(&mut self) -> PolarsResult> { - match self { - Self::Sink(sink, map) => { - let df = sink.finalize()?.unwrap(); - *self = Self::Source(InMemorySourceNode::new(Arc::new(map.call_udf(df)?))); - }, - Self::Source(source) => { - source.finalize()?; - }, - }; - Ok(None) - } } diff --git a/crates/polars-stream/src/nodes/in_memory_sink.rs b/crates/polars-stream/src/nodes/in_memory_sink.rs index ab3b3d5026e1..e1094f509ced 100644 --- a/crates/polars-stream/src/nodes/in_memory_sink.rs +++ b/crates/polars-stream/src/nodes/in_memory_sink.rs @@ -1,23 +1,15 @@ use std::sync::Arc; use parking_lot::Mutex; -use polars_core::frame::DataFrame; use polars_core::schema::Schema; -use polars_core::series::Series; use polars_core::utils::accumulate_dataframes_vertical_unchecked; -use polars_error::PolarsResult; -use polars_expr::state::ExecutionState; -use super::{ComputeNode, PortState}; -use crate::async_executor::{JoinHandle, TaskScope}; -use crate::async_primitives::pipe::{Receiver, Sender}; -use crate::morsel::Morsel; +use super::compute_node_prelude::*; use crate::utils::in_memory_linearize::linearize; pub struct InMemorySinkNode { morsels_per_pipe: Mutex>>, schema: Arc, - done: bool, } impl InMemorySinkNode { @@ -25,13 +17,12 @@ impl InMemorySinkNode { Self { morsels_per_pipe: Mutex::default(), schema, - done: false, } } } impl ComputeNode for InMemorySinkNode { - fn name(&self) -> &'static str { + fn name(&self) -> &str { "in_memory_sink" } @@ -39,19 +30,15 @@ impl ComputeNode for InMemorySinkNode { assert!(send.is_empty()); assert!(recv.len() == 1); - // If a sink is done, it's done, otherwise it will just reflect its - // input state. - if self.done { - recv[0] = PortState::Done; + // We are always ready to receive, unless the sender is done, then we're + // also done. 
+ if recv[0] != PortState::Done { + recv[0] = PortState::Ready; } } fn is_memory_intensive_pipeline_blocker(&self) -> bool { - !self.done - } - - fn initialize(&mut self, _num_pipelines: usize) { - self.morsels_per_pipe.get_mut().clear(); + true } fn spawn<'env, 's>( @@ -65,7 +52,7 @@ impl ComputeNode for InMemorySinkNode { assert!(recv.len() == 1 && send.is_empty()); let mut recv = recv[0].take().unwrap(); - scope.spawn_task(true, async move { + scope.spawn_task(TaskPriority::High, async move { let mut morsels = Vec::new(); while let Ok(mut morsel) = recv.recv().await { morsel.take_consume_token(); @@ -77,9 +64,7 @@ impl ComputeNode for InMemorySinkNode { }) } - fn finalize(&mut self) -> PolarsResult> { - self.done = true; - + fn get_output(&mut self) -> PolarsResult> { let morsels_per_pipe = core::mem::take(&mut *self.morsels_per_pipe.get_mut()); let dataframes = linearize(morsels_per_pipe); if dataframes.is_empty() { diff --git a/crates/polars-stream/src/nodes/in_memory_source.rs b/crates/polars-stream/src/nodes/in_memory_source.rs index 046d442ab49e..b2cf4d5780fa 100644 --- a/crates/polars-stream/src/nodes/in_memory_source.rs +++ b/crates/polars-stream/src/nodes/in_memory_source.rs @@ -1,15 +1,9 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -use polars_core::frame::DataFrame; -use polars_error::PolarsResult; -use polars_expr::state::ExecutionState; - -use super::{ComputeNode, PortState}; -use crate::async_executor::{JoinHandle, TaskScope}; -use crate::async_primitives::pipe::{Receiver, Sender}; +use super::compute_node_prelude::*; use crate::async_primitives::wait_group::WaitGroup; -use crate::morsel::{get_ideal_morsel_size, Morsel, MorselSeq}; +use crate::morsel::{get_ideal_morsel_size, MorselSeq}; pub struct InMemorySourceNode { source: Option>, @@ -28,21 +22,10 @@ impl InMemorySourceNode { } impl ComputeNode for InMemorySourceNode { - fn name(&self) -> &'static str { + fn name(&self) -> &str { "in_memory_source" } - fn 
update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { - assert!(recv.is_empty()); - assert!(send.len() == 1); - - if self.source.is_some() && send[0] != PortState::Done { - send[0] = PortState::Ready; - } else { - send[0] = PortState::Done; - } - } - fn initialize(&mut self, num_pipelines: usize) { let len = self.source.as_ref().unwrap().height(); let ideal_block_count = (len / get_ideal_morsel_size()).max(1); @@ -51,6 +34,26 @@ impl ComputeNode for InMemorySourceNode { self.seq = AtomicU64::new(0); } + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + assert!(recv.is_empty()); + assert!(send.len() == 1); + + let exhausted = self + .source + .as_ref() + .map(|s| { + self.seq.load(Ordering::Relaxed) * self.morsel_size as u64 >= s.height() as u64 + }) + .unwrap_or(true); + + if send[0] == PortState::Done || exhausted { + send[0] = PortState::Done; + self.source = None; + } else { + send[0] = PortState::Ready; + } + } + fn spawn<'env, 's>( &'env self, scope: &'s TaskScope<'s, 'env>, @@ -63,7 +66,7 @@ impl ComputeNode for InMemorySourceNode { let mut send = send[0].take().unwrap(); let source = self.source.as_ref().unwrap(); - scope.spawn_task(false, async move { + scope.spawn_task(TaskPriority::Low, async move { let wait_group = WaitGroup::default(); loop { let seq = self.seq.fetch_add(1, Ordering::Relaxed); @@ -84,9 +87,4 @@ impl ComputeNode for InMemorySourceNode { Ok(()) }) } - - fn finalize(&mut self) -> PolarsResult> { - drop(self.source.take()); - Ok(None) - } } diff --git a/crates/polars-stream/src/nodes/map.rs b/crates/polars-stream/src/nodes/map.rs index 87a6ece118c9..b7e98446b88d 100644 --- a/crates/polars-stream/src/nodes/map.rs +++ b/crates/polars-stream/src/nodes/map.rs @@ -1,17 +1,8 @@ use std::sync::Arc; -use polars_core::frame::DataFrame; -use polars_core::schema::Schema; -use polars_core::series::Series; -use polars_error::PolarsResult; -use polars_expr::prelude::PhysicalExpr; -use 
polars_expr::state::ExecutionState; use polars_plan::plans::DataFrameUdf; -use super::{ComputeNode, PortState}; -use crate::async_executor::{JoinHandle, TaskScope}; -use crate::async_primitives::pipe::{Receiver, Sender}; -use crate::morsel::Morsel; +use super::compute_node_prelude::*; /// A simple mapping node. Assumes the given udf is elementwise. pub struct MapNode { @@ -25,7 +16,7 @@ impl MapNode { } impl ComputeNode for MapNode { - fn name(&self) -> &'static str { + fn name(&self) -> &str { "map" } @@ -40,13 +31,13 @@ impl ComputeNode for MapNode { _pipeline: usize, recv: &mut [Option>], send: &mut [Option>], - state: &'s ExecutionState, + _state: &'s ExecutionState, ) -> JoinHandle> { assert!(recv.len() == 1 && send.len() == 1); let mut recv = recv[0].take().unwrap(); let mut send = send[0].take().unwrap(); - scope.spawn_task(true, async move { + scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = recv.recv().await { let morsel = morsel.try_map(|df| self.map.call_udf(df))?; if send.send(morsel).await.is_err() { diff --git a/crates/polars-stream/src/nodes/mod.rs b/crates/polars-stream/src/nodes/mod.rs index c8a260e771e3..70cabe6d5d11 100644 --- a/crates/polars-stream/src/nodes/mod.rs +++ b/crates/polars-stream/src/nodes/mod.rs @@ -1,26 +1,39 @@ -use polars_core::frame::DataFrame; -use polars_error::PolarsResult; -use polars_expr::state::ExecutionState; - -use crate::async_executor::{JoinHandle, TaskScope}; -use crate::async_primitives::pipe::{Receiver, Sender}; -use crate::graph::PortState; -use crate::morsel::Morsel; - pub mod filter; pub mod in_memory_map; pub mod in_memory_sink; pub mod in_memory_source; pub mod map; +pub mod ordered_union; pub mod select; pub mod simple_projection; +pub mod streaming_slice; + +/// The imports you'll always need for implementing a ComputeNode. 
+mod compute_node_prelude { + pub use polars_core::frame::DataFrame; + pub use polars_error::PolarsResult; + pub use polars_expr::state::ExecutionState; + + pub use super::ComputeNode; + pub use crate::async_executor::{JoinHandle, TaskPriority, TaskScope}; + pub use crate::async_primitives::pipe::{Receiver, Sender}; + pub use crate::graph::PortState; + pub use crate::morsel::{Morsel, MorselSeq}; +} + +use compute_node_prelude::*; pub trait ComputeNode: Send + Sync { - fn name(&self) -> &'static str; + /// The name of this node. + fn name(&self) -> &str; + + /// Called once before the first execution phase to indicate with how many + /// pipelines we will execute the graph. + fn initialize(&mut self, _num_pipelines: usize) {} /// Update the state of this node given the state of our input and output /// ports. May be called multiple times until fully resolved for each - /// execution cycle. + /// execution phase. /// /// For each input pipe `recv` will contain a respective state of the /// send port that pipe is connected to when called, and it is expected when @@ -37,11 +50,18 @@ pub trait ComputeNode: Send + Sync { false } - /// Initialize for processing using the given amount of pipelines. - fn initialize(&mut self, _num_pipelines: usize) {} + /// Opportunity to spawn task(s) without being beholden to a specific + /// pipeline. Called once per execution phase. + fn spawn_global<'env, 's>( + &'env self, + scope: &'s TaskScope<'s, 'env>, + state: &'s ExecutionState, + ) -> Option>> { + None + } /// Spawn a task that should receive input(s), process it and send to its - /// output(s). Called once for each pipeline. + /// output(s). Called once for each pipeline per execution phase. fn spawn<'env, 's>( &'env self, scope: &'s TaskScope<'s, 'env>, @@ -51,8 +71,9 @@ pub trait ComputeNode: Send + Sync { state: &'s ExecutionState, ) -> JoinHandle>; - /// Called after this computation is complete. 
- fn finalize(&mut self) -> PolarsResult> { + /// Called once after the last execution phase to extract output from + /// in-memory nodes. + fn get_output(&mut self) -> PolarsResult> { Ok(None) } } diff --git a/crates/polars-stream/src/nodes/ordered_union.rs b/crates/polars-stream/src/nodes/ordered_union.rs new file mode 100644 index 000000000000..d78649ae57df --- /dev/null +++ b/crates/polars-stream/src/nodes/ordered_union.rs @@ -0,0 +1,84 @@ +use parking_lot::Mutex; + +use super::compute_node_prelude::*; + +/// A node that first passes through all data from the first input, then the +/// second input, etc. +pub struct OrderedUnionNode { + cur_input_idx: usize, + max_morsel_seq_sent: Mutex, + morsel_offset: MorselSeq, +} + +impl OrderedUnionNode { + pub fn new() -> Self { + Self { + cur_input_idx: 0, + max_morsel_seq_sent: Mutex::new(MorselSeq::new(0)), + morsel_offset: MorselSeq::new(0), + } + } +} + +impl ComputeNode for OrderedUnionNode { + fn name(&self) -> &str { + "ordered_union" + } + + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + assert!(self.cur_input_idx <= recv.len() && send.len() == 1); + + // Skip inputs that are done. + while self.cur_input_idx < recv.len() && recv[self.cur_input_idx] == PortState::Done { + self.cur_input_idx += 1; + } + + // Act like a normal pass-through node for the current input, or mark + // ourselves as done if all inputs are handled. + if self.cur_input_idx < recv.len() { + core::mem::swap(&mut recv[self.cur_input_idx], &mut send[0]); + } else { + send[0] = PortState::Done; + } + + // Mark all inputs after the current one as blocked. + for r in recv.iter_mut().skip(self.cur_input_idx + 1) { + *r = PortState::Blocked; + } + + // Set the morsel offset one higher than any sent so far. 
+ self.morsel_offset = self.max_morsel_seq_sent.lock().successor(); + } + + fn spawn<'env, 's>( + &'env self, + scope: &'s TaskScope<'s, 'env>, + _pipeline: usize, + recv: &mut [Option>], + send: &mut [Option>], + _state: &'s ExecutionState, + ) -> JoinHandle> { + let ready_count = recv.iter().filter(|r| r.is_some()).count(); + assert!(ready_count == 1 && send.len() == 1); + let mut recv = recv[self.cur_input_idx].take().unwrap(); + let mut send = send[0].take().unwrap(); + + scope.spawn_task(TaskPriority::High, async move { + let mut max_seq = MorselSeq::new(0); + while let Ok(mut morsel) = recv.recv().await { + // Ensure the morsel sequence id stream is monotonic. + let seq = morsel.seq().offset_by(self.morsel_offset); + max_seq = max_seq.max(seq); + morsel.set_seq(seq); + if send.send(morsel).await.is_err() { + break; + } + } + + // Update our global maximum. + let mut max_morsel_seq_sent = self.max_morsel_seq_sent.lock(); + *max_morsel_seq_sent = max_morsel_seq_sent.max(max_seq.successor()); + Ok(()) + }) + } +} diff --git a/crates/polars-stream/src/nodes/select.rs b/crates/polars-stream/src/nodes/select.rs index b2750827deca..fb581b57a9ae 100644 --- a/crates/polars-stream/src/nodes/select.rs +++ b/crates/polars-stream/src/nodes/select.rs @@ -1,16 +1,10 @@ use std::sync::Arc; -use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_core::series::Series; -use polars_error::PolarsResult; use polars_expr::prelude::PhysicalExpr; -use polars_expr::state::ExecutionState; -use super::{ComputeNode, PortState}; -use crate::async_executor::{JoinHandle, TaskScope}; -use crate::async_primitives::pipe::{Receiver, Sender}; -use crate::morsel::Morsel; +use super::compute_node_prelude::*; pub struct SelectNode { selectors: Vec>, @@ -33,7 +27,7 @@ impl SelectNode { } impl ComputeNode for SelectNode { - fn name(&self) -> &'static str { + fn name(&self) -> &str { "select" } @@ -54,7 +48,7 @@ impl ComputeNode for SelectNode { let mut recv = 
recv[0].take().unwrap(); let mut send = send[0].take().unwrap(); - scope.spawn_task(true, async move { + scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = recv.recv().await { let morsel = morsel.try_map(|df| { // Select columns. diff --git a/crates/polars-stream/src/nodes/simple_projection.rs b/crates/polars-stream/src/nodes/simple_projection.rs index a3d4075bac96..8eecd6ea29b9 100644 --- a/crates/polars-stream/src/nodes/simple_projection.rs +++ b/crates/polars-stream/src/nodes/simple_projection.rs @@ -1,25 +1,21 @@ -use polars_core::schema::SchemaRef; -use polars_error::PolarsResult; -use polars_expr::state::ExecutionState; +use std::sync::Arc; -use super::ComputeNode; -use crate::async_executor::{JoinHandle, TaskScope}; -use crate::async_primitives::pipe::{Receiver, Sender}; -use crate::graph::PortState; -use crate::morsel::Morsel; +use polars_core::schema::Schema; + +use super::compute_node_prelude::*; pub struct SimpleProjectionNode { - schema: SchemaRef, + schema: Arc, } impl SimpleProjectionNode { - pub fn new(schema: SchemaRef) -> Self { + pub fn new(schema: Arc) -> Self { Self { schema } } } impl ComputeNode for SimpleProjectionNode { - fn name(&self) -> &'static str { + fn name(&self) -> &str { "simple_projection" } @@ -40,7 +36,7 @@ impl ComputeNode for SimpleProjectionNode { let mut recv = recv[0].take().unwrap(); let mut send = send[0].take().unwrap(); - scope.spawn_task(true, async move { + scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = recv.recv().await { let morsel = morsel.try_map(|df| { // TODO: can this be unchecked? 
diff --git a/crates/polars-stream/src/nodes/streaming_slice.rs b/crates/polars-stream/src/nodes/streaming_slice.rs new file mode 100644 index 000000000000..6c8ab578aee2 --- /dev/null +++ b/crates/polars-stream/src/nodes/streaming_slice.rs @@ -0,0 +1,149 @@ +use std::sync::Arc; + +use parking_lot::Mutex; + +use super::compute_node_prelude::*; +use crate::async_primitives::distributor_channel::{ + distributor_channel, Receiver as DistrReceiver, +}; +use crate::utils::linearizer::{Inserter, Linearizer}; +use crate::{DEFAULT_DISTRIBUTOR_BUFFER_SIZE, DEFAULT_LINEARIZER_BUFFER_SIZE}; + +#[derive(Copy, Clone, Default)] +struct GlobalState { + stream_offset: usize, + morsel_seq: MorselSeq, +} + +/// A node that will pass-through up to length rows, starting at start_offset. +/// Since start_offset must be non-negative this can be done in a streaming +/// manner. +pub struct StreamingSliceNode { + start_offset: usize, + length: usize, + + global_state: Mutex, + + num_pipelines: usize, + #[allow(clippy::type_complexity)] + per_pipeline_resources: Mutex)>>>, +} + +impl StreamingSliceNode { + pub fn new(start_offset: usize, length: usize) -> Self { + Self { + start_offset, + length, + global_state: Mutex::default(), + num_pipelines: 0, + per_pipeline_resources: Mutex::default(), + } + } +} + +impl ComputeNode for StreamingSliceNode { + fn name(&self) -> &str { + "streaming_slice" + } + + fn initialize(&mut self, num_pipelines: usize) { + self.num_pipelines = num_pipelines; + } + + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + let global_state = self.global_state.lock(); + if global_state.stream_offset >= self.start_offset + self.length || self.length == 0 { + recv[0] = PortState::Done; + send[0] = PortState::Done; + } else { + recv.swap_with_slice(send); + } + } + + fn spawn_global<'env, 's>( + &'env self, + scope: &'s TaskScope<'s, 'env>, + state: &'s ExecutionState, + ) -> Option>> { + let (mut linearizer, inserters) = + 
Linearizer::new(self.num_pipelines, DEFAULT_LINEARIZER_BUFFER_SIZE); + let (mut sender, receivers) = + distributor_channel(self.num_pipelines, DEFAULT_DISTRIBUTOR_BUFFER_SIZE); + { + let per_pipeline_resources = &mut *self.per_pipeline_resources.lock(); + per_pipeline_resources.clear(); + per_pipeline_resources.extend(inserters.into_iter().zip(receivers).map(Some)); + } + + Some(scope.spawn_task(TaskPriority::High, async move { + let mut global_state = *self.global_state.lock(); + let stop_offset = self.start_offset + self.length; + + while let Some(morsel) = linearizer.get().await { + let mut df = morsel.into_df(); + let height = df.height(); + + // Start/stop offsets within df. + let relative_start_offset = self + .start_offset + .saturating_sub(global_state.stream_offset) + .min(height); + let relative_stop_offset = stop_offset + .saturating_sub(global_state.stream_offset) + .min(height); + if relative_start_offset < relative_stop_offset { + let new_height = relative_stop_offset - relative_start_offset; + if new_height != height { + df = df.slice(relative_start_offset as i64, new_height); + } + sender.send(Morsel::new(df, global_state.morsel_seq)).await; + global_state.morsel_seq = global_state.morsel_seq.successor(); + } + + global_state.stream_offset += height; + if global_state.stream_offset >= stop_offset { + break; + } + } + + *self.global_state.lock() = global_state; + Ok(()) + })) + } + + fn spawn<'env, 's>( + &'env self, + scope: &'s TaskScope<'s, 'env>, + pipeline: usize, + recv: &mut [Option>], + send: &mut [Option>], + state: &'s ExecutionState, + ) -> JoinHandle> { + assert!(recv.len() == 1 && send.len() == 1); + let mut recv = recv[0].take().unwrap(); + let mut send = send[0].take().unwrap(); + let (mut inserter, mut distr_recv) = + self.per_pipeline_resources.lock()[pipeline].take().unwrap(); + + let insert_join = scope.spawn_task(TaskPriority::High, async move { + while let Ok(morsel) = recv.recv().await { + if 
inserter.insert(morsel).await.is_err() { + break; + } + } + + PolarsResult::Ok(()) + }); + + scope.spawn_task(TaskPriority::High, async move { + while let Ok(morsel) = distr_recv.recv().await { + if send.send(morsel).await.is_err() { + break; + } + } + + insert_join.await?; + Ok(()) + }) + } +} diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 25646b441784..9cce563ab779 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -63,6 +63,21 @@ pub fn lower_ir( })) }, + IR::Slice { input, offset, len } => { + if *offset >= 0 { + let offset = *offset as usize; + let length = *len as usize; + let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; + Ok(phys_sm.insert(PhysNode::StreamingSlice { + input, + offset, + length, + })) + } else { + todo!() + } + }, + IR::Filter { input, predicate } if is_streamable(predicate.node(), expr_arena) => { let predicate = predicate.clone(); let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; @@ -147,6 +162,19 @@ pub fn lower_ir( Ok(phys_sm.insert(phys_node)) }, + IR::Union { inputs, options } => { + if options.slice.is_some() { + todo!() + } + + let inputs = inputs + .clone() // Needed to borrow ir_arena mutably. 
+ .into_iter() + .map(|input| lower_ir(input, ir_arena, expr_arena, phys_sm)) + .collect::>()?; + Ok(phys_sm.insert(PhysNode::OrderedUnion { inputs })) + }, + _ => todo!(), } } diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index d22901f9afdd..cf3719e8e680 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -3,10 +3,8 @@ use std::sync::Arc; use polars_core::frame::DataFrame; use polars_core::prelude::SortMultipleOptions; use polars_core::schema::Schema; -use polars_error::PolarsResult; use polars_plan::plans::DataFrameUdf; use polars_plan::prelude::expr_ir::ExprIR; -use polars_utils::arena::Node; mod lower_ir; mod to_graph; @@ -36,6 +34,12 @@ pub enum PhysNode { output_schema: Arc, }, + StreamingSlice { + input: PhysNodeKey, + offset: usize, + length: usize, + }, + Filter { input: PhysNodeKey, predicate: ExprIR, @@ -69,4 +73,8 @@ pub enum PhysNode { slice: Option<(i64, usize)>, sort_options: SortMultipleOptions, }, + + OrderedUnion { + inputs: Vec, + }, } diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index 5d0c5e2be9e5..cc9a8227c5cb 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -13,7 +13,6 @@ use slotmap::{SecondaryMap, SlotMap}; use super::{PhysNode, PhysNodeKey}; use crate::graph::{Graph, GraphNodeKey}; use crate::nodes; -use crate::nodes::in_memory_map::InMemoryMapNode; use crate::utils::late_materialized_df::LateMaterializedDataFrame; struct GraphConversionContext<'a> { @@ -61,6 +60,18 @@ fn to_graph_rec<'a>( [], ), + StreamingSlice { + input, + offset, + length, + } => { + let input_key = to_graph_rec(*input, ctx)?; + ctx.graph.add_node( + nodes::streaming_slice::StreamingSliceNode::new(*offset, *length), + [input_key], + ) + }, + Filter { predicate, input } => { let phys_predicate_expr = 
create_physical_expr( predicate, @@ -174,6 +185,15 @@ fn to_graph_rec<'a>( [input_key], ) }, + + OrderedUnion { inputs } => { + let input_keys = inputs + .iter() + .map(|i| to_graph_rec(*i, ctx)) + .collect::, _>>()?; + ctx.graph + .add_node(nodes::ordered_union::OrderedUnionNode::new(), input_keys) + }, }; ctx.phys_to_graph.insert(phys_node_key, graph_key); diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index cc946f3063af..17b583fe67c8 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -44,7 +44,7 @@ impl AnonymousScan for LateMaterializedDataFrame { unimplemented!() } - fn scan(&self, scan_opts: AnonymousScanArgs) -> PolarsResult { + fn scan(&self, _scan_opts: AnonymousScanArgs) -> PolarsResult { Ok(self.df.lock().take().unwrap()) } } diff --git a/crates/polars-stream/src/utils/linearizer.rs b/crates/polars-stream/src/utils/linearizer.rs new file mode 100644 index 000000000000..e7ed4ffa496c --- /dev/null +++ b/crates/polars-stream/src/utils/linearizer.rs @@ -0,0 +1,89 @@ +use std::cmp::Reverse; +use std::collections::{BinaryHeap, VecDeque}; + +use polars_utils::priority::Priority; +use tokio::sync::mpsc::{channel, Receiver, Sender}; + +use crate::async_primitives::task_parker::TaskParker; +use crate::morsel::{Morsel, MorselSeq}; + +/// Stores the state for which inserter we need to poll. 
+enum PollState { + NoPoll, + Poll(usize), + PollAll, +} + +pub struct Linearizer { + num_inserters: usize, + receivers: Vec>, + poll_state: PollState, + + heap: BinaryHeap, (usize, Morsel)>>, +} + +impl Linearizer { + pub fn new(num_inserters: usize, buffer_size: usize) -> (Self, Vec) { + let mut receivers = Vec::with_capacity(num_inserters); + let mut inserters = Vec::with_capacity(num_inserters); + + for _ in 0..num_inserters { + // We could perhaps use a bespoke spsc bounded channel here in the + // future, instead of tokio's mpsc channel. + let (sender, receiver) = channel(buffer_size); + receivers.push(receiver); + inserters.push(Inserter { sender }); + } + let slf = Self { + num_inserters, + receivers, + poll_state: PollState::PollAll, + heap: BinaryHeap::default(), + }; + (slf, inserters) + } + + pub async fn get(&mut self) -> Option { + // The idea is that we have exactly one morsel per inserter in the + // binary heap, and when we take one out we must refill it. This way we + // always ensure we have the morsel with the lowest global sequence id. + let poll_range = match self.poll_state { + PollState::NoPoll => 0..0, + PollState::Poll(i) => i..i + 1, + PollState::PollAll => 0..self.receivers.len(), + }; + for recv_idx in poll_range { + // If no morsel was received from that particular inserter, that + // stream is done and thus we no longer need to consider it for the + // global order. 
+ if let Some(morsel) = self.receivers[recv_idx].recv().await { + self.heap + .push(Priority(Reverse(morsel.seq()), (recv_idx, morsel))); + } + } + + if let Some(first_in_merged_streams) = self.heap.pop() { + let (receiver_idx, morsel) = first_in_merged_streams.1; + self.poll_state = PollState::Poll(receiver_idx); + Some(morsel) + } else { + self.poll_state = PollState::NoPoll; + None + } + } +} + +pub struct Inserter { + sender: Sender, +} + +impl Inserter { + pub async fn insert(&mut self, mut morsel: Morsel) -> Result<(), Morsel> { + // Drop the consume token, but only after the send has succeeded. This + // ensures we have backpressure, but only once the channel fills up. + let consume_token = morsel.take_consume_token(); + self.sender.send(morsel).await.map_err(|e| e.0)?; + drop(consume_token); + Ok(()) + } +} diff --git a/crates/polars-stream/src/utils/mod.rs b/crates/polars-stream/src/utils/mod.rs index 03f4df442f70..018b893ea992 100644 --- a/crates/polars-stream/src/utils/mod.rs +++ b/crates/polars-stream/src/utils/mod.rs @@ -1,2 +1,3 @@ pub mod in_memory_linearize; pub mod late_materialized_df; +pub mod linearizer; diff --git a/crates/polars-time/src/chunkedarray/datetime.rs b/crates/polars-time/src/chunkedarray/datetime.rs index 7a2bf97d88e7..de14c83c6e72 100644 --- a/crates/polars-time/src/chunkedarray/datetime.rs +++ b/crates/polars-time/src/chunkedarray/datetime.rs @@ -12,7 +12,7 @@ fn cast_and_apply< ca: &DatetimeChunked, func: F, ) -> ChunkedArray { - let dtype = ca.dtype().to_arrow(true); + let dtype = ca.dtype().to_arrow(CompatLevel::newest()); let chunks = ca.downcast_iter().map(|arr| { let arr = cast( arr, diff --git a/crates/polars/tests/it/io/ipc.rs b/crates/polars/tests/it/io/ipc.rs index f69bf78602da..6b5e2a83ba41 100644 --- a/crates/polars/tests/it/io/ipc.rs +++ b/crates/polars/tests/it/io/ipc.rs @@ -12,7 +12,7 @@ fn test_ipc_compression_variadic_buffers() { let mut file = std::io::Cursor::new(vec![]); IpcWriter::new(&mut file) 
.with_compression(Some(IpcCompression::LZ4)) - .with_pl_flavor(true) + .with_compat_level(CompatLevel::newest()) .finish(&mut df) .unwrap(); @@ -82,7 +82,7 @@ fn test_read_ipc_with_columns() { .unwrap(); df_read.equals(&expected); - for pl_flavor in [false, true] { + for compat_level in [0, 1].map(|level| CompatLevel::with_level(level).unwrap()) { let mut buf: Cursor> = Cursor::new(Vec::new()); let mut df = df![ "letters" => ["x", "y", "z"], @@ -92,7 +92,7 @@ fn test_read_ipc_with_columns() { ] .unwrap(); IpcWriter::new(&mut buf) - .with_pl_flavor(pl_flavor) + .with_compat_level(compat_level) .finish(&mut df) .expect("ipc writer"); buf.set_position(0); diff --git a/crates/polars/tests/it/io/parquet/arrow/mod.rs b/crates/polars/tests/it/io/parquet/arrow/mod.rs index f70866a58a50..2a3423ddb9e2 100644 --- a/crates/polars/tests/it/io/parquet/arrow/mod.rs +++ b/crates/polars/tests/it/io/parquet/arrow/mod.rs @@ -1260,7 +1260,7 @@ fn integration_write( statistics: StatisticsOptions::full(), compression: CompressionOptions::Uncompressed, version: Version::V1, - data_pagesize_limit: None, + data_page_size: None, }; let encodings = schema diff --git a/crates/polars/tests/it/io/parquet/arrow/read_indexes.rs b/crates/polars/tests/it/io/parquet/arrow/read_indexes.rs index d758557a6c1d..8cffac4b35f6 100644 --- a/crates/polars/tests/it/io/parquet/arrow/read_indexes.rs +++ b/crates/polars/tests/it/io/parquet/arrow/read_indexes.rs @@ -32,7 +32,7 @@ fn pages( statistics: StatisticsOptions::full(), compression: CompressionOptions::Uncompressed, version: Version::V1, - data_pagesize_limit: None, + data_page_size: None, }; let pages1 = [array11, array12, array13] @@ -82,7 +82,7 @@ fn read_with_indexes( statistics: StatisticsOptions::full(), compression: CompressionOptions::Uncompressed, version: Version::V1, - data_pagesize_limit: None, + data_page_size: None, }; let to_compressed = |pages: Vec| { diff --git a/crates/polars/tests/it/io/parquet/arrow/write.rs 
b/crates/polars/tests/it/io/parquet/arrow/write.rs index 4dc71dd49ea2..9c25f346c2e1 100644 --- a/crates/polars/tests/it/io/parquet/arrow/write.rs +++ b/crates/polars/tests/it/io/parquet/arrow/write.rs @@ -48,7 +48,7 @@ fn round_trip_opt_stats( statistics: StatisticsOptions::full(), compression, version, - data_pagesize_limit: None, + data_page_size: None, }; let iter = vec![RecordBatchT::try_new(vec![array.clone()])]; diff --git a/crates/polars/tests/it/io/parquet/read/binary.rs b/crates/polars/tests/it/io/parquet/read/binary.rs index 63eaf49bd474..724e7d791c42 100644 --- a/crates/polars/tests/it/io/parquet/read/binary.rs +++ b/crates/polars/tests/it/io/parquet/read/binary.rs @@ -1,3 +1,4 @@ +use polars_parquet::parquet::encoding::hybrid_rle::FnTranslator; use polars_parquet::parquet::error::ParquetResult; use polars_parquet::parquet::page::DataPage; @@ -22,15 +23,14 @@ pub fn page_to_vec( .map(Some) .map(|x| x.transpose()) .collect(), - FixedLenBinaryPageState::RequiredDictionary(dict) => dict - .indexes - .iter() - .map(|x| dict.dict.value(x as usize).map(|x| x.to_vec()).map(Some)) - .collect(), + FixedLenBinaryPageState::RequiredDictionary(dict) => { + let dictionary = + FnTranslator(|v| dict.dict.value(v as usize).map(|v| Some(v.to_vec()))); + dict.indexes.translate_and_collect(&dictionary) + }, FixedLenBinaryPageState::OptionalDictionary(validity, dict) => { let values = dict .indexes - .iter() .map(|x| dict.dict.value(x as usize).map(|x| x.to_vec())); deserialize_optional(validity, values) }, diff --git a/crates/polars/tests/it/io/parquet/read/fixed_binary.rs b/crates/polars/tests/it/io/parquet/read/fixed_binary.rs index 6951e09367ad..7158864e21bf 100644 --- a/crates/polars/tests/it/io/parquet/read/fixed_binary.rs +++ b/crates/polars/tests/it/io/parquet/read/fixed_binary.rs @@ -21,13 +21,11 @@ pub fn page_to_vec( }, FixedLenBinaryPageState::RequiredDictionary(dict) => dict .indexes - .iter() .map(|x| dict.dict.value(x as usize).map(|x| x.to_vec()).map(Some)) 
.collect(), FixedLenBinaryPageState::OptionalDictionary(validity, dict) => { let values = dict .indexes - .iter() .map(|x| dict.dict.value(x as usize).map(|x| x.to_vec())); deserialize_optional(validity, values) }, diff --git a/crates/polars/tests/it/io/parquet/read/primitive.rs b/crates/polars/tests/it/io/parquet/read/primitive.rs index c11df388ee58..f9c47ace5679 100644 --- a/crates/polars/tests/it/io/parquet/read/primitive.rs +++ b/crates/polars/tests/it/io/parquet/read/primitive.rs @@ -1,8 +1,9 @@ use polars_parquet::parquet::deserialize::{ HybridRleDecoderIter, HybridRleIter, SliceFilteredIter, }; -use polars_parquet::parquet::encoding::hybrid_rle::Decoder; +use polars_parquet::parquet::encoding::hybrid_rle::{Decoder, FnTranslator}; use polars_parquet::parquet::encoding::Encoding; +use polars_parquet::parquet::error::ParquetResult; use polars_parquet::parquet::page::{split_buffer, DataPage, EncodedSplitBuffer}; use polars_parquet::parquet::schema::Repetition; use polars_parquet::parquet::types::NativeType; @@ -90,7 +91,7 @@ impl<'a, T: NativeType> PageState<'a, T> { pub fn page_to_vec( page: &DataPage, dict: Option<&PrimitivePageDict>, -) -> Result>, ParquetError> { +) -> ParquetResult>> { assert_eq!(page.descriptor.max_rep_level, 0); let state = PageState::::try_new(page, dict)?; @@ -100,16 +101,12 @@ pub fn page_to_vec( deserialize_optional(validity, values.by_ref().map(Ok)) }, NativePageState::Required(values) => Ok(values.map(Some).collect()), - NativePageState::RequiredDictionary(dict) => dict - .indexes - .iter() - .map(|x| dict.dict.value(x as usize).copied().map(Some)) - .collect(), + NativePageState::RequiredDictionary(dict) => { + let dictionary = FnTranslator(|x| dict.dict.value(x as usize).copied().map(Some)); + dict.indexes.translate_and_collect(&dictionary) + }, NativePageState::OptionalDictionary(validity, dict) => { - let values = dict - .indexes - .iter() - .map(|x| dict.dict.value(x as usize).copied()); + let values = dict.indexes.map(|x| 
dict.dict.value(x as usize).copied()); deserialize_optional(validity, values) }, }, diff --git a/crates/polars/tests/it/io/parquet/read/primitive_nested.rs b/crates/polars/tests/it/io/parquet/read/primitive_nested.rs index b336cea2f498..e3fead47187c 100644 --- a/crates/polars/tests/it/io/parquet/read/primitive_nested.rs +++ b/crates/polars/tests/it/io/parquet/read/primitive_nested.rs @@ -88,7 +88,7 @@ fn read_array_impl>( let num_bits = get_bit_width(rep_level_encoding.1); let rep_levels = HybridRleDecoder::new(rep_levels, num_bits, length); compose_array( - rep_levels.iter(), + rep_levels, std::iter::repeat(0).take(length), max_rep_level, max_def_level, @@ -100,7 +100,7 @@ fn read_array_impl>( let def_levels = HybridRleDecoder::new(def_levels, num_bits, length); compose_array( std::iter::repeat(0).take(length), - def_levels.iter(), + def_levels, max_rep_level, max_def_level, values, @@ -108,11 +108,9 @@ fn read_array_impl>( }, ((Encoding::Rle, false), (Encoding::Rle, false)) => { let rep_levels = - HybridRleDecoder::new(rep_levels, get_bit_width(rep_level_encoding.1), length) - .iter(); + HybridRleDecoder::new(rep_levels, get_bit_width(rep_level_encoding.1), length); let def_levels = - HybridRleDecoder::new(def_levels, get_bit_width(def_level_encoding.1), length) - .iter(); + HybridRleDecoder::new(def_levels, get_bit_width(def_level_encoding.1), length); compose_array(rep_levels, def_levels, max_rep_level, max_def_level, values) }, _ => todo!(), diff --git a/crates/polars/tests/it/io/parquet/read/struct_.rs b/crates/polars/tests/it/io/parquet/read/struct_.rs index 3d25dbeefe3d..1acf0cf834ac 100644 --- a/crates/polars/tests/it/io/parquet/read/struct_.rs +++ b/crates/polars/tests/it/io/parquet/read/struct_.rs @@ -21,7 +21,7 @@ pub fn extend_validity(val: &mut Vec, page: &DataPage) -> ParquetResult<() ); let mut def_levels = - HybridRleDecoder::new(def_levels, get_bit_width(def_level_encoding.1), length).iter(); + HybridRleDecoder::new(def_levels, 
get_bit_width(def_level_encoding.1), length); val.reserve(length); def_levels.try_for_each(|x| { diff --git a/crates/polars/tests/it/io/parquet/read/utils.rs b/crates/polars/tests/it/io/parquet/read/utils.rs index 240303a4024c..64564f473b2f 100644 --- a/crates/polars/tests/it/io/parquet/read/utils.rs +++ b/crates/polars/tests/it/io/parquet/read/utils.rs @@ -1,7 +1,5 @@ use polars_parquet::parquet::deserialize::{HybridDecoderBitmapIter, HybridEncoded, HybridRleIter}; -use polars_parquet::parquet::encoding::hybrid_rle::{ - self, BitmapIter, BufferedHybridRleDecoderIter, HybridRleDecoder, -}; +use polars_parquet::parquet::encoding::hybrid_rle::{self, BitmapIter, HybridRleDecoder}; use polars_parquet::parquet::error::{ParquetError, ParquetResult}; use polars_parquet::parquet::page::{split_buffer, DataPage, EncodedSplitBuffer}; use polars_parquet::parquet::read::levels::get_bit_width; @@ -41,7 +39,7 @@ pub enum DefLevelsDecoder<'a> { /// that decodes the runs, but not the individual values Bitmap(HybridDecoderBitmapIter<'a>), /// When the maximum definition level is larger than 1 - Levels(BufferedHybridRleDecoderIter<'a>, u32), + Levels(HybridRleDecoder<'a>, u32), } impl<'a> DefLevelsDecoder<'a> { @@ -59,8 +57,7 @@ impl<'a> DefLevelsDecoder<'a> { Self::Bitmap(iter) } else { let iter = - HybridRleDecoder::new(def_levels, get_bit_width(max_def_level), page.num_values()) - .iter(); + HybridRleDecoder::new(def_levels, get_bit_width(max_def_level), page.num_values()); Self::Levels(iter, max_def_level as u32) }) } @@ -141,7 +138,7 @@ fn deserialize_bitmap>>( } fn deserialize_levels>>( - levels: BufferedHybridRleDecoderIter, + levels: HybridRleDecoder, max: u32, mut values: I, ) -> Result>, ParquetError> { diff --git a/crates/polars/tests/it/io/parquet/roundtrip.rs b/crates/polars/tests/it/io/parquet/roundtrip.rs index dd55eac0e9f5..355f2b732532 100644 --- a/crates/polars/tests/it/io/parquet/roundtrip.rs +++ b/crates/polars/tests/it/io/parquet/roundtrip.rs @@ -23,7 +23,7 @@ fn 
round_trip( statistics: StatisticsOptions::full(), compression, version, - data_pagesize_limit: None, + data_page_size: None, }; let iter = vec![RecordBatchT::try_new(vec![array.clone()])]; diff --git a/docs/development/contributing/index.md b/docs/development/contributing/index.md index 52ef891c3855..6758a6d79301 100644 --- a/docs/development/contributing/index.md +++ b/docs/development/contributing/index.md @@ -176,10 +176,20 @@ Two other things to keep in mind: When you have resolved your issue, [open a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork) in the Polars repository. Please adhere to the following guidelines: -- Start your pull request title with a [conventional commit](https://www.conventionalcommits.org/) tag. This helps us add your contribution to the right section of the changelog. We use the [Angular convention](https://github.com/angular/angular/blob/22b96b9/CONTRIBUTING.md#type). Scope can be `rust` and/or `python`, depending on your contribution. -- Use a descriptive title starting with an uppercase letter. This text will end up in the [changelog](https://github.com/pola-rs/polars/releases). -- In the pull request description, [link](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) to the issue you were working on. -- Add any relevant information to the description that you think may help the maintainers review your code. +- Title + - Start your pull request title with a [conventional commit](https://www.conventionalcommits.org/) tag. + This helps us add your contribution to the right section of the changelog. + We use the [Angular convention](https://github.com/angular/angular/blob/22b96b9/CONTRIBUTING.md#type). + Scope can be `rust` and/or `python`, depending on your contribution: this tag determines which changelog(s) will include your change. 
+ Omit the scope if your change affects both Rust and Python. + - Use a descriptive title starting with an uppercase letter. + This text will end up in the [changelog](https://github.com/pola-rs/polars/releases), so make sure the text is meaningful to the user. + Use single backticks to annotate code snippets. + Use active language and do not end your title with punctuation. + - Example: ``fix(python): Fix `DataFrame.top_k` not handling nulls correctly`` +- Description + - In the pull request description, [link](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) to the issue you were working on. + - Add any relevant information to the description that you think may help the maintainers review your code. - Make sure your branch is [rebased](https://docs.github.com/en/get-started/using-git/about-git-rebase) against the latest version of the `main` branch. - Make sure all [GitHub Actions checks](./ci.md) pass. diff --git a/docs/src/python/user-guide/io/hive.py b/docs/src/python/user-guide/io/hive.py new file mode 100644 index 000000000000..215b4eea6f87 --- /dev/null +++ b/docs/src/python/user-guide/io/hive.py @@ -0,0 +1,131 @@ +# --8<-- [start:init_paths] +import polars as pl +from pathlib import Path + +dfs = [ + pl.DataFrame({"x": [1, 2]}), + pl.DataFrame({"x": [3, 4, 5]}), + pl.DataFrame({"x": [6, 7]}), + pl.DataFrame({"x": [8, 9, 10, 11]}), +] + +parts = [ + "year=2023/month=11", + "year=2023/month=12", + "year=2024/month=01", + "year=2024/month=02", +] + +for df, part in zip(dfs, parts): + path = Path("docs/data/hive/") / part / "data.parquet" + Path(path).parent.mkdir(exist_ok=True, parents=True) + df.write_parquet(path) + + path = Path("docs/data/hive_mixed/") / part / "data.parquet" + Path(path).parent.mkdir(exist_ok=True, parents=True) + df.write_parquet(path) + +Path("docs/data/hive_mixed/description.txt").touch() + + +def print_paths(path: str) -> None: + def dir_recurse(path: Path): + if path.is_dir(): + 
for p in path.iterdir(): + yield from dir_recurse(p) + else: + yield path + + df = ( + pl.Series( + "File path", + (str(x) for x in dir_recurse(Path(path))), + dtype=pl.String, + ) + .sort() + .to_frame() + ) + + with pl.Config( + tbl_hide_column_data_types=True, + tbl_hide_dataframe_shape=True, + fmt_str_lengths=999, + ): + print(df) + + +print_paths("docs/data/hive/") +# --8<-- [end:init_paths] + +# --8<-- [start:show_mixed_paths] +print_paths("docs/data/hive_mixed/") +# --8<-- [end:show_mixed_paths] + +# --8<-- [start:scan_dir] +import polars as pl + +df = pl.scan_parquet("docs/data/hive/").collect() + +with pl.Config(tbl_rows=99): + print(df) +# --8<-- [end:scan_dir] + +# --8<-- [start:scan_dir_err] +from pathlib import Path + +try: + pl.scan_parquet("docs/data/hive_mixed/").collect() +except Exception as e: + print(e) + +# --8<-- [end:scan_dir_err] + +# --8<-- [start:scan_glob] +df = pl.scan_parquet( + # Glob to match all files ending in `.parquet` + "docs/data/hive_mixed/**/*.parquet", + hive_partitioning=True, +).collect() + +with pl.Config(tbl_rows=99): + print(df) + +# --8<-- [end:scan_glob] + +# --8<-- [start:scan_file_no_hive] +df = pl.scan_parquet( + [ + "docs/data/hive/year=2024/month=01/data.parquet", + "docs/data/hive/year=2024/month=02/data.parquet", + ], +).collect() + +print(df) + +# --8<-- [end:scan_file_no_hive] + +# --8<-- [start:scan_file_hive] +df = pl.scan_parquet( + [ + "docs/data/hive/year=2024/month=01/data.parquet", + "docs/data/hive/year=2024/month=02/data.parquet", + ], + hive_partitioning=True, +).collect() + +print(df) + +# --8<-- [end:scan_file_hive] + +# --8<-- [start:write_parquet_partitioned_show_data] +df = pl.DataFrame({"a": [1, 1, 2, 2, 3], "b": [1, 1, 1, 2, 2], "c": 1}) +print(df) +# --8<-- [end:write_parquet_partitioned_show_data] + +# --8<-- [start:write_parquet_partitioned] +df.write_parquet_partitioned("docs/data/hive_write/", ["a", "b"]) +# --8<-- [end:write_parquet_partitioned] + +# --8<-- 
[start:write_parquet_partitioned_show_paths] +print_paths("docs/data/hive_write/") +# --8<-- [end:write_parquet_partitioned_show_paths] diff --git a/docs/user-guide/getting-started.md b/docs/user-guide/getting-started.md index 2a601597bb3d..8be3c2f54566 100644 --- a/docs/user-guide/getting-started.md +++ b/docs/user-guide/getting-started.md @@ -42,7 +42,7 @@ For more examples on the CSV file format and other data formats, start with the ## Expressions -`Expressions` are the core strength of Polars. The `expressions` offer a modular structure that allows you to combine simple concepts into complex queries. Below we cover the basic components that serve as building block (or in Polars terminology contexts) for all your queries: +`Expressions` are the core strength of Polars. The `expressions` offer a modular structure that allows you to combine simple concepts into complex queries. Below we cover the basic components that serve as building blocks (or in Polars terminology contexts) for all your queries: - `select` - `filter` diff --git a/docs/user-guide/io/hive.md b/docs/user-guide/io/hive.md new file mode 100644 index 000000000000..27af6b6c18ee --- /dev/null +++ b/docs/user-guide/io/hive.md @@ -0,0 +1,101 @@ +## Scanning hive partitioned data + +Polars supports scanning hive partitioned parquet and IPC datasets, with planned support for other +formats in the future. + +Hive partition parsing is enabled by default if `scan_parquet` receives a single directory path, +otherwise it is disabled by default. This can be explicitly configured using the `hive_partitioning` +parameter. 
+ +### Scanning a hive directory + +For this example the following directory structure is used: + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:init_paths" +``` + +Simply pass the directory to `scan_parquet`, and all files will be loaded with the hive parts in the +path included in the output: + +{{code_block('user-guide/io/hive','scan_dir',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_dir" +``` + +### Handling mixed files + +Passing a directory to `scan_parquet` may not work if there are extra non-data files next to the +data files. + +For this example the following directory structure is used: + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:show_mixed_paths" +``` + +{{code_block('user-guide/io/hive','scan_dir_err',['scan_parquet'])}} + +The above fails as `description.txt` is not a valid parquet file: + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_dir_err" +``` + +In this situation, a glob pattern can be used to be more specific about which files to load. 
Note +that `hive_partitioning` must be explicitly set to `True`: + +{{code_block('user-guide/io/hive','scan_glob',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_glob" +``` + +### Scanning file paths with hive parts + +`hive_partitioning` is not enabled by default for file paths: + +{{code_block('user-guide/io/hive','scan_file_no_hive',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_file_no_hive" +``` + +Pass `hive_partitioning=True` to enable hive partition parsing: + +{{code_block('user-guide/io/hive','scan_file_hive',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_file_hive" +``` + +## Writing hive partitioned data + +> Note: The following functionality is considered _unstable_, and is subject to change. + +Polars supports writing hive partitioned parquet datasets, with planned support for other formats. 
+ +### Example + +For this example the following DataFrame is used: + +{{code_block('user-guide/io/hive','write_parquet_partitioned_show_data',['write_parquet_partitioned'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:write_parquet_partitioned_show_data" +``` + +We will write it to a hive-partitioned parquet dataset, partitioned by the columns `a` and `b`: + +{{code_block('user-guide/io/hive','write_parquet_partitioned',['write_parquet_partitioned'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:write_parquet_partitioned" +``` + +The output is a hive partitioned parquet dataset with the following paths: + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:write_parquet_partitioned_show_paths" +``` diff --git a/docs/user-guide/io/index.md b/docs/user-guide/io/index.md index 5a3548871e8a..4495cefc9257 100644 --- a/docs/user-guide/io/index.md +++ b/docs/user-guide/io/index.md @@ -7,6 +7,7 @@ Reading and writing your data is crucial for a DataFrame library. 
In this chapte - [Parquet](parquet.md) - [Json](json.md) - [Multiple](multiple.md) +- [Hive](hive.md) - [Database](database.md) - [Cloud storage](cloud-storage.md) - [Google Big Query](bigquery.md) diff --git a/mkdocs.yml b/mkdocs.yml index d2e3c1e637fa..87e1b03a0212 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,7 @@ nav: - user-guide/io/parquet.md - user-guide/io/json.md - user-guide/io/multiple.md + - user-guide/io/hive.md - user-guide/io/database.md - user-guide/io/cloud-storage.md - user-guide/io/bigquery.md diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 3ea2ed1f2b16..365f233bf084 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.0.0" +version = "1.1.0" edition = "2021" [lib] diff --git a/py-polars/docs/source/reference/io.rst b/py-polars/docs/source/reference/io.rst index 1f088958a3c0..0085f3d943fb 100644 --- a/py-polars/docs/source/reference/io.rst +++ b/py-polars/docs/source/reference/io.rst @@ -107,6 +107,7 @@ Parquet read_parquet_schema scan_parquet DataFrame.write_parquet + DataFrame.write_parquet_partitioned LazyFrame.sink_parquet PyArrow Datasets diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py index bc2a3714c06c..1c16e21eb637 100644 --- a/py-polars/polars/_typing.py +++ b/py-polars/polars/_typing.py @@ -140,7 +140,7 @@ ClosedInterval: TypeAlias = Literal["left", "right", "both", "none"] # ClosedWindow InterpolationMethod: TypeAlias = Literal["linear", "nearest"] JoinStrategy: TypeAlias = Literal[ - "inner", "left", "full", "semi", "anti", "cross", "outer" + "inner", "left", "right", "full", "semi", "anti", "cross", "outer" ] # JoinType RollingInterpolationMethod: TypeAlias = Literal[ "nearest", "higher", "lower", "midpoint", "linear" diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py index 54e340c36248..d41fd0da3529 100644 --- 
a/py-polars/polars/_utils/construction/dataframe.py +++ b/py-polars/polars/_utils/construction/dataframe.py @@ -1040,6 +1040,14 @@ def pandas_to_pydf( include_index: bool = False, ) -> PyDataFrame: """Construct a PyDataFrame from a pandas DataFrame.""" + stringified_cols = {str(col) for col in data.columns} + if len(stringified_cols) < len(data.columns): + msg = ( + "Polars dataframes must have unique string column names. " + "Please check your pandas dataframe for duplicates." + ) + raise ValueError(msg) + convert_index = include_index and not _pandas_has_default_index(data) if not convert_index and all( is_simple_numpy_backed_pandas_series(data[col]) for col in data.columns diff --git a/py-polars/polars/_utils/construction/utils.py b/py-polars/polars/_utils/construction/utils.py index ab64598f12f5..de214a2dfb15 100644 --- a/py-polars/polars/_utils/construction/utils.py +++ b/py-polars/polars/_utils/construction/utils.py @@ -106,13 +106,11 @@ def is_simple_numpy_backed_pandas_series( if len(series.shape) > 1: # Pandas Series is actually a Pandas DataFrame when the original DataFrame # contains duplicated columns and a duplicated column is requested with df["a"]. 
- msg = "duplicate column names found: " - raise ValueError( - msg, - f"{series.columns.tolist()!s}", # type: ignore[union-attr] - ) + msg = f"duplicate column names found: {series.columns.tolist()!s}" # type: ignore[union-attr] + raise ValueError(msg) return (str(series.dtype) in PANDAS_SIMPLE_NUMPY_DTYPES) or ( series.dtype == "object" + and not series.hasnans and not series.empty and isinstance(next(iter(series)), str) ) diff --git a/py-polars/polars/_utils/getitem.py b/py-polars/polars/_utils/getitem.py index 7dd0e24cef5d..991dba52bb2d 100644 --- a/py-polars/polars/_utils/getitem.py +++ b/py-polars/polars/_utils/getitem.py @@ -61,9 +61,17 @@ def get_series_item_by_key( elif isinstance(key, Sequence): if not key: return s.clear() - if isinstance(key[0], bool): + + first = key[0] + if isinstance(first, bool): _raise_on_boolean_mask() - indices = pl.Series("", key, dtype=Int64) + + try: + indices = pl.Series("", key, dtype=Int64) + except TypeError: + msg = f"cannot select elements using Sequence with elements of type {type(first).__name__!r}" + raise TypeError(msg) from None + indices = _convert_series_to_indices(indices, s.len()) return _select_elements_by_index(s, indices) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 04bc70ecc8f8..b2972a164424 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -7,7 +7,7 @@ import random from collections import defaultdict from collections.abc import Sized -from io import BytesIO, StringIO, TextIOWrapper +from io import BytesIO, StringIO from operator import itemgetter from pathlib import Path from typing import ( @@ -24,7 +24,6 @@ NoReturn, Sequence, TypeVar, - cast, get_args, overload, ) @@ -104,6 +103,7 @@ TooManyRowsReturnedError, ) from polars.functions import col, lit +from polars.interchange.protocol import CompatLevel from polars.schema import Schema from polars.selectors import _expand_selector_dicts, _expand_selectors @@ -1385,7 
+1385,8 @@ def item(self, row: int | None = None, column: int | str | None = None) -> Any: ) return s.get_index_signed(row) - def to_arrow(self, *, future: bool = False) -> pa.Table: + @deprecate_renamed_parameter("future", "compat_level", version="1.1") + def to_arrow(self, *, compat_level: CompatLevel | None = None) -> pa.Table: """ Collect the underlying arrow arrays in an Arrow Table. @@ -1396,13 +1397,9 @@ def to_arrow(self, *, future: bool = False) -> pa.Table: Parameters ---------- - future - Setting this to `True` will write Polars' internal data structures that - might not be available by other Arrow implementations. - - .. warning:: - This functionality is considered **unstable**. It may be changed - at any point without it being considered a breaking change. + compat_level + Use a specific compatibility level + when exporting Polars' internal data structures. Examples -------- @@ -1420,12 +1417,12 @@ def to_arrow(self, *, future: bool = False) -> pa.Table: if not self.width: # 0x0 dataframe, cannot infer schema from batches return pa.table({}) - if future: - issue_unstable_warning( - "The `future` parameter of `DataFrame.to_arrow` is considered unstable." - ) + if compat_level is None: + compat_level = False # type: ignore[assignment] + elif isinstance(compat_level, CompatLevel): + compat_level = compat_level._version # type: ignore[attr-defined] - record_batches = self._df.to_arrow(future) + record_batches = self._df.to_arrow(compat_level) return pa.Table.from_batches(record_batches) @overload @@ -2695,8 +2692,6 @@ def write_csv( should_return_buffer = True elif isinstance(file, (str, os.PathLike)): file = normalize_filepath(file) - elif isinstance(file, TextIOWrapper): - file = cast(TextIOWrapper, file.buffer) self._df.write_csv( file, @@ -3300,7 +3295,7 @@ def write_ipc( file: None, *, compression: IpcCompression = "uncompressed", - future: bool | None = None, + compat_level: CompatLevel | None = None, ) -> BytesIO: ... 
@overload @@ -3309,15 +3304,16 @@ def write_ipc( file: str | Path | IO[bytes], *, compression: IpcCompression = "uncompressed", - future: bool | None = None, + compat_level: CompatLevel | None = None, ) -> None: ... + @deprecate_renamed_parameter("future", "compat_level", version="1.1") def write_ipc( self, file: str | Path | IO[bytes] | None, *, compression: IpcCompression = "uncompressed", - future: bool | None = None, + compat_level: CompatLevel | None = None, ) -> BytesIO | None: """ Write to Arrow IPC binary stream or Feather file. @@ -3331,13 +3327,9 @@ def write_ipc( written. If set to `None`, the output is returned as a BytesIO object. compression : {'uncompressed', 'lz4', 'zstd'} Compression method. Defaults to "uncompressed". - future - Setting this to `True` will write Polars' internal data structures that - might not be available by other Arrow implementations. - - .. warning:: - This functionality is considered **unstable**. It may be changed - at any point without it being considered a breaking change. + compat_level + Use a specific compatibility level + when exporting Polars' internal data structures. Examples -------- @@ -3359,17 +3351,15 @@ def write_ipc( elif isinstance(file, (str, Path)): file = normalize_filepath(file) + if compat_level is None: + compat_level = True # type: ignore[assignment] + elif isinstance(compat_level, CompatLevel): + compat_level = compat_level._version # type: ignore[attr-defined] + if compression is None: compression = "uncompressed" - if future: - issue_unstable_warning( - "The `future` parameter of `DataFrame.write_ipc` is considered unstable." 
- ) - if future is None: - future = True - - self._df.write_ipc(file, compression, future) + self._df.write_ipc(file, compression, compat_level) return file if return_bytes else None # type: ignore[return-value] @overload @@ -3378,7 +3368,7 @@ def write_ipc_stream( file: None, *, compression: IpcCompression = "uncompressed", - future: bool | None = None, + compat_level: CompatLevel | None = None, ) -> BytesIO: ... @overload @@ -3387,15 +3377,16 @@ def write_ipc_stream( file: str | Path | IO[bytes], *, compression: IpcCompression = "uncompressed", - future: bool | None = None, + compat_level: CompatLevel | None = None, ) -> None: ... + @deprecate_renamed_parameter("future", "compat_level", version="1.1") def write_ipc_stream( self, file: str | Path | IO[bytes] | None, *, compression: IpcCompression = "uncompressed", - future: bool | None = None, + compat_level: CompatLevel | None = None, ) -> BytesIO | None: """ Write to Arrow IPC record batch stream. @@ -3409,13 +3400,9 @@ def write_ipc_stream( be written. If set to `None`, the output is returned as a BytesIO object. compression : {'uncompressed', 'lz4', 'zstd'} Compression method. Defaults to "uncompressed". - future - Setting this to `True` will write Polars' internal data structures that - might not be available by other Arrow implementations. - - .. warning:: - This functionality is considered **unstable**. It may be changed - at any point without it being considered a breaking change. + compat_level + Use a specific compatibility level + when exporting Polars' internal data structures. 
Examples -------- @@ -3437,17 +3424,15 @@ def write_ipc_stream( elif isinstance(file, (str, Path)): file = normalize_filepath(file) + if compat_level is None: + compat_level = True # type: ignore[assignment] + elif isinstance(compat_level, CompatLevel): + compat_level = compat_level._version # type: ignore[attr-defined] + if compression is None: compression = "uncompressed" - if future: - issue_unstable_warning( - "The `future` parameter of `DataFrame.write_ipc` is considered unstable." - ) - if future is None: - future = True - - self._df.write_ipc_stream(file, compression, future=future) + self._df.write_ipc_stream(file, compression, compat_level) return file if return_bytes else None # type: ignore[return-value] def write_parquet( @@ -3618,6 +3603,98 @@ def write_parquet( data_page_size, ) + @unstable() + def write_parquet_partitioned( + self, + path: str | Path, + partition_by: str | Collection[str], + *, + chunk_size_bytes: int = 4_294_967_296, + compression: ParquetCompression = "zstd", + compression_level: int | None = None, + statistics: bool | str | dict[str, bool] = True, + row_group_size: int | None = None, + data_page_size: int | None = None, + ) -> None: + """ + Write a partitioned directory of parquet files. + + Parameters + ---------- + path + Path to the base directory for the partitioned dataset. + partition_by + Columns to partition by. + chunk_size_bytes + Approximate size to split DataFrames within a single partition when + writing. Note this is calculated using the size of the DataFrame in + memory - the size of the output file may differ depending on the + file format / compression. + compression : {'lz4', 'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'zstd'} + Choose "zstd" for good compression performance. + Choose "lz4" for fast compression/decompression. + Choose "snappy" for more backwards compatibility guarantees + when you deal with older parquet readers. + compression_level + The level of compression to use. 
Higher compression means smaller files on + disk. + + - "gzip" : min-level: 0, max-level: 10. + - "brotli" : min-level: 0, max-level: 11. + - "zstd" : min-level: 1, max-level: 22. + + statistics + Write statistics to the parquet headers. This is the default behavior. + + Possible values: + + - `True`: enable default set of statistics (default) + - `False`: disable all statistics + - "full": calculate and write all available statistics. Cannot be + combined with `use_pyarrow`. + - `{ "statistic-key": True / False, ... }`. Cannot be combined with + `use_pyarrow`. Available keys: + + - "min": column minimum value (default: `True`) + - "max": column maximum value (default: `True`) + - "distinct_count": number of unique column values (default: `False`) + - "null_count": number of null values in column (default: `True`) + row_group_size + Size of the row groups in number of rows. Defaults to 512^2 rows. + data_page_size + Size of the data page in bytes. Defaults to 1024^2 bytes. + """ + path = normalize_filepath(path, check_not_directory=False) + partition_by = [partition_by] if isinstance(partition_by, str) else partition_by + + if isinstance(statistics, bool) and statistics: + statistics = { + "min": True, + "max": True, + "distinct_count": False, + "null_count": True, + } + elif isinstance(statistics, bool) and not statistics: + statistics = {} + elif statistics == "full": + statistics = { + "min": True, + "max": True, + "distinct_count": True, + "null_count": True, + } + + self._df.write_parquet_partitioned( + path, + partition_by, + chunk_size_bytes, + compression, + compression_level, + statistics, + row_group_size, + data_page_size, + ) + def write_database( self, table_name: str, @@ -3801,13 +3878,20 @@ def unpack_table_name(name: str) -> tuple[str | None, str | None, str]: min_err_prefix="pandas >= 2.2 requires", ) # note: the catalog (database) should be a part of the connection string - from sqlalchemy.engine import create_engine + from sqlalchemy.engine 
import Connectable, create_engine + from sqlalchemy.orm import Session + + sa_object: Connectable + if isinstance(connection, str): + sa_object = create_engine(connection) + elif isinstance(connection, Session): + sa_object = connection.connection() + elif isinstance(connection, Connectable): + sa_object = connection + else: + error_msg = f"unexpected connection type {type(connection)}" + raise TypeError(error_msg) - engine_sa = ( - create_engine(connection) - if isinstance(connection, str) - else connection.engine # type: ignore[union-attr] - ) catalog, db_schema, unpacked_table_name = unpack_table_name(table_name) if catalog: msg = f"Unexpected three-part table name; provide the database/catalog ({catalog!r}) on the connection URI" @@ -3820,7 +3904,7 @@ def unpack_table_name(name: str) -> tuple[str | None, str | None, str]: ).to_sql( name=unpacked_table_name, schema=db_schema, - con=engine_sa, + con=sa_object, if_exists=if_table_exists, index=False, **(engine_options or {}), @@ -6643,7 +6727,7 @@ def join( DataFrame to join with. on Name(s) of the join columns in both DataFrames. - how : {'inner', 'left', 'full', 'semi', 'anti', 'cross'} + how : {'inner', 'left', 'right', 'full', 'semi', 'anti', 'cross'} Join strategy. 
* *inner* @@ -6651,6 +6735,9 @@ def join( * *left* Returns all rows from the left table, and the matched rows from the right table + * *right* + Returns all rows from the right table, and the matched rows from the + left table * *full* Returns all rows when there is a match in either left or right table * *cross* diff --git a/py-polars/polars/interchange/protocol.py b/py-polars/polars/interchange/protocol.py index 2daca4b3cb19..4eda7fa95f2d 100644 --- a/py-polars/polars/interchange/protocol.py +++ b/py-polars/polars/interchange/protocol.py @@ -13,6 +13,8 @@ TypedDict, ) +from polars._utils.unstable import issue_unstable_warning + if TYPE_CHECKING: import sys @@ -255,3 +257,47 @@ class Endianness: class CopyNotAllowedError(RuntimeError): """Exception raised when a copy is required, but `allow_copy` is set to `False`.""" + + +class CompatLevel: + """Data structure compatibility level.""" + + def __init__(self) -> None: + msg = "it is not allowed to create a CompatLevel object" + raise TypeError(msg) + + @staticmethod + def _with_version(version: int) -> CompatLevel: + compat_level = CompatLevel.__new__(CompatLevel) + compat_level._version = version # type: ignore[attr-defined] + return compat_level + + @staticmethod + def _newest() -> CompatLevel: + return CompatLevel._future1 # type: ignore[attr-defined] + + @staticmethod + def newest() -> CompatLevel: + """ + Get the highest supported compatibility level. + + .. warning:: + Highest compatibility level is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + """ + issue_unstable_warning( + "Using the highest compatibility level is considered unstable." 
+ ) + return CompatLevel._newest() + + @staticmethod + def oldest() -> CompatLevel: + """Get the most compatible level.""" + return CompatLevel._compatible # type: ignore[attr-defined] + + def __repr__(self) -> str: + return f"<{self.__class__.__module__}.{self.__class__.__qualname__}: {self._version}>" # type: ignore[attr-defined] + + +CompatLevel._compatible = CompatLevel._with_version(0) # type: ignore[attr-defined] +CompatLevel._future1 = CompatLevel._with_version(1) # type: ignore[attr-defined] diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py index e24b6f258579..e8971b08660d 100644 --- a/py-polars/polars/io/_utils.py +++ b/py-polars/polars/io/_utils.py @@ -281,10 +281,6 @@ def is_glob_pattern(file: str) -> bool: return any(char in file for char in ["*", "?", "["]) -def is_supported_cloud(file: str) -> bool: - return bool(re.match("^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?)://", file)) - - def is_local_file(file: str) -> bool: try: next(glob.iglob(file, recursive=True)) # noqa: PTH207 diff --git a/py-polars/polars/io/database/_executor.py b/py-polars/polars/io/database/_executor.py index 67492a8bee59..13e5b2cbe037 100644 --- a/py-polars/polars/io/database/_executor.py +++ b/py-polars/polars/io/database/_executor.py @@ -346,15 +346,18 @@ def _is_alchemy_engine(conn: Any) -> bool: @staticmethod def _is_alchemy_session(conn: Any) -> bool: """Check if the cursor/connection/session object is async.""" - from sqlalchemy.ext.asyncio import ( - AsyncSession, - async_sessionmaker, - ) + from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import Session, sessionmaker - return isinstance( - conn, (Session, sessionmaker, AsyncSession, async_sessionmaker) - ) + if isinstance(conn, (AsyncSession, Session, sessionmaker)): + return True + + try: + from sqlalchemy.ext.asyncio import async_sessionmaker + + return isinstance(conn, async_sessionmaker) + except ImportError: + return False def _normalise_cursor(self, conn: Any) -> 
Cursor: """Normalise a connection object such that we have the query executor.""" diff --git a/py-polars/polars/io/database/_inference.py b/py-polars/polars/io/database/_inference.py index f9045e00bdb6..7b48a3fc2377 100644 --- a/py-polars/polars/io/database/_inference.py +++ b/py-polars/polars/io/database/_inference.py @@ -203,7 +203,7 @@ def _infer_dtype_from_cursor_description( description: tuple[Any, ...], ) -> PolarsDataType | None: """Attempt to infer Polars dtype from database cursor description `type_code`.""" - type_code, _disp_size, internal_size, precision, scale, _null_ok = description + type_code, _disp_size, internal_size, precision, scale, *_ = description dtype: PolarsDataType | None = None if isclass(type_code): diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index deb9b7a8956b..106a025bf679 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -27,6 +27,7 @@ if TYPE_CHECKING: from polars import DataFrame, DataType, LazyFrame + from polars._typing import SchemaDict @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @@ -313,6 +314,9 @@ def scan_ipc( memory_map: bool = True, retries: int = 0, file_cache_ttl: int | None = None, + hive_partitioning: bool | None = None, + hive_schema: SchemaDict | None = None, + try_parse_hive_dates: bool = True, ) -> LazyFrame: """ Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns. @@ -349,6 +353,20 @@ def scan_ipc( Amount of time to keep downloaded cloud files since their last access time, in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable (which defaults to 1 hour) if not given. + hive_partitioning + Infer statistics and schema from Hive partitioned URL and use them + to prune reads. This is unset by default (i.e. `None`), meaning it is + automatically enabled when a single directory is passed, and otherwise + disabled. 
+ hive_schema + The column names and data types of the columns by which the data is partitioned. + If set to `None` (default), the schema of the Hive partitions is inferred. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + try_parse_hive_dates + Whether to try parsing hive values as date/datetime types. """ if isinstance(source, (str, Path)): @@ -382,5 +400,8 @@ def scan_ipc( cloud_options=storage_options, retries=retries, file_cache_ttl=file_cache_ttl, + hive_partitioning=hive_partitioning, + hive_schema=hive_schema, + try_parse_hive_dates=try_parse_hive_dates, ) return wrap_ldf(pylf) diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py index ccaf721bbe58..83152753185e 100644 --- a/py-polars/polars/io/spreadsheet/functions.py +++ b/py-polars/polars/io/spreadsheet/functions.py @@ -22,6 +22,7 @@ Int64, Null, String, + UInt8, ) from polars.datatypes.group import FLOAT_DTYPES, INTEGER_DTYPES, NUMERIC_DTYPES from polars.dependencies import import_optional @@ -506,6 +507,7 @@ def _read_spreadsheet( read_options = (read_options or {}).copy() engine_options = (engine_options or {}).copy() + schema_overrides = dict(schema_overrides or {}) # normalise some top-level parameters to 'read_options' entries if engine == "calamine": @@ -872,7 +874,7 @@ def _read_spreadsheet_calamine( elif base_dtype == Duration: parser_dtypes[name] = "duration" elif base_dtype == Boolean: - parser_dtypes[name] = "bool" + parser_dtypes[name] = "boolean" read_options["dtypes"] = parser_dtypes @@ -936,6 +938,13 @@ def _read_spreadsheet_xlsx2csv( if columns: read_options["columns"] = columns + cast_to_boolean = [] + if schema_overrides: + for col, dtype in schema_overrides.items(): + if dtype == Boolean: + schema_overrides[col] = UInt8 # type: ignore[index] + cast_to_boolean.append(F.col(col).cast(Boolean)) + df = _csv_buffer_to_frame( csv_buffer, 
separator=",", @@ -943,4 +952,7 @@ def _read_spreadsheet_xlsx2csv( schema_overrides=schema_overrides, raise_if_empty=raise_if_empty, ) + if cast_to_boolean: + df = df.with_columns(*cast_to_boolean) + return _reorder_columns(df, columns) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 096e5e40f74e..6f60a9d3326e 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2154,7 +2154,7 @@ def sink_parquet( compression_level: int | None = None, statistics: bool | str | dict[str, bool] = True, row_group_size: int | None = None, - data_pagesize_limit: int | None = None, + data_page_size: int | None = None, maintain_order: bool = True, type_coercion: bool = True, predicate_pushdown: bool = True, @@ -2209,7 +2209,7 @@ def sink_parquet( If None (default), the chunks of the `DataFrame` are used. Writing in smaller chunks may reduce memory pressure and improve writing speeds. - data_pagesize_limit + data_page_size Size limit of individual data pages. 
If not set defaults to 1024 * 1024 bytes maintain_order @@ -2269,7 +2269,7 @@ def sink_parquet( compression_level=compression_level, statistics=statistics, row_group_size=row_group_size, - data_pagesize_limit=data_pagesize_limit, + data_page_size=data_page_size, maintain_order=maintain_order, ) @@ -2475,7 +2475,7 @@ def sink_csv( ) return lf.sink_csv( - path=path, + path=normalize_filepath(path), include_bom=include_bom, include_header=include_header, separator=ord(separator), @@ -4060,6 +4060,9 @@ def join( * *left* Returns all rows from the left table, and the matched rows from the right table + * *right* + Returns all rows from the right table, and the matched rows from the + left table * *full* Returns all rows when there is a match in either left or right table * *cross* diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 75a17453613e..bd3d6a45a43f 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -98,6 +98,7 @@ from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa from polars.exceptions import ComputeError, ModuleUpgradeRequiredError, ShapeError +from polars.interchange.protocol import CompatLevel from polars.series.array import ArrayNameSpace from polars.series.binary import BinaryNameSpace from polars.series.categorical import CatNameSpace @@ -502,6 +503,15 @@ def _from_buffers( validity = validity._s return cls._from_pyseries(PySeries._from_buffers(dtype, data, validity)) + @staticmethod + def _newest_compat_level() -> int: + """ + Get the newest supported compat level. + + This is for pyo3-polars. 
+ """ + return CompatLevel._newest()._version # type: ignore[attr-defined] + @property def dtype(self) -> DataType: """ @@ -4342,7 +4352,8 @@ def to_torch(self) -> torch.Tensor: # tensor.rename(self.name) return tensor - def to_arrow(self, *, future: bool = False) -> pa.Array: + @deprecate_renamed_parameter("future", "compat_level", version="1.1") + def to_arrow(self, *, compat_level: CompatLevel | None = None) -> pa.Array: """ Return the underlying Arrow array. @@ -4350,13 +4361,9 @@ def to_arrow(self, *, future: bool = False) -> pa.Array: Parameters ---------- - future - Setting this to `True` will write Polars' internal data structures that - might not be available by other Arrow implementations. - - .. warning:: - This functionality is considered **unstable**. It may be changed - at any point without it being considered a breaking change. + compat_level + Use a specific compatibility level + when exporting Polars' internal data structures. Examples -------- @@ -4370,7 +4377,11 @@ def to_arrow(self, *, future: bool = False) -> pa.Array: 3 ] """ - return self._s.to_arrow(future) + if compat_level is None: + compat_level = False # type: ignore[assignment] + elif isinstance(compat_level, CompatLevel): + compat_level = compat_level._version # type: ignore[attr-defined] + return self._s.to_arrow(compat_level) def to_pandas( self, *, use_pyarrow_extension_array: bool = False, **kwargs: Any diff --git a/py-polars/polars/sql/context.py b/py-polars/polars/sql/context.py index 0d207ac3a1fd..85101e65ed4b 100644 --- a/py-polars/polars/sql/context.py +++ b/py-polars/polars/sql/context.py @@ -279,7 +279,7 @@ def execute_global( possible_names = ( { nm.strip('"') - for nm in re.split(r"\s", q[1]) + for nm in re.split(r"\b", q[1]) if re.match(r'^("[^"]+")$', nm) or nm.isidentifier() } if len(q) > 1 diff --git a/py-polars/src/conversion/mod.rs b/py-polars/src/conversion/mod.rs index 0049ab967598..0451c9f8eddf 100644 --- a/py-polars/src/conversion/mod.rs +++ 
b/py-polars/src/conversion/mod.rs @@ -775,6 +775,7 @@ impl<'py> FromPyObject<'py> for Wrap { let parsed = match &*ob.extract::()? { "inner" => JoinType::Inner, "left" => JoinType::Left, + "right" => JoinType::Right, "full" => JoinType::Full, "semi" => JoinType::Semi, "anti" => JoinType::Anti, @@ -1181,3 +1182,28 @@ where { container.into_iter().map(|s| s.as_ref().into()).collect() } + +#[derive(Debug, Copy, Clone)] +pub struct PyCompatLevel(pub CompatLevel); + +impl<'a> FromPyObject<'a> for PyCompatLevel { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { + Ok(PyCompatLevel(if let Ok(level) = ob.extract::() { + if let Ok(compat_level) = CompatLevel::with_level(level) { + compat_level + } else { + return Err(PyValueError::new_err("invalid compat level")); + } + } else if let Ok(future) = ob.extract::() { + if future { + CompatLevel::newest() + } else { + CompatLevel::oldest() + } + } else { + return Err(PyTypeError::new_err( + "'compat_level' argument accepts int or bool", + )); + })) + } +} diff --git a/py-polars/src/dataframe/export.rs b/py-polars/src/dataframe/export.rs index c22d974407f4..0b0d0a4f9020 100644 --- a/py-polars/src/dataframe/export.rs +++ b/py-polars/src/dataframe/export.rs @@ -7,6 +7,7 @@ use pyo3::types::{PyList, PyTuple}; use super::*; use crate::conversion::{ObjectValue, Wrap}; use crate::interop; +use crate::prelude::PyCompatLevel; #[pymethods] impl PyDataFrame { @@ -63,7 +64,7 @@ impl PyDataFrame { } #[allow(clippy::wrong_self_convention)] - pub fn to_arrow(&mut self, future: bool) -> PyResult> { + pub fn to_arrow(&mut self, compat_level: PyCompatLevel) -> PyResult> { self.df.align_chunks(); Python::with_gil(|py| { let pyarrow = py.import_bound("pyarrow")?; @@ -71,7 +72,7 @@ impl PyDataFrame { let rbs = self .df - .iter_chunks(future, true) + .iter_chunks(compat_level.0, true) .map(|rb| interop::arrow::to_py::to_py_rb(&rb, &names, py, &pyarrow)) .collect::>()?; Ok(rbs) @@ -104,7 +105,7 @@ impl PyDataFrame { .collect::>(); let rbs = self 
.df - .iter_chunks(false, true) + .iter_chunks(CompatLevel::oldest(), true) .map(|rb| { let mut rb = rb.into_arrays(); for i in &cat_columns { diff --git a/py-polars/src/dataframe/io.rs b/py-polars/src/dataframe/io.rs index b363def33d14..61afd6fc1b74 100644 --- a/py-polars/src/dataframe/io.rs +++ b/py-polars/src/dataframe/io.rs @@ -18,6 +18,7 @@ use crate::file::{ get_either_file, get_file_like, get_mmap_bytes_reader, get_mmap_bytes_reader_and_path, read_if_bytesio, EitherRustPythonFile, }; +use crate::prelude::PyCompatLevel; #[pymethods] impl PyDataFrame { @@ -402,6 +403,49 @@ impl PyDataFrame { Ok(()) } + #[cfg(feature = "parquet")] + #[pyo3(signature = (py_f, partition_by, chunk_size_bytes, compression, compression_level, statistics, row_group_size, data_page_size))] + pub fn write_parquet_partitioned( + &mut self, + py: Python, + py_f: PyObject, + partition_by: Vec, + chunk_size_bytes: usize, + compression: &str, + compression_level: Option, + statistics: Wrap, + row_group_size: Option, + data_page_size: Option, + ) -> PyResult<()> { + use std::path::Path; + + use polars_io::partition::write_partitioned_dataset; + + let Ok(path) = py_f.extract::(py) else { + return Err(PyPolarsErr::from(polars_err!(ComputeError: "expected path-like")).into()); + }; + let path = Path::new(&*path); + let compression = parse_parquet_compression(compression, compression_level)?; + + let write_options = ParquetWriteOptions { + compression, + statistics: statistics.0, + row_group_size, + data_page_size, + maintain_order: true, + }; + + write_partitioned_dataset( + &self.df, + path, + partition_by.as_slice(), + &write_options, + chunk_size_bytes, + ) + .map_err(PyPolarsErr::from)?; + Ok(()) + } + #[cfg(feature = "json")] pub fn write_json(&mut self, py_f: PyObject) -> PyResult<()> { let file = BufWriter::new(get_file_like(py_f, true)?); @@ -431,7 +475,7 @@ impl PyDataFrame { py: Python, py_f: PyObject, compression: Wrap>, - future: bool, + compat_level: PyCompatLevel, ) -> 
PyResult<()> { let either = get_either_file(py_f, true)?; if let EitherRustPythonFile::Rust(ref f) = either { @@ -441,7 +485,7 @@ impl PyDataFrame { py.allow_threads(|| { IpcWriter::new(&mut buf) .with_compression(compression.0) - .with_pl_flavor(future) + .with_compat_level(compat_level.0) .finish(&mut self.df) .map_err(PyPolarsErr::from) })?; @@ -454,13 +498,13 @@ impl PyDataFrame { py: Python, py_f: PyObject, compression: Wrap>, - future: bool, + compat_level: PyCompatLevel, ) -> PyResult<()> { let mut buf = get_file_like(py_f, true)?; py.allow_threads(|| { IpcStreamWriter::new(&mut buf) .with_compression(compression.0) - .with_pl_flavor(future) + .with_compat_level(compat_level.0) .finish(&mut self.df) .map_err(PyPolarsErr::from) })?; diff --git a/py-polars/src/dataframe/serde.rs b/py-polars/src/dataframe/serde.rs index 131262acf9c8..524d894786cd 100644 --- a/py-polars/src/dataframe/serde.rs +++ b/py-polars/src/dataframe/serde.rs @@ -2,12 +2,12 @@ use std::io::{BufReader, BufWriter, Cursor}; use std::ops::Deref; use polars_io::mmap::ReaderBytes; -use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::PyBytes; use super::PyDataFrame; use crate::error::PyPolarsErr; +use crate::exceptions::ComputeError; use crate::file::{get_file_like, get_mmap_bytes_reader}; use crate::prelude::*; @@ -18,7 +18,7 @@ impl PyDataFrame { // Used in pickle/pickling let mut buf: Vec = vec![]; IpcStreamWriter::new(&mut buf) - .with_pl_flavor(true) + .with_compat_level(CompatLevel::newest()) .finish(&mut self.df.clone()) .expect("ipc writer"); Ok(PyBytes::new_bound(py, &buf).to_object(py)) @@ -48,7 +48,7 @@ impl PyDataFrame { let file = get_file_like(py_f, true)?; let writer = BufWriter::new(file); ciborium::into_writer(&self.df, writer) - .map_err(|err| PyValueError::new_err(format!("{err:?}"))) + .map_err(|err| ComputeError::new_err(err.to_string())) } /// Serialize into a JSON string. 
@@ -57,7 +57,7 @@ impl PyDataFrame { let file = get_file_like(py_f, true)?; let writer = BufWriter::new(file); serde_json::to_writer(writer, &self.df) - .map_err(|err| PyValueError::new_err(format!("{err:?}"))) + .map_err(|err| ComputeError::new_err(err.to_string())) } /// Deserialize a file-like object containing binary data into a DataFrame. @@ -66,7 +66,7 @@ impl PyDataFrame { let file = get_file_like(py_f, false)?; let reader = BufReader::new(file); let df = ciborium::from_reader::(reader) - .map_err(|err| PyValueError::new_err(format!("{err:?}")))?; + .map_err(|err| ComputeError::new_err(err.to_string()))?; Ok(df.into()) } @@ -81,14 +81,9 @@ impl PyDataFrame { py.allow_threads(move || { let mmap_read: ReaderBytes = (&mut mmap_bytes_r).into(); let bytes = mmap_read.deref(); - match serde_json::from_slice::(bytes) { - Ok(df) => Ok(df.into()), - Err(e) => { - let msg = format!("{e}"); - let e = PyPolarsErr::from(PolarsError::ComputeError(msg.into())); - Err(PyErr::from(e)) - }, - } + let df = serde_json::from_slice::(bytes) + .map_err(|err| ComputeError::new_err(err.to_string()))?; + Ok(df.into()) }) } } diff --git a/py-polars/src/expr/serde.rs b/py-polars/src/expr/serde.rs index a2107ee67665..8045a1076d39 100644 --- a/py-polars/src/expr/serde.rs +++ b/py-polars/src/expr/serde.rs @@ -1,14 +1,13 @@ use std::io::{BufReader, BufWriter, Cursor}; use polars::lazy::prelude::Expr; -use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::pybacked::PyBackedBytes; use pyo3::types::PyBytes; use crate::error::PyPolarsErr; +use crate::exceptions::ComputeError; use crate::file::get_file_like; -use crate::prelude::polars_err; use crate::PyExpr; #[pymethods] @@ -40,7 +39,7 @@ impl PyExpr { let file = get_file_like(py_f, true)?; let writer = BufWriter::new(file); ciborium::into_writer(&self.inner, writer) - .map_err(|err| PyValueError::new_err(format!("{err:?}"))) + .map_err(|err| ComputeError::new_err(err.to_string())) } /// Serialize into a JSON string. 
@@ -49,7 +48,7 @@ impl PyExpr { let file = get_file_like(py_f, true)?; let writer = BufWriter::new(file); serde_json::to_writer(writer, &self.inner) - .map_err(|err| PyValueError::new_err(format!("{err:?}"))) + .map_err(|err| ComputeError::new_err(err.to_string())) } /// Deserialize a file-like object containing binary data into an Expr. @@ -58,7 +57,7 @@ impl PyExpr { let file = get_file_like(py_f, false)?; let reader = BufReader::new(file); let expr = ciborium::from_reader::(reader) - .map_err(|err| PyValueError::new_err(format!("{err:?}")))?; + .map_err(|err| ComputeError::new_err(err.to_string()))?; Ok(expr.into()) } @@ -83,7 +82,7 @@ impl PyExpr { let inner: Expr = serde_json::from_str(json).map_err(|_| { let msg = "could not deserialize input into an expression"; - PyPolarsErr::from(polars_err!(ComputeError: msg)) + ComputeError::new_err(msg) })?; Ok(inner.into()) } diff --git a/py-polars/src/expr/struct.rs b/py-polars/src/expr/struct.rs index 74cff11e6eac..167d1d24a137 100644 --- a/py-polars/src/expr/struct.rs +++ b/py-polars/src/expr/struct.rs @@ -1,5 +1,6 @@ use pyo3::prelude::*; +use crate::error::PyPolarsErr; use crate::expr::ToExprs; use crate::PyExpr; @@ -25,8 +26,14 @@ impl PyExpr { self.inner.clone().struct_().json_encode().into() } - fn struct_with_fields(&self, fields: Vec) -> Self { + fn struct_with_fields(&self, fields: Vec) -> PyResult { let fields = fields.to_exprs(); - self.inner.clone().struct_().with_fields(fields).into() + let e = self + .inner + .clone() + .struct_() + .with_fields(fields) + .map_err(PyPolarsErr::from)?; + Ok(e.into()) } } diff --git a/py-polars/src/file.rs b/py-polars/src/file.rs index 56ff242f06fe..adee1d3125f3 100644 --- a/py-polars/src/file.rs +++ b/py-polars/src/file.rs @@ -51,36 +51,34 @@ impl PyFileLikeObject { Cursor::new(buf) } - /// Same as `PyFileLikeObject::new`, but validates that the underlying + /// Validates that the underlying /// python object has a `read`, `write`, and `seek` methods in respect to 
parameters. /// Will return a `TypeError` if object does not have `read`, `seek`, and `write` methods. - pub fn with_requirements( - object: PyObject, + pub fn ensure_requirements( + object: &Bound, read: bool, write: bool, seek: bool, - ) -> PyResult { - Python::with_gil(|py| { - if read && object.getattr(py, "read").is_err() { - return Err(PyErr::new::( - "Object does not have a .read() method.", - )); - } + ) -> PyResult<()> { + if read && object.getattr("read").is_err() { + return Err(PyErr::new::( + "Object does not have a .read() method.", + )); + } - if seek && object.getattr(py, "seek").is_err() { - return Err(PyErr::new::( - "Object does not have a .seek() method.", - )); - } + if seek && object.getattr("seek").is_err() { + return Err(PyErr::new::( + "Object does not have a .seek() method.", + )); + } - if write && object.getattr(py, "write").is_err() { - return Err(PyErr::new::( - "Object does not have a .write() method.", - )); - } + if write && object.getattr("write").is_err() { + return Err(PyErr::new::( + "Object does not have a .write() method.", + )); + } - Ok(PyFileLikeObject::new(object)) - }) + Ok(()) } } @@ -196,7 +194,7 @@ fn get_either_file_and_path( write: bool, ) -> PyResult<(EitherRustPythonFile, Option)> { Python::with_gil(|py| { - let py_f = py_f.bind(py); + let py_f = py_f.into_bound(py); if let Ok(s) = py_f.extract::>() { let file_path = std::path::Path::new(&*s); let file_path = resolve_homedir(file_path); @@ -208,6 +206,15 @@ fn get_either_file_and_path( Ok((EitherRustPythonFile::Rust(f), Some(file_path))) } else { let io = py.import_bound("io").unwrap(); + let is_utf8_encoding = |py_f: &Bound| -> PyResult { + let encoding = py_f.getattr("encoding")?; + let encoding = encoding.extract::>()?; + Ok(encoding.eq_ignore_ascii_case("utf-8") || encoding.eq_ignore_ascii_case("utf8")) + }; + let flush_file = |py_f: &Bound| -> PyResult<()> { + py_f.getattr("flush")?.call0()?; + Ok(()) + }; #[cfg(target_family = "unix")] if let Some(fd) = 
((py_f.is_exact_instance(&io.getattr("FileIO").unwrap()) || py_f.is_exact_instance(&io.getattr("BufferedReader").unwrap()) @@ -215,22 +222,8 @@ fn get_either_file_and_path( || py_f.is_exact_instance(&io.getattr("BufferedRandom").unwrap()) || py_f.is_exact_instance(&io.getattr("BufferedRWPair").unwrap()) || (py_f.is_exact_instance(&io.getattr("TextIOWrapper").unwrap()) - && py_f - .getattr("encoding") - .ok() - .filter(|encoding| match encoding.extract::>() { - Ok(encoding) => { - encoding.eq_ignore_ascii_case("utf-8") - || encoding.eq_ignore_ascii_case("utf8") - }, - Err(_) => false, - }) - .is_some())) - && (!write - || py_f - .getattr("flush") - .and_then(|flush| flush.call0()) - .is_ok())) + && is_utf8_encoding(&py_f)?)) + && (!write || flush_file(&py_f).is_ok())) .then(|| { py_f.getattr("fileno") .and_then(|fileno| fileno.call0()) @@ -256,7 +249,27 @@ fn get_either_file_and_path( Ensure you pass a path to the file instead of a python file object when possible for best \ performance."); } - let f = PyFileLikeObject::with_requirements(py_f.to_object(py), !write, write, !write)?; + // Unwrap TextIOWrapper + // Allow subclasses to allow things like pytest.capture.CaptureIO + let py_f = if py_f + .is_instance(&io.getattr("TextIOWrapper").unwrap()) + .unwrap_or_default() + { + if !is_utf8_encoding(&py_f)? { + return Err(PyPolarsErr::from( + polars_err!(InvalidOperation: "file encoding is not UTF-8"), + ) + .into()); + } + if write { + flush_file(&py_f)?; + } + py_f.getattr("buffer")? 
+ } else { + py_f + }; + PyFileLikeObject::ensure_requirements(&py_f, !write, write, !write)?; + let f = PyFileLikeObject::new(py_f.to_object(py)); Ok((EitherRustPythonFile::Py(f), None)) } }) diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs index 7345d54f0399..6b63aaa630ed 100644 --- a/py-polars/src/lazyframe/mod.rs +++ b/py-polars/src/lazyframe/mod.rs @@ -310,7 +310,7 @@ impl PyLazyFrame { #[cfg(feature = "ipc")] #[staticmethod] - #[pyo3(signature = (path, paths, n_rows, cache, rechunk, row_index, memory_map, cloud_options, retries, file_cache_ttl))] + #[pyo3(signature = (path, paths, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl))] fn new_from_ipc( path: Option, paths: Vec, @@ -320,6 +320,9 @@ impl PyLazyFrame { row_index: Option<(String, IdxSize)>, memory_map: bool, cloud_options: Option>, + hive_partitioning: Option, + hive_schema: Option>, + try_parse_hive_dates: bool, retries: usize, file_cache_ttl: Option, ) -> PyResult { @@ -357,6 +360,13 @@ impl PyLazyFrame { Some(cloud_options) }; + let hive_options = HiveOptions { + enabled: hive_partitioning, + hive_start_idx: 0, + schema: hive_schema.map(|x| Arc::new(x.0)), + try_parse_dates: try_parse_hive_dates, + }; + let args = ScanArgsIpc { n_rows, cache, @@ -365,6 +375,7 @@ impl PyLazyFrame { memory_map, #[cfg(feature = "cloud")] cloud_options, + hive_options, }; let lf = if let Some(path) = &path { @@ -614,7 +625,7 @@ impl PyLazyFrame { } #[cfg(all(feature = "streaming", feature = "parquet"))] - #[pyo3(signature = (path, compression, compression_level, statistics, row_group_size, data_pagesize_limit, maintain_order))] + #[pyo3(signature = (path, compression, compression_level, statistics, row_group_size, data_page_size, maintain_order))] fn sink_parquet( &self, py: Python, @@ -623,7 +634,7 @@ impl PyLazyFrame { compression_level: Option, statistics: Wrap, row_group_size: Option, - 
data_pagesize_limit: Option, + data_page_size: Option, maintain_order: bool, ) -> PyResult<()> { let compression = parse_parquet_compression(compression, compression_level)?; @@ -632,7 +643,7 @@ impl PyLazyFrame { compression, statistics: statistics.0, row_group_size, - data_pagesize_limit, + data_page_size, maintain_order, }; diff --git a/py-polars/src/lazyframe/serde.rs b/py-polars/src/lazyframe/serde.rs index af49c0f2ebd7..fa51fa9efb37 100644 --- a/py-polars/src/lazyframe/serde.rs +++ b/py-polars/src/lazyframe/serde.rs @@ -1,12 +1,12 @@ use std::io::{BufReader, BufWriter}; -use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::pybacked::PyBackedBytes; use pyo3::types::PyBytes; use super::PyLazyFrame; use crate::error::PyPolarsErr; +use crate::exceptions::ComputeError; use crate::file::get_file_like; use crate::prelude::*; @@ -40,7 +40,7 @@ impl PyLazyFrame { let file = get_file_like(py_f, true)?; let writer = BufWriter::new(file); ciborium::into_writer(&self.ldf.logical_plan, writer) - .map_err(|err| PyValueError::new_err(format!("{err:?}"))) + .map_err(|err| ComputeError::new_err(err.to_string())) } /// Serialize into a JSON string. @@ -49,7 +49,7 @@ impl PyLazyFrame { let file = get_file_like(py_f, true)?; let writer = BufWriter::new(file); serde_json::to_writer(writer, &self.ldf.logical_plan) - .map_err(|err| PyValueError::new_err(format!("{err:?}"))) + .map_err(|err| ComputeError::new_err(err.to_string())) } /// Deserialize a file-like object containing binary data into a LazyFrame. 
@@ -58,7 +58,7 @@ impl PyLazyFrame { let file = get_file_like(py_f, false)?; let reader = BufReader::new(file); let lp = ciborium::from_reader::(reader) - .map_err(|err| PyValueError::new_err(format!("{err:?}")))?; + .map_err(|err| ComputeError::new_err(err.to_string()))?; Ok(LazyFrame::from(lp).into()) } @@ -82,7 +82,7 @@ impl PyLazyFrame { let json = unsafe { std::mem::transmute::<&'_ str, &'static str>(json.as_str()) }; let lp = serde_json::from_str::(json) - .map_err(|err| PyValueError::new_err(format!("{err:?}")))?; + .map_err(|err| ComputeError::new_err(err.to_string()))?; Ok(LazyFrame::from(lp).into()) } } diff --git a/py-polars/src/lazyframe/visitor/nodes.rs b/py-polars/src/lazyframe/visitor/nodes.rs index a706b3e253d2..671fadac7e28 100644 --- a/py-polars/src/lazyframe/visitor/nodes.rs +++ b/py-polars/src/lazyframe/visitor/nodes.rs @@ -448,6 +448,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { options: ( match options.args.how { JoinType::Left => "left", + JoinType::Right => "right", JoinType::Inner => "inner", JoinType::Full => "full", JoinType::AsOf(_) => return Err(PyNotImplementedError::new_err("asof join")), diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index d49f3358cc14..eb320311e7f8 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -145,12 +145,16 @@ impl PySeries { /// Return the underlying Arrow array. 
#[allow(clippy::wrong_self_convention)] - fn to_arrow(&mut self, future: bool) -> PyResult { + fn to_arrow(&mut self, compat_level: PyCompatLevel) -> PyResult { self.rechunk(true); Python::with_gil(|py| { let pyarrow = py.import_bound("pyarrow")?; - interop::arrow::to_py::to_py_array(self.series.to_arrow(0, future), py, &pyarrow) + interop::arrow::to_py::to_py_array( + self.series.to_arrow(0, compat_level.0), + py, + &pyarrow, + ) }) } } diff --git a/py-polars/src/series/mod.rs b/py-polars/src/series/mod.rs index 74d2c7ae8422..899ae3940191 100644 --- a/py-polars/src/series/mod.rs +++ b/py-polars/src/series/mod.rs @@ -665,7 +665,7 @@ impl PySeries { // IPC only support DataFrames so we need to convert it let mut df = self.series.clone().into_frame(); IpcStreamWriter::new(&mut buf) - .with_pl_flavor(true) + .with_compat_level(CompatLevel::newest()) .finish(&mut df) .expect("ipc writer"); Ok(PyBytes::new_bound(py, &buf).to_object(py)) diff --git a/py-polars/tests/unit/dataframe/test_serde.py b/py-polars/tests/unit/dataframe/test_serde.py index b12cd393cb6f..04c453f08080 100644 --- a/py-polars/tests/unit/dataframe/test_serde.py +++ b/py-polars/tests/unit/dataframe/test_serde.py @@ -221,3 +221,11 @@ def test_df_deserialize_validation() -> None: ) with pytest.raises(ComputeError, match=r"lengths don't match"): pl.DataFrame.deserialize(f, format="json") + + +def test_df_serialize_invalid_type() -> None: + df = pl.DataFrame({"a": [object()]}) + with pytest.raises( + ComputeError, match="serializing data of type Object is not supported" + ): + df.serialize() diff --git a/py-polars/tests/unit/datatypes/test_decimal.py b/py-polars/tests/unit/datatypes/test_decimal.py index 7c18886a9975..e9a45080cfa1 100644 --- a/py-polars/tests/unit/datatypes/test_decimal.py +++ b/py-polars/tests/unit/datatypes/test_decimal.py @@ -472,3 +472,16 @@ def test_decimal_supertype() -> None: pl.col("column_0").cast(pl.Decimal(scale=6)) * 1 ) assert q.collect().dtypes[0].is_decimal() + + +def 
test_decimal_raise_oob_precision() -> None: + df = pl.DataFrame({"a": [1.0]}) + # max precision is 38. + with pytest.raises(pl.exceptions.InvalidOperationError): + df.select(b=pl.col("a").cast(pl.Decimal(76, 38))) + + +def test_decimal_dynamic_float_st() -> None: + assert pl.LazyFrame({"a": [D("2.0"), D("0.5")]}).filter( + pl.col("a").is_between(0.45, 0.9) + ).collect().to_dict(as_series=False) == {"a": [D("0.5")]} diff --git a/py-polars/tests/unit/datatypes/test_string.py b/py-polars/tests/unit/datatypes/test_string.py index ce63c4b6b79c..4250b2c23e99 100644 --- a/py-polars/tests/unit/datatypes/test_string.py +++ b/py-polars/tests/unit/datatypes/test_string.py @@ -1,3 +1,5 @@ +import json + import polars as pl from polars.testing import assert_series_equal @@ -28,3 +30,17 @@ def test_utf8_alias_lit() -> None: result = pl.select(a=pl.lit(5, dtype=pl.Utf8)).to_series() expected = pl.Series("a", ["5"], dtype=pl.String) assert_series_equal(result, expected) + + +def test_json_decode_multiple_chunks() -> None: + a = json.dumps({"x": None}) + b = json.dumps({"x": True}) + + df_1 = pl.Series([a]).to_frame("s") + df_2 = pl.Series([b]).to_frame("s") + + df = pl.concat([df_1, df_2]) + + assert df.with_columns(pl.col("s").str.json_decode()).to_dict(as_series=False) == { + "s": [{"x": None}, {"x": True}] + } diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 418baef35529..c51a1f620dd2 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -932,3 +932,32 @@ def test_struct_split_16536() -> None: df = pl.concat([df, df, df, df], rechunk=False) assert df.filter(pl.col("int") == 1).shape == (4, 3) + + +def test_struct_wildcard_expansion_and_exclude() -> None: + df = pl.DataFrame( + { + "id": [1, 2], + "meta_data": [ + {"system_data": "to_remove", "user_data": "keep"}, + {"user_data": "keep_"}, + ], + } + ) + + # ensure wildcard expansion is on input + assert 
df.lazy().select( + pl.col("meta_data").struct.with_fields("*") + ).collect().schema["meta_data"].fields == [ # type: ignore[attr-defined] + pl.Field("system_data", pl.String), + pl.Field("user_data", pl.String), + pl.Field("id", pl.Int64), + pl.Field( + "meta_data", pl.Struct({"system_data": pl.String, "user_data": pl.String}) + ), + ] + + with pytest.raises(pl.exceptions.InvalidOperationError): + df.lazy().select( + pl.col("meta_data").struct.with_fields(pl.field("*").exclude("user_data")) + ).collect() diff --git a/py-polars/tests/unit/interop/numpy/test_numpy.py b/py-polars/tests/unit/interop/numpy/test_numpy.py index 7d36d56cfb0d..84fbe26b9363 100644 --- a/py-polars/tests/unit/interop/numpy/test_numpy.py +++ b/py-polars/tests/unit/interop/numpy/test_numpy.py @@ -73,3 +73,7 @@ def test_numpy_disambiguation() -> None: "b": [1, 2], } assert result == expected + + +def test_respect_dtype_with_series_from_numpy() -> None: + assert pl.Series("foo", np.array([1, 2, 3]), dtype=pl.UInt32).dtype == pl.UInt32 diff --git a/py-polars/tests/unit/interop/test_from_pandas.py b/py-polars/tests/unit/interop/test_from_pandas.py new file mode 100644 index 000000000000..27b4f005e468 --- /dev/null +++ b/py-polars/tests/unit/interop/test_from_pandas.py @@ -0,0 +1,370 @@ +from __future__ import annotations + +from datetime import datetime, timedelta +from typing import TYPE_CHECKING, Any + +import numpy as np +import pandas as pd +import pytest + +import polars as pl +from polars.testing import assert_frame_equal +from polars.testing.asserts.series import assert_series_equal + +if TYPE_CHECKING: + from polars._typing import PolarsDataType + + +def test_from_pandas() -> None: + df = pd.DataFrame( + { + "bools": [False, True, False], + "bools_nulls": [None, True, False], + "int": [1, 2, 3], + "int_nulls": [1, None, 3], + "floats": [1.0, 2.0, 3.0], + "floats_nulls": [1.0, None, 3.0], + "strings": ["foo", "bar", "ham"], + "strings_nulls": ["foo", None, "ham"], + "strings-cat": ["foo", 
"bar", "ham"], + } + ) + df["strings-cat"] = df["strings-cat"].astype("category") + + out = pl.from_pandas(df) + assert out.shape == (3, 9) + assert out.schema == { + "bools": pl.Boolean, + "bools_nulls": pl.Boolean, + "int": pl.Int64, + "int_nulls": pl.Float64, + "floats": pl.Float64, + "floats_nulls": pl.Float64, + "strings": pl.String, + "strings_nulls": pl.String, + "strings-cat": pl.Categorical, + } + assert out.rows() == [ + (False, None, 1, 1.0, 1.0, 1.0, "foo", "foo", "foo"), + (True, True, 2, None, 2.0, None, "bar", None, "bar"), + (False, False, 3, 3.0, 3.0, 3.0, "ham", "ham", "ham"), + ] + + # partial dtype overrides from pandas + overrides = {"int": pl.Int8, "int_nulls": pl.Int32, "floats": pl.Float32} + out = pl.from_pandas(df, schema_overrides=overrides) + for col, dtype in overrides.items(): + assert out.schema[col] == dtype + + +@pytest.mark.parametrize( + "nulls", + [ + [], + [None], + [None, None], + [None, None, None], + ], +) +def test_from_pandas_nulls(nulls: list[None]) -> None: + # empty and/or all null values, no pandas dtype + ps = pd.Series(nulls) + s = pl.from_pandas(ps) + assert nulls == s.to_list() + + +def test_from_pandas_nan_to_null() -> None: + df = pd.DataFrame( + { + "bools_nulls": [None, True, False], + "int_nulls": [1, None, 3], + "floats_nulls": [1.0, None, 3.0], + "strings_nulls": ["foo", None, "ham"], + "nulls": [None, np.nan, np.nan], + } + ) + out_true = pl.from_pandas(df) + out_false = pl.from_pandas(df, nan_to_null=False) + assert all(val is None for val in out_true["nulls"]) + assert all(np.isnan(val) for val in out_false["nulls"][1:]) + + df = pd.Series([2, np.nan, None], name="pd") # type: ignore[assignment] + out_true = pl.from_pandas(df) + out_false = pl.from_pandas(df, nan_to_null=False) + assert [val is None for val in out_true] + assert [np.isnan(val) for val in out_false[1:]] + + +def test_from_pandas_datetime() -> None: + ts = datetime(2021, 1, 1, 20, 20, 20, 20) + pd_s = pd.Series([ts, ts]) + tmp = 
pl.from_pandas(pd_s.to_frame("a")) + s = tmp["a"] + assert s.dt.hour()[0] == 20 + assert s.dt.minute()[0] == 20 + assert s.dt.second()[0] == 20 + + date_times = pd.date_range("2021-06-24 00:00:00", "2021-06-24 09:00:00", freq="1h") + s = pl.from_pandas(date_times) + assert s[0] == datetime(2021, 6, 24, 0, 0) + assert s[-1] == datetime(2021, 6, 24, 9, 0) + + +@pytest.mark.parametrize( + ("index_class", "index_data", "index_params", "expected_data", "expected_dtype"), + [ + (pd.Index, [100, 200, 300], {}, None, pl.Int64), + (pd.Index, [1, 2, 3], {"dtype": "uint32"}, None, pl.UInt32), + (pd.RangeIndex, 5, {}, [0, 1, 2, 3, 4], pl.Int64), + (pd.CategoricalIndex, ["N", "E", "S", "W"], {}, None, pl.Categorical), + ( + pd.DatetimeIndex, + [datetime(1960, 12, 31), datetime(2077, 10, 20)], + {"dtype": "datetime64[ms]"}, + None, + pl.Datetime("ms"), + ), + ( + pd.TimedeltaIndex, + ["24 hours", "2 days 8 hours", "3 days 42 seconds"], + {}, + [timedelta(1), timedelta(days=2, hours=8), timedelta(days=3, seconds=42)], + pl.Duration("ns"), + ), + ], +) +def test_from_pandas_index( + index_class: Any, + index_data: Any, + index_params: dict[str, Any], + expected_data: list[Any] | None, + expected_dtype: PolarsDataType, +) -> None: + if expected_data is None: + expected_data = index_data + + s = pl.from_pandas(index_class(index_data, **index_params)) + assert s.to_list() == expected_data + assert s.dtype == expected_dtype + + +def test_from_pandas_include_indexes() -> None: + data = { + "dtm": [datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)], + "val": [100, 200, 300], + "misc": ["x", "y", "z"], + } + pd_df = pd.DataFrame(data) + + df = pl.from_pandas(pd_df.set_index(["dtm"])) + assert df.to_dict(as_series=False) == { + "val": [100, 200, 300], + "misc": ["x", "y", "z"], + } + + df = pl.from_pandas(pd_df.set_index(["dtm", "val"])) + assert df.to_dict(as_series=False) == {"misc": ["x", "y", "z"]} + + df = pl.from_pandas(pd_df.set_index(["dtm"]), include_index=True) + 
assert df.to_dict(as_series=False) == data + + df = pl.from_pandas(pd_df.set_index(["dtm", "val"]), include_index=True) + assert df.to_dict(as_series=False) == data + + +def test_duplicate_cols_diff_types() -> None: + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1]) + with pytest.raises(ValueError, match="Polars dataframes must have unique string"): + pl.from_pandas(df) + + +def test_from_pandas_duplicated_columns() -> None: + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"]) + with pytest.raises(ValueError, match="Polars dataframes must have unique string"): + pl.from_pandas(df) + + +def test_from_pandas_null() -> None: + # null column is an object dtype, so pl.Utf8 is most close + df = pd.DataFrame([{"a": None}, {"a": None}]) + out = pl.DataFrame(df) + assert out.dtypes == [pl.String] + assert out["a"][0] is None + + df = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}]) + out = pl.DataFrame(df) + assert out.dtypes == [pl.String, pl.Int64] + + +def test_from_pandas_nested_list() -> None: + # this panicked in https://github.com/pola-rs/polars/issues/1615 + pddf = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]} + ) + pldf = pl.from_pandas(pddf) + assert pldf.shape == (4, 2) + assert pldf.rows() == [ + (1, ["x", "y"]), + (2, ["x", "y", "z"]), + (3, ["x"]), + (4, ["x", "y"]), + ] + + +def test_from_pandas_categorical_none() -> None: + s = pd.Series(["a", "b", "c", pd.NA], dtype="category") + out = pl.from_pandas(s) + assert out.dtype == pl.Categorical + assert out.to_list() == ["a", "b", "c", None] + + +def test_from_pandas_dataframe() -> None: + pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + df = pl.from_pandas(pd_df) + assert df.shape == (2, 3) + assert df.rows() == [(1, 2, 3), (4, 5, 6)] + + # if not a pandas dataframe, raise a ValueError + with pytest.raises(TypeError): + _ = pl.from_pandas([1, 2]) # type: ignore[call-overload] + + +def 
test_from_pandas_series() -> None: + pd_series = pd.Series([1, 2, 3], name="pd") + s = pl.from_pandas(pd_series) + assert s.shape == (3,) + assert list(s) == [1, 2, 3] + + +def test_from_empty_pandas() -> None: + pandas_df = pd.DataFrame( + { + "A": [], + "fruits": [], + } + ) + polars_df = pl.from_pandas(pandas_df) + assert polars_df.columns == ["A", "fruits"] + assert polars_df.dtypes == [pl.Float64, pl.Float64] + + +def test_from_null_column() -> None: + df = pl.from_pandas(pd.DataFrame(data=[pd.NA, pd.NA], columns=["n/a"])) + + assert df.shape == (2, 1) + assert df.columns == ["n/a"] + assert df.dtypes[0] == pl.Null + + +def test_from_pandas_ns_resolution() -> None: + df = pd.DataFrame( + [pd.Timestamp(year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)], + columns=["date"], + ) + assert pl.from_pandas(df)[0, 0] == datetime(2021, 1, 1, 1, 0, 1) + + +def test_pandas_string_none_conversion_3298() -> None: + data: dict[str, list[str | None]] = {"col_1": ["a", "b", "c", "d"]} + data["col_1"][0] = None + df_pd = pd.DataFrame(data) + df_pl = pl.DataFrame(df_pd) + assert df_pl.to_series().to_list() == [None, "b", "c", "d"] + + +def test_from_pandas_null_struct_6412() -> None: + data = [ + { + "a": { + "b": None, + }, + }, + {"a": None}, + ] + df_pandas = pd.DataFrame(data) + assert pl.from_pandas(df_pandas).to_dict(as_series=False) == { + "a": [{"b": None}, {"b": None}] + } + + +def test_untrusted_categorical_input() -> None: + df_pd = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) + df = pl.from_pandas(df_pd) + result = df.group_by("x").len() + expected = pl.DataFrame( + {"x": ["x"], "len": [1]}, schema={"x": pl.Categorical, "len": pl.UInt32} + ) + assert_frame_equal(result, expected, categorical_as_str=True) + + +@pytest.fixture() +def _set_pyarrow_unavailable(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "polars._utils.construction.dataframe._PYARROW_AVAILABLE", False + ) + 
monkeypatch.setattr("polars._utils.construction.series._PYARROW_AVAILABLE", False) + + +@pytest.mark.usefixtures("_set_pyarrow_unavailable") +def test_from_pandas_pyarrow_not_available_succeeds() -> None: + data: dict[str, Any] = { + "a": [1, 2], + "b": ["one", "two"], + "c": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"), + "d": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[us]"), + "e": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ms]"), + "f": np.array([1, 2], dtype="timedelta64[ns]"), + "g": np.array([1, 2], dtype="timedelta64[us]"), + "h": np.array([1, 2], dtype="timedelta64[ms]"), + "i": [True, False], + } + + # DataFrame + result = pl.from_pandas(pd.DataFrame(data)) + expected = pl.DataFrame(data) + assert_frame_equal(result, expected) + + # Series + for col in data: + s_pd = pd.Series(data[col]) + result_s = pl.from_pandas(s_pd) + expected_s = pl.Series(data[col]) + assert_series_equal(result_s, expected_s) + + +@pytest.mark.usefixtures("_set_pyarrow_unavailable") +def test_from_pandas_pyarrow_not_available_fails() -> None: + with pytest.raises(ImportError, match="pyarrow is required"): + pl.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64")) + with pytest.raises(ImportError, match="pyarrow is required"): + pl.from_pandas(pd.Series([1, 2, 3], dtype="Int64")) + with pytest.raises(ImportError, match="pyarrow is required"): + pl.from_pandas( + pd.DataFrame({"a": pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()}) + ) + with pytest.raises(ImportError, match="pyarrow is required"): + pl.from_pandas(pd.DataFrame({"a": [None, "foo"]})) + + +def test_from_pandas_nan_to_null_16453(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "polars._utils.construction.dataframe._MIN_NUMPY_SIZE_FOR_MULTITHREADING", 2 + ) + df = pd.DataFrame( + {"a": [np.nan, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]} + ) + result = pl.from_pandas(df, nan_to_null=True) + expected = pl.DataFrame( + {"a": [None, 1.0, 2], 
"b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]} + ) + assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("null", [pd.NA, np.nan, None, float("nan")]) +def test_from_pandas_string_with_natype_17355(null: Any) -> None: + # https://github.com/pola-rs/polars/issues/17355 + + pd_df = pd.DataFrame({"col": ["a", null]}) + result = pl.from_pandas(pd_df) + expected = pl.DataFrame({"col": ["a", None]}) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index 2802023e8b9c..5dd7f442c905 100644 --- a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -1,183 +1,19 @@ from __future__ import annotations -from datetime import date, datetime, time, timedelta, timezone -from typing import TYPE_CHECKING, Any, cast +from datetime import date, datetime, time, timezone +from typing import Any, cast import numpy as np import pandas as pd +import pyarrow import pyarrow as pa import pytest import polars as pl -from polars.exceptions import ComputeError +from polars.exceptions import ComputeError, UnstableWarning +from polars.interchange.protocol import CompatLevel from polars.testing import assert_frame_equal, assert_series_equal -if TYPE_CHECKING: - from polars._typing import PolarsDataType - - -def test_from_pandas() -> None: - df = pd.DataFrame( - { - "bools": [False, True, False], - "bools_nulls": [None, True, False], - "int": [1, 2, 3], - "int_nulls": [1, None, 3], - "floats": [1.0, 2.0, 3.0], - "floats_nulls": [1.0, None, 3.0], - "strings": ["foo", "bar", "ham"], - "strings_nulls": ["foo", None, "ham"], - "strings-cat": ["foo", "bar", "ham"], - } - ) - df["strings-cat"] = df["strings-cat"].astype("category") - - out = pl.from_pandas(df) - assert out.shape == (3, 9) - assert out.schema == { - "bools": pl.Boolean, - "bools_nulls": pl.Boolean, - "int": pl.Int64, - "int_nulls": pl.Float64, - "floats": pl.Float64, - "floats_nulls": 
pl.Float64, - "strings": pl.String, - "strings_nulls": pl.String, - "strings-cat": pl.Categorical, - } - assert out.rows() == [ - (False, None, 1, 1.0, 1.0, 1.0, "foo", "foo", "foo"), - (True, True, 2, None, 2.0, None, "bar", None, "bar"), - (False, False, 3, 3.0, 3.0, 3.0, "ham", "ham", "ham"), - ] - - # partial dtype overrides from pandas - overrides = {"int": pl.Int8, "int_nulls": pl.Int32, "floats": pl.Float32} - out = pl.from_pandas(df, schema_overrides=overrides) - for col, dtype in overrides.items(): - assert out.schema[col] == dtype - - -@pytest.mark.parametrize( - "nulls", - [ - [], - [None], - [None, None], - [None, None, None], - ], -) -def test_from_pandas_nulls(nulls: list[None]) -> None: - # empty and/or all null values, no pandas dtype - ps = pd.Series(nulls) - s = pl.from_pandas(ps) - assert nulls == s.to_list() - - -def test_from_pandas_nan_to_null() -> None: - df = pd.DataFrame( - { - "bools_nulls": [None, True, False], - "int_nulls": [1, None, 3], - "floats_nulls": [1.0, None, 3.0], - "strings_nulls": ["foo", None, "ham"], - "nulls": [None, np.nan, np.nan], - } - ) - out_true = pl.from_pandas(df) - out_false = pl.from_pandas(df, nan_to_null=False) - assert all(val is None for val in out_true["nulls"]) - assert all(np.isnan(val) for val in out_false["nulls"][1:]) - - df = pd.Series([2, np.nan, None], name="pd") # type: ignore[assignment] - out_true = pl.from_pandas(df) - out_false = pl.from_pandas(df, nan_to_null=False) - assert [val is None for val in out_true] - assert [np.isnan(val) for val in out_false[1:]] - - -def test_from_pandas_datetime() -> None: - ts = datetime(2021, 1, 1, 20, 20, 20, 20) - pd_s = pd.Series([ts, ts]) - tmp = pl.from_pandas(pd_s.to_frame("a")) - s = tmp["a"] - assert s.dt.hour()[0] == 20 - assert s.dt.minute()[0] == 20 - assert s.dt.second()[0] == 20 - - date_times = pd.date_range("2021-06-24 00:00:00", "2021-06-24 09:00:00", freq="1h") - s = pl.from_pandas(date_times) - assert s[0] == datetime(2021, 6, 24, 0, 0) - 
assert s[-1] == datetime(2021, 6, 24, 9, 0) - - -@pytest.mark.parametrize( - ("index_class", "index_data", "index_params", "expected_data", "expected_dtype"), - [ - (pd.Index, [100, 200, 300], {}, None, pl.Int64), - (pd.Index, [1, 2, 3], {"dtype": "uint32"}, None, pl.UInt32), - (pd.RangeIndex, 5, {}, [0, 1, 2, 3, 4], pl.Int64), - (pd.CategoricalIndex, ["N", "E", "S", "W"], {}, None, pl.Categorical), - ( - pd.DatetimeIndex, - [datetime(1960, 12, 31), datetime(2077, 10, 20)], - {"dtype": "datetime64[ms]"}, - None, - pl.Datetime("ms"), - ), - ( - pd.TimedeltaIndex, - ["24 hours", "2 days 8 hours", "3 days 42 seconds"], - {}, - [timedelta(1), timedelta(days=2, hours=8), timedelta(days=3, seconds=42)], - pl.Duration("ns"), - ), - ], -) -def test_from_pandas_index( - index_class: Any, - index_data: Any, - index_params: dict[str, Any], - expected_data: list[Any] | None, - expected_dtype: PolarsDataType, -) -> None: - if expected_data is None: - expected_data = index_data - - s = pl.from_pandas(index_class(index_data, **index_params)) - assert s.to_list() == expected_data - assert s.dtype == expected_dtype - - -def test_from_pandas_include_indexes() -> None: - data = { - "dtm": [datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)], - "val": [100, 200, 300], - "misc": ["x", "y", "z"], - } - pd_df = pd.DataFrame(data) - - df = pl.from_pandas(pd_df.set_index(["dtm"])) - assert df.to_dict(as_series=False) == { - "val": [100, 200, 300], - "misc": ["x", "y", "z"], - } - - df = pl.from_pandas(pd_df.set_index(["dtm", "val"])) - assert df.to_dict(as_series=False) == {"misc": ["x", "y", "z"]} - - df = pl.from_pandas(pd_df.set_index(["dtm"]), include_index=True) - assert df.to_dict(as_series=False) == data - - df = pl.from_pandas(pd_df.set_index(["dtm", "val"]), include_index=True) - assert df.to_dict(as_series=False) == data - - -def test_from_pandas_duplicated_columns() -> None: - df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"]) - with 
pytest.raises(ValueError, match="duplicate column names found: "): - pl.from_pandas(df) - def test_arrow_list_roundtrip() -> None: # https://github.com/pola-rs/polars/issues/1064 @@ -243,40 +79,6 @@ def test_arrow_list_chunked_array() -> None: assert s.dtype == pl.List -def test_from_pandas_null() -> None: - # null column is an object dtype, so pl.Utf8 is most close - df = pd.DataFrame([{"a": None}, {"a": None}]) - out = pl.DataFrame(df) - assert out.dtypes == [pl.String] - assert out["a"][0] is None - - df = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}]) - out = pl.DataFrame(df) - assert out.dtypes == [pl.String, pl.Int64] - - -def test_from_pandas_nested_list() -> None: - # this panicked in https://github.com/pola-rs/polars/issues/1615 - pddf = pd.DataFrame( - {"a": [1, 2, 3, 4], "b": [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]} - ) - pldf = pl.from_pandas(pddf) - assert pldf.shape == (4, 2) - assert pldf.rows() == [ - (1, ["x", "y"]), - (2, ["x", "y", "z"]), - (3, ["x"]), - (4, ["x", "y"]), - ] - - -def test_from_pandas_categorical_none() -> None: - s = pd.Series(["a", "b", "c", pd.NA], dtype="category") - out = pl.from_pandas(s) - assert out.dtype == pl.Categorical - assert out.to_list() == ["a", "b", "c", None] - - def test_from_dict() -> None: data = {"a": [1, 2], "b": [3, 4]} df = pl.from_dict(data) @@ -408,24 +210,6 @@ def test_from_arrow() -> None: assert df.schema == {"a": pl.UInt32, "b": pl.UInt64} # type: ignore[union-attr] -def test_from_pandas_dataframe() -> None: - pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) - df = pl.from_pandas(pd_df) - assert df.shape == (2, 3) - assert df.rows() == [(1, 2, 3), (4, 5, 6)] - - # if not a pandas dataframe, raise a ValueError - with pytest.raises(TypeError): - _ = pl.from_pandas([1, 2]) # type: ignore[call-overload] - - -def test_from_pandas_series() -> None: - pd_series = pd.Series([1, 2, 3], name="pd") - s = pl.from_pandas(pd_series) - assert s.shape == (3,) - assert list(s) 
== [1, 2, 3] - - def test_from_optional_not_available() -> None: from polars.dependencies import _LazyModule @@ -473,18 +257,6 @@ def test_no_rechunk() -> None: assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2 -def test_from_empty_pandas() -> None: - pandas_df = pd.DataFrame( - { - "A": [], - "fruits": [], - } - ) - polars_df = pl.from_pandas(pandas_df) - assert polars_df.columns == ["A", "fruits"] - assert polars_df.dtypes == [pl.Float64, pl.Float64] - - def test_from_empty_arrow() -> None: df = cast(pl.DataFrame, pl.from_arrow(pa.table(pd.DataFrame({"a": [], "b": []})))) assert df.columns == ["a", "b"] @@ -508,34 +280,6 @@ def test_from_empty_arrow() -> None: assert df.schema["l"] == pl.List(pl.UInt8) -def test_from_null_column() -> None: - df = pl.from_pandas(pd.DataFrame(data=[pd.NA, pd.NA], columns=["n/a"])) - - assert df.shape == (2, 1) - assert df.columns == ["n/a"] - assert df.dtypes[0] == pl.Null - - -def test_respect_dtype_with_series_from_numpy() -> None: - assert pl.Series("foo", np.array([1, 2, 3]), dtype=pl.UInt32).dtype == pl.UInt32 - - -def test_from_pandas_ns_resolution() -> None: - df = pd.DataFrame( - [pd.Timestamp(year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)], - columns=["date"], - ) - assert cast(datetime, pl.from_pandas(df)[0, 0]) == datetime(2021, 1, 1, 1, 0, 1) - - -def test_pandas_string_none_conversion_3298() -> None: - data: dict[str, list[str | None]] = {"col_1": ["a", "b", "c", "d"]} - data["col_1"][0] = None - df_pd = pd.DataFrame(data) - df_pl = pl.DataFrame(df_pd) - assert df_pl.to_series().to_list() == [None, "b", "c", "d"] - - def test_cat_int_types_3500() -> None: with pl.StringCache(): # Create an enum / categorical / dictionary typed pyarrow array @@ -571,21 +315,6 @@ def test_arrow_list_null_5697() -> None: assert pl.from_arrow(pa_table).schema == {"mycol": pl.List(pl.Null)} # type: ignore[union-attr] -def test_from_pandas_null_struct_6412() -> None: - data = [ - { - "a": { - "b": None, - }, - }, - 
{"a": None}, - ] - df_pandas = pd.DataFrame(data) - assert pl.from_pandas(df_pandas).to_dict(as_series=False) == { - "a": [{"b": None}, {"b": None}] - } - - def test_from_pyarrow_map() -> None: pa_table = pa.table( [[1, 2], [[("a", "something")], [("a", "else"), ("b", "another key")]]], @@ -893,16 +622,6 @@ def test_dataframe_from_repr_custom_separators() -> None: ) -def test_untrusted_categorical_input() -> None: - df_pd = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])}) - df = pl.from_pandas(df_pd) - result = df.group_by("x").len() - expected = pl.DataFrame( - {"x": ["x"], "len": [1]}, schema={"x": pl.Categorical, "len": pl.UInt32} - ) - assert_frame_equal(result, expected, categorical_as_str=True) - - def test_sliced_struct_from_arrow() -> None: # Create a dataset with 3 rows tbl = pa.Table.from_arrays( @@ -972,44 +691,6 @@ def test_from_avro_valid_time_zone_13032() -> None: assert_series_equal(result, expected) -def test_from_pandas_pyarrow_not_available( - monkeypatch: pytest.MonkeyPatch, -) -> None: - monkeypatch.setattr( - "polars._utils.construction.dataframe._PYARROW_AVAILABLE", False - ) - monkeypatch.setattr("polars._utils.construction.series._PYARROW_AVAILABLE", False) - data: dict[str, Any] = { - "a": [1, 2], - "b": ["one", "two"], - "c": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"), - "d": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[us]"), - "e": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ms]"), - "f": np.array([1, 2], dtype="timedelta64[ns]"), - "g": np.array([1, 2], dtype="timedelta64[us]"), - "h": np.array([1, 2], dtype="timedelta64[ms]"), - "i": [True, False], - } - result = pl.from_pandas(pd.DataFrame(data)) - expected = pl.DataFrame(data) - assert_frame_equal(result, expected) - for col in data: - s_pd = pd.Series(data[col]) - result_s = pl.from_pandas(s_pd) - expected_s = pl.Series(data[col]) - assert_series_equal(result_s, expected_s) - with pytest.raises(ImportError, match="pyarrow is 
required"): - pl.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64")) - with pytest.raises(ImportError, match="pyarrow is required"): - pl.from_pandas(pd.Series([1, 2, 3], dtype="Int64")) - with pytest.raises(ImportError, match="pyarrow is required"): - pl.from_pandas( - pd.DataFrame({"a": pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()}) - ) - with pytest.raises(ImportError, match="pyarrow is required"): - pl.from_pandas(pd.DataFrame({"a": [None, "foo"]})) - - def test_from_numpy_different_resolution_15991() -> None: result = pl.Series( np.array(["2020-01-01"], dtype="datetime64[ns]"), dtype=pl.Datetime("us") @@ -1025,15 +706,46 @@ def test_from_numpy_different_resolution_invalid() -> None: ) -def test_from_pandas_nan_to_null_16453(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.setattr( - "polars._utils.construction.dataframe._MIN_NUMPY_SIZE_FOR_MULTITHREADING", 2 +def test_compat_level(monkeypatch: pytest.MonkeyPatch) -> None: + # change these if compat level bumped + monkeypatch.setenv("POLARS_WARN_UNSTABLE", "1") + oldest = CompatLevel.oldest() + assert oldest is CompatLevel.oldest() # test singleton + assert oldest._version == 0 # type: ignore[attr-defined] + with pytest.warns(UnstableWarning): + newest = CompatLevel.newest() + assert newest is CompatLevel.newest() + assert newest._version == 1 # type: ignore[attr-defined] + + str_col = pl.Series(["awd"]) + bin_col = pl.Series([b"dwa"]) + assert str_col._newest_compat_level() == newest._version # type: ignore[attr-defined] + assert isinstance(str_col.to_arrow(), pyarrow.LargeStringArray) + assert isinstance(str_col.to_arrow(compat_level=oldest), pyarrow.LargeStringArray) + assert isinstance(str_col.to_arrow(compat_level=newest), pyarrow.StringViewArray) + assert isinstance(bin_col.to_arrow(), pyarrow.LargeBinaryArray) + assert isinstance(bin_col.to_arrow(compat_level=oldest), pyarrow.LargeBinaryArray) + assert isinstance(bin_col.to_arrow(compat_level=newest), pyarrow.BinaryViewArray) + + 
df = pl.DataFrame({"str_col": str_col, "bin_col": bin_col}) + assert isinstance(df.to_arrow()["str_col"][0], pyarrow.LargeStringScalar) + assert isinstance( + df.to_arrow(compat_level=oldest)["str_col"][0], pyarrow.LargeStringScalar ) - df = pd.DataFrame( - {"a": [np.nan, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]} + assert isinstance( + df.to_arrow(compat_level=newest)["str_col"][0], pyarrow.StringViewScalar ) - result = pl.from_pandas(df, nan_to_null=True) - expected = pl.DataFrame( - {"a": [None, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]} + assert isinstance(df.to_arrow()["bin_col"][0], pyarrow.LargeBinaryScalar) + assert isinstance( + df.to_arrow(compat_level=oldest)["bin_col"][0], pyarrow.LargeBinaryScalar ) - assert_frame_equal(result, expected) + assert isinstance( + df.to_arrow(compat_level=newest)["bin_col"][0], pyarrow.BinaryViewScalar + ) + + assert len(df.write_ipc(None).getbuffer()) == 786 + assert len(df.write_ipc(None, compat_level=oldest).getbuffer()) == 914 + assert len(df.write_ipc(None, compat_level=newest).getbuffer()) == 786 + assert len(df.write_ipc_stream(None).getbuffer()) == 544 + assert len(df.write_ipc_stream(None, compat_level=oldest).getbuffer()) == 672 + assert len(df.write_ipc_stream(None, compat_level=newest).getbuffer()) == 544 diff --git a/py-polars/tests/unit/io/cloud/test_utils.py b/py-polars/tests/unit/io/cloud/test_utils.py deleted file mode 100644 index 90fb23343fe8..000000000000 --- a/py-polars/tests/unit/io/cloud/test_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -from __future__ import annotations - -import pytest - -from polars.io._utils import is_supported_cloud - - -@pytest.mark.parametrize( - ("url", "expected"), - [ - ("s3://bucket/file.tmp", True), - ("s3a://bucket/file.tmp", True), - ("gs://bucket/file.tmp", True), - ("gcs://bucket/file.tmp", True), - ("abfs://container@account/file.tmp", True), - ("abfss://container@account/file.tmp", True), - ("azure://container@account/file.tmp", True), - 
("az://container@account/file.tmp", True), - ("adl://account/file.tmp", True), - ("file:///local/file.tmp", True), - ("/local/file.tmp", False), - ], -) -def test_is_cloud_url(url: str, expected: bool) -> None: - assert is_supported_cloud(url) is expected diff --git a/py-polars/tests/unit/io/database/test_write.py b/py-polars/tests/unit/io/database/test_write.py index 3b16a420db12..1a995e31df64 100644 --- a/py-polars/tests/unit/io/database/test_write.py +++ b/py-polars/tests/unit/io/database/test_write.py @@ -5,6 +5,8 @@ import pytest from sqlalchemy import create_engine +from sqlalchemy.orm import Session +from sqlalchemy.pool import NullPool import polars as pl from polars.io.database._utils import _open_adbc_connection @@ -233,3 +235,86 @@ def test_write_database_errors( match="unrecognised connection type", ): df.write_database(connection=True, table_name="misc") # type: ignore[arg-type] + + +@pytest.mark.write_disk() +def test_write_database_using_sa_session(tmp_path: str) -> None: + df = pl.DataFrame( + { + "key": ["xx", "yy", "zz"], + "value": [123, None, 789], + "other": [5.5, 7.0, None], + } + ) + table_name = "test_sa_session" + test_db_uri = f"sqlite:///{tmp_path}/test_sa_session.db" + engine = create_engine(test_db_uri, poolclass=NullPool) + with Session(engine) as session: + df.write_database(table_name, session) + session.commit() + + with Session(engine) as session: + result = pl.read_database( + query=f"select * from {table_name}", connection=session + ) + + assert_frame_equal(result, df) + + +@pytest.mark.write_disk() +@pytest.mark.parametrize("pass_connection", [True, False]) +def test_write_database_sa_rollback(tmp_path: str, pass_connection: bool) -> None: + df = pl.DataFrame( + { + "key": ["xx", "yy", "zz"], + "value": [123, None, 789], + "other": [5.5, 7.0, None], + } + ) + table_name = "test_sa_rollback" + test_db_uri = f"sqlite:///{tmp_path}/test_sa_rollback.db" + engine = create_engine(test_db_uri, poolclass=NullPool) + with Session(engine) 
as session: + if pass_connection: + conn = session.connection() + df.write_database(table_name, conn) + else: + df.write_database(table_name, session) + session.rollback() + + with Session(engine) as session: + count = pl.read_database( + query=f"select count(*) from {table_name}", connection=session + ).item(0, 0) + + assert isinstance(count, int) + assert count == 0 + + +@pytest.mark.write_disk() +@pytest.mark.parametrize("pass_connection", [True, False]) +def test_write_database_sa_commit(tmp_path: str, pass_connection: bool) -> None: + df = pl.DataFrame( + { + "key": ["xx", "yy", "zz"], + "value": [123, None, 789], + "other": [5.5, 7.0, None], + } + ) + table_name = "test_sa_commit" + test_db_uri = f"sqlite:///{tmp_path}/test_sa_commit.db" + engine = create_engine(test_db_uri, poolclass=NullPool) + with Session(engine) as session: + if pass_connection: + conn = session.connection() + df.write_database(table_name, conn) + else: + df.write_database(table_name, session) + session.commit() + + with Session(engine) as session: + result = pl.read_database( + query=f"select * from {table_name}", connection=session + ) + + assert_frame_equal(result, df) diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index 113d0cedc89c..94dc428a1223 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -2221,3 +2221,29 @@ def test_projection_applied_on_file_with_no_rows_16606(tmp_path: Path) -> None: out = pl.scan_csv(path).select(columns).collect().columns assert out == columns + + +@pytest.mark.write_disk() +def test_write_csv_to_dangling_file_17328( + df_no_lists: pl.DataFrame, tmp_path: Path +) -> None: + tmp_path.mkdir(exist_ok=True) + df_no_lists.write_csv((tmp_path / "dangling.csv").open("w")) + + +def test_write_csv_raise_on_non_utf8_17328( + df_no_lists: pl.DataFrame, tmp_path: Path +) -> None: + tmp_path.mkdir(exist_ok=True) + with pytest.raises(InvalidOperationError, match="file encoding is not 
UTF-8"): + df_no_lists.write_csv((tmp_path / "dangling.csv").open("w", encoding="gbk")) + + +@pytest.mark.write_disk() +def test_write_csv_appending_17328(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + with (tmp_path / "append.csv").open("w") as f: + f.write("# test\n") + pl.DataFrame({"col": ["value"]}).write_csv(f) + with (tmp_path / "append.csv").open("r") as f: + assert f.read() == "# test\ncol\nvalue\n" diff --git a/py-polars/tests/unit/io/test_hive.py b/py-polars/tests/unit/io/test_hive.py index 98ff82408eb6..809f50937467 100644 --- a/py-polars/tests/unit/io/test_hive.py +++ b/py-polars/tests/unit/io/test_hive.py @@ -1,11 +1,9 @@ -import os import sys import urllib.parse import warnings from collections import OrderedDict from datetime import datetime from functools import partial -from multiprocessing import get_context from pathlib import Path from typing import Any, Callable @@ -88,54 +86,16 @@ def test_hive_partitioned_predicate_pushdown( ) -def init_env_spawned_single_threaded_async() -> None: - os.environ["SPAWNED_PROCESS"] = "1" - os.environ["POLARS_MAX_THREADS"] = "1" - os.environ["POLARS_PREFETCH_SIZE"] = "1" - - @pytest.mark.xdist_group("streaming") @pytest.mark.write_disk() -def test_hive_partitioned_predicate_pushdown_single_threaded_async( +def test_hive_partitioned_predicate_pushdown_single_threaded_async_17155( io_files_path: Path, tmp_path: Path, monkeypatch: Any, capfd: Any, ) -> None: - # We need to run this in a separate process to avoid leakage of - # `POLARS_MAX_THREADS`. You can test this locally (on a - # system with > 1 threads) by removing the process-spawning logic and - # directly calling `init_env_spawned_single_threaded_async`, and then - # running: - # ``` - # python -m pytest py-polars/tests/unit/io/ -m '' -k \ - # test_hive_partitioned_predicate_pushdown - # ``` - # And observe that the below assertion of `thread_pool_size` will fail. 
- if "SPAWNED_PROCESS" not in os.environ: - with get_context("spawn").Pool( - 1, initializer=init_env_spawned_single_threaded_async - ) as p: - pytest_path = Path(__file__).relative_to(Path.cwd()) - pytest_path: str = f"{pytest_path}::test_hive_partitioned_predicate_pushdown_single_threaded_async" # type: ignore[no-redef] - - assert ( - p.map( - pytest.main, # type: ignore[arg-type] - [ - [ - pytest_path, - "-m", - "", - ] - ], - )[0] - == 0 - ) - - return - - assert pl.thread_pool_size() == 1 + monkeypatch.setenv("POLARS_FORCE_ASYNC", "1") + monkeypatch.setenv("POLARS_PREFETCH_SIZE", "1") impl_test_hive_partitioned_predicate_pushdown( io_files_path, @@ -336,6 +296,7 @@ def test_read_parquet_hive_schema_with_pyarrow() -> None: ("scan_func", "write_func"), [ (pl.scan_parquet, pl.DataFrame.write_parquet), + (pl.scan_ipc, pl.DataFrame.write_ipc), ], ) @pytest.mark.parametrize( @@ -368,7 +329,10 @@ def test_hive_partition_directory_scan( hive_schema = df.lazy().select("a", "b").collect_schema() scan = scan_func - scan = partial(scan_func, hive_schema=hive_schema, glob=glob) + scan = partial(scan_func, hive_schema=hive_schema) + + if scan_func is pl.scan_parquet: + scan = partial(scan, glob=glob) out = scan( tmp_path, @@ -529,10 +493,20 @@ def test_hive_partition_force_async_17155(tmp_path: Path, monkeypatch: Any) -> N ) +@pytest.mark.parametrize( + ("scan_func", "write_func"), + [ + (pl.scan_parquet, pl.DataFrame.write_parquet), + (pl.scan_ipc, pl.DataFrame.write_ipc), + ], +) @pytest.mark.write_disk() @pytest.mark.parametrize("projection_pushdown", [True, False]) def test_hive_partition_columns_contained_in_file( - tmp_path: Path, projection_pushdown: bool + tmp_path: Path, + scan_func: Callable[[Any], pl.LazyFrame], + write_func: Callable[[pl.DataFrame, Path], None], + projection_pushdown: bool, ) -> None: path = tmp_path / "a=1/b=2/data.bin" path.parent.mkdir(exist_ok=True, parents=True) @@ -540,7 +514,7 @@ def test_hive_partition_columns_contained_in_file( {"x": 1, 
"a": 1, "b": 2, "y": 1}, schema={"x": pl.Int32, "a": pl.Int8, "b": pl.Int16, "y": pl.Int32}, ) - df.write_parquet(path) + write_func(df, path) def assert_with_projections(lf: pl.LazyFrame, df: pl.DataFrame) -> None: for projection in [ @@ -561,12 +535,12 @@ def assert_with_projections(lf: pl.LazyFrame, df: pl.DataFrame) -> None: df.select(projection), ) - lf = pl.scan_parquet(path, hive_partitioning=True) + lf = scan_func(path, hive_partitioning=True) # type: ignore[call-arg] rhs = df assert_frame_equal(lf.collect(projection_pushdown=projection_pushdown), rhs) assert_with_projections(lf, rhs) - lf = pl.scan_parquet( + lf = scan_func( # type: ignore[call-arg] path, hive_schema={"a": pl.String, "b": pl.String}, hive_partitioning=True, @@ -580,7 +554,7 @@ def assert_with_projections(lf: pl.LazyFrame, df: pl.DataFrame) -> None: @pytest.mark.write_disk() -def test_hive_partition_dates(tmp_path: Path, monkeypatch: Any) -> None: +def test_hive_partition_dates(tmp_path: Path) -> None: df = pl.DataFrame( { "date1": [ @@ -646,3 +620,112 @@ def test_hive_partition_dates(tmp_path: Path, monkeypatch: Any) -> None: lf.collect(), df.with_columns(pl.col("date1", "date2").cast(pl.String)), ) + + +@pytest.mark.parametrize( + ("scan_func", "write_func"), + [ + (pl.scan_parquet, pl.DataFrame.write_parquet), + (pl.scan_ipc, pl.DataFrame.write_ipc), + ], +) +@pytest.mark.write_disk() +def test_projection_only_hive_parts_gives_correct_number_of_rows( + tmp_path: Path, + scan_func: Callable[[Any], pl.LazyFrame], + write_func: Callable[[pl.DataFrame, Path], None], +) -> None: + # Check the number of rows projected when projecting only hive parts, which + # should be the same as the number of rows in the file. 
+ path = tmp_path / "a=3/data.bin" + path.parent.mkdir(exist_ok=True, parents=True) + + write_func(pl.DataFrame({"x": [1, 1, 1]}), path) + + assert_frame_equal( + scan_func(path, hive_partitioning=True).select("a").collect(), # type: ignore[call-arg] + pl.DataFrame({"a": [3, 3, 3]}), + ) + + +@pytest.mark.parametrize( + "df", + [ + pl.select( + pl.Series("a", [1, 2, 3, 4], dtype=pl.Int8), + pl.Series("b", [1, 2, 3, 4], dtype=pl.Int8), + pl.Series("x", [1, 2, 3, 4]), + ), + pl.select( + pl.Series( + "a", + [1.2981275, 2.385974035, 3.1231892749185718397510, 4.129387128949156], + dtype=pl.Float64, + ), + pl.Series("b", ["a", "b", " / c = : ", "d"]), + pl.Series("x", [1, 2, 3, 4]), + ), + ], +) +@pytest.mark.write_disk() +def test_hive_write(tmp_path: Path, df: pl.DataFrame) -> None: + root = tmp_path + df.write_parquet_partitioned(root, ["a", "b"]) + + lf = pl.scan_parquet(root) + assert_frame_equal(lf.collect(), df) + + lf = pl.scan_parquet(root, hive_schema={"a": pl.String, "b": pl.String}) + assert_frame_equal(lf.collect(), df.with_columns(pl.col("a", "b").cast(pl.String))) + + +@pytest.mark.slow() +@pytest.mark.write_disk() +def test_hive_write_multiple_files(tmp_path: Path, monkeypatch: Any) -> None: + chunk_size = 262_144 + n_rows = 100_000 + df = pl.select(a=pl.repeat(0, n_rows), b=pl.int_range(0, n_rows)) + + n_files = int(df.estimated_size() / chunk_size) + + assert n_files > 1, "increase df size or decrease file size" + + root = tmp_path + df.write_parquet_partitioned(root, ["a"], chunk_size_bytes=chunk_size) + + assert sum(1 for _ in (root / "a=0").iterdir()) == n_files + assert_frame_equal(pl.scan_parquet(root).collect(), df) + + +@pytest.mark.write_disk() +def test_hive_write_dates(tmp_path: Path) -> None: + df = pl.DataFrame( + { + "date1": [ + datetime(2024, 1, 1), + datetime(2024, 2, 1), + datetime(2024, 3, 1), + None, + ], + "date2": [ + datetime(2023, 1, 1), + datetime(2023, 2, 1), + None, + datetime(2023, 3, 1, 1, 1, 1, 1), + ], + "x": [1, 2, 3, 4], 
+ }, + schema={"date1": pl.Date, "date2": pl.Datetime, "x": pl.Int32}, + ) + + root = tmp_path + df.write_parquet_partitioned(root, ["date1", "date2"]) + + lf = pl.scan_parquet(root) + assert_frame_equal(lf.collect(), df) + + lf = pl.scan_parquet(root, try_parse_hive_dates=False) + assert_frame_equal( + lf.collect(), + df.with_columns(pl.col("date1", "date2").cast(pl.String)), + ) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 1741ff116b63..1c0b7ed4516c 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -11,6 +11,7 @@ import polars as pl from polars.exceptions import ComputeError +from polars.interchange.protocol import CompatLevel from polars.testing import assert_frame_equal if TYPE_CHECKING: @@ -234,7 +235,7 @@ def test_from_float16() -> None: def test_binview_ipc_mmap(tmp_path: Path) -> None: df = pl.DataFrame({"foo": ["aa" * 10, "bb", None, "small", "big" * 20]}) file_path = tmp_path / "dump.ipc" - df.write_ipc(file_path, future=True) + df.write_ipc(file_path, compat_level=CompatLevel.newest()) read = pl.read_ipc(file_path, memory_map=True) assert_frame_equal(df, read) @@ -243,7 +244,7 @@ def test_list_nested_enum() -> None: dtype = pl.List(pl.Enum(["a", "b", "c"])) df = pl.DataFrame(pl.Series("list_cat", [["a", "b", "c", None]], dtype=dtype)) buffer = io.BytesIO() - df.write_ipc(buffer) + df.write_ipc(buffer, compat_level=CompatLevel.newest()) df = pl.read_ipc(buffer) assert df.get_column("list_cat").dtype == dtype @@ -256,7 +257,7 @@ def test_struct_nested_enum() -> None: ) ) buffer = io.BytesIO() - df.write_ipc(buffer) + df.write_ipc(buffer, compat_level=CompatLevel.newest()) df = pl.read_ipc(buffer) assert df.get_column("struct_cat").dtype == dtype @@ -268,7 +269,7 @@ def test_ipc_view_gc_14448() -> None: df = pl.DataFrame( pl.Series(["small"] * 10 + ["looooooong string......."] * 750).slice(20, 20) ) - df.write_ipc(f, future=True) + df.write_ipc(f, 
compat_level=CompatLevel.newest()) f.seek(0) assert_frame_equal(pl.read_ipc(f), df) diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 49477409d8d1..6c4fb8ce3973 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -496,6 +496,7 @@ def test_scan_directory( assert_frame_equal(out, df) +@pytest.mark.write_disk() def test_scan_glob_excludes_directories(tmp_path: Path) -> None: for dir in ["dir1", "dir2", "dir3"]: (tmp_path / dir).mkdir() @@ -515,6 +516,7 @@ def test_scan_glob_excludes_directories(tmp_path: Path) -> None: @pytest.mark.parametrize("file_name", ["a b", "a %25 b"]) +@pytest.mark.write_disk() def test_scan_async_whitespace_in_path( tmp_path: Path, monkeypatch: Any, file_name: str ) -> None: @@ -529,3 +531,42 @@ def test_scan_async_whitespace_in_path( assert_frame_equal(pl.scan_parquet(tmp_path / "*").collect(), df) assert_frame_equal(pl.scan_parquet(tmp_path / "*.parquet").collect(), df) path.unlink() + + +@pytest.mark.write_disk() +def test_path_expansion_excludes_empty_files_17362(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + df = pl.DataFrame({"x": 1}) + df.write_parquet(tmp_path / "data.parquet") + (tmp_path / "empty").touch() + + assert_frame_equal(pl.scan_parquet(tmp_path).collect(), df) + assert_frame_equal(pl.scan_parquet(tmp_path / "*").collect(), df) + + +@pytest.mark.write_disk() +def test_scan_single_dir_differing_file_extensions_raises_17436(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + df = pl.DataFrame({"x": 1}) + df.write_parquet(tmp_path / "data.parquet") + df.write_ipc(tmp_path / "data.ipc") + + with pytest.raises( + pl.exceptions.InvalidOperationError, match="different file extensions" + ): + pl.scan_parquet(tmp_path).collect() + + for lf in [ + pl.scan_parquet(tmp_path / "*.parquet"), + pl.scan_ipc(tmp_path / "*.ipc"), + ]: + assert_frame_equal(lf.collect(), df) + + # Ensure passing a glob doesn't trigger file extension 
checking + with pytest.raises( + pl.exceptions.ComputeError, + match="parquet: File out of specification: The file must end with PAR1", + ): + pl.scan_parquet(tmp_path / "*").collect() diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py index 9b891f305149..744c029ae059 100644 --- a/py-polars/tests/unit/io/test_spreadsheet.py +++ b/py-polars/tests/unit/io/test_spreadsheet.py @@ -17,7 +17,7 @@ from tests.unit.conftest import FLOAT_DTYPES, NUMERIC_DTYPES if TYPE_CHECKING: - from polars._typing import ExcelSpreadsheetEngine, SchemaDict, SelectorType + from polars._typing import ExcelSpreadsheetEngine, SelectorType pytestmark = pytest.mark.slow() @@ -209,39 +209,35 @@ def test_read_excel_all_sheets( @pytest.mark.parametrize( - ("engine", "schema_overrides"), - [ - ("xlsx2csv", {"datetime": pl.Datetime}), - ("calamine", None), - ("openpyxl", None), - ], + "engine", + ["xlsx2csv", "calamine", "openpyxl"], ) -def test_read_excel_basic_datatypes( - engine: ExcelSpreadsheetEngine, - schema_overrides: SchemaDict | None, -) -> None: +def test_read_excel_basic_datatypes(engine: ExcelSpreadsheetEngine) -> None: df = pl.DataFrame( { "A": [1, 2, 3, 4, 5], "fruits": ["banana", "banana", "apple", "apple", "banana"], "floats": [1.1, 1.2, 1.3, 1.4, 1.5], "datetime": [datetime(2023, 1, x) for x in range(1, 6)], - "nulls": [1, None, None, None, 1], - } + "nulls": [1, None, None, None, 0], + }, ) xls = BytesIO() df.write_excel(xls, position="C5") - # check if can be read as it was written + schema_overrides = {"datetime": pl.Datetime, "nulls": pl.Boolean} + df_compare = df.with_columns( + pl.col(nm).cast(tp) for nm, tp in schema_overrides.items() + ) for sheet_id, sheet_name in ((None, None), (1, None), (None, "Sheet1")): - df = pl.read_excel( + df_from_excel = pl.read_excel( xls, sheet_id=sheet_id, sheet_name=sheet_name, engine=engine, schema_overrides=schema_overrides, ) - assert_frame_equal(df, df) + assert_frame_equal(df_compare, 
df_from_excel) # check some additional overrides # (note: xlsx2csv can't currently convert datetime with trailing '00:00:00' to date) diff --git a/py-polars/tests/unit/lazyframe/test_serde.py b/py-polars/tests/unit/lazyframe/test_serde.py index 941cf6996cd3..5ec0e6fd14c1 100644 --- a/py-polars/tests/unit/lazyframe/test_serde.py +++ b/py-polars/tests/unit/lazyframe/test_serde.py @@ -91,7 +91,7 @@ def test_lf_serde_to_from_file(lf: pl.LazyFrame, tmp_path: Path) -> None: def test_lf_deserialize_validation() -> None: f = io.BytesIO(b"hello world!") with pytest.raises(ComputeError, match="expected value at line 1 column 1"): - pl.DataFrame.deserialize(f, format="json") + pl.LazyFrame.deserialize(f, format="json") @pytest.mark.write_disk() diff --git a/py-polars/tests/unit/operations/test_gather.py b/py-polars/tests/unit/operations/test_gather.py index e897a8da57a6..fab07dc71956 100644 --- a/py-polars/tests/unit/operations/test_gather.py +++ b/py-polars/tests/unit/operations/test_gather.py @@ -137,3 +137,14 @@ def test_list_get_null_on_oob_true() -> None: df = s_no_nulls.to_frame().with_columns(pl.lit(2).alias("idx")) out = df.select(pl.col("a").list.get("idx", null_on_oob=True)).to_series() assert_series_equal(out, expected) + + +def test_chunked_gather_phys_repr_17446() -> None: + dfa = pl.DataFrame({"replace_unique_id": range(2)}) + + for dt in [pl.Date, pl.Time, pl.Duration]: + dfb = dfa.clone() + dfb = dfb.with_columns(ds_start_date_right=pl.lit(None).cast(dt)) + dfb = pl.concat([dfb, dfb]) + + assert dfa.join(dfb, how="left", on=pl.col("replace_unique_id")).shape == (4, 2) diff --git a/py-polars/tests/unit/operations/test_join_right.py b/py-polars/tests/unit/operations/test_join_right.py new file mode 100644 index 000000000000..bd3c74d6cb5a --- /dev/null +++ b/py-polars/tests/unit/operations/test_join_right.py @@ -0,0 +1,73 @@ +import polars as pl + + +def test_right_join_schemas() -> None: + a = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + + b = 
pl.DataFrame({"a": [1, 3], "b": [1, 3], "c": [1, 3]}) + + # coalesces the join key, so the key of the right table remains + assert a.join(b, on="a", how="right", coalesce=True).to_dict(as_series=False) == { + "b": [1, 3], + "a": [1, 3], + "b_right": [1, 3], + "c": [1, 3], + } + # doesn't coalesce the join key, so all columns remain + assert a.join(b, on="a", how="right", coalesce=False).columns == [ + "a", + "b", + "a_right", + "b_right", + "c", + ] + + # coalesces the join key, so the key of the right table remains + assert b.join(a, on="a", how="right", coalesce=True).to_dict(as_series=False) == { + "b": [1, None, 3], + "c": [1, None, 3], + "a": [1, 2, 3], + "b_right": [1, 2, 3], + } + assert b.join(a, on="a", how="right", coalesce=False).columns == [ + "a", + "b", + "c", + "a_right", + "b_right", + ] + + a_ = a.lazy() + b_ = b.lazy() + assert list( + a_.join(b_, on="a", how="right", coalesce=True).collect_schema().keys() + ) == ["b", "a", "b_right", "c"] + assert list( + a_.join(b_, on="a", how="right", coalesce=False).collect_schema().keys() + ) == ["a", "b", "a_right", "b_right", "c"] + assert list( + b_.join(a_, on="a", how="right", coalesce=True).collect_schema().keys() + ) == ["b", "c", "a", "b_right"] + assert list( + b_.join(a_, on="a", how="right", coalesce=False).collect_schema().keys() + ) == ["a", "b", "c", "a_right", "b_right"] + + +def test_right_join_schemas_multikey() -> None: + a = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}) + + b = pl.DataFrame({"a": [1, 3], "b": [1, 3], "c": [1, 3]}) + assert a.join(b, on=["a", "b"], how="right", coalesce=False).columns == [ + "a", + "b", + "c", + "a_right", + "b_right", + "c_right", + ] + assert a.join(b, on=["a", "b"], how="right", coalesce=True).to_dict( + as_series=False + ) == {"c": [1, 3], "a": [1, 3], "b": [1, 3], "c_right": [1, 3]} + assert b.join(a, on=["a", "b"], how="right", coalesce=True).to_dict( + as_series=False + ) == {"c": [1, None, 3], "a": [1, 2, 3], "b": [1, 2, 3], 
"c_right": [1, 2, 3]} diff --git a/py-polars/tests/unit/operations/test_rename.py b/py-polars/tests/unit/operations/test_rename.py index a3e3693ece49..3d9809b20842 100644 --- a/py-polars/tests/unit/operations/test_rename.py +++ b/py-polars/tests/unit/operations/test_rename.py @@ -149,3 +149,14 @@ def test_rename_schema_order_6660() -> None: assert renamed.collect_schema() == renamed.collect().schema assert computed.collect_schema() == computed.collect().schema + + +def test_rename_schema_17427() -> None: + assert ( + pl.LazyFrame({"A": [1]}) + .with_columns(B=2) + .select(["A", "B"]) + .rename({"A": "C", "B": "A"}) + .select(["C", "A"]) + .collect() + ).to_dict(as_series=False) == {"C": [1], "A": [2]} diff --git a/py-polars/tests/unit/operations/test_slice.py b/py-polars/tests/unit/operations/test_slice.py index bf179a1e8dfa..7c8fb22665c1 100644 --- a/py-polars/tests/unit/operations/test_slice.py +++ b/py-polars/tests/unit/operations/test_slice.py @@ -173,10 +173,10 @@ def test_slice_nullcount(ref: list[int | None]) -> None: def test_slice_pushdown_set_sorted() -> None: ldf = pl.LazyFrame({"foo": [1, 2, 3]}) - ldf = ldf.set_sorted("foo").head(5) + ldf = ldf.set_sorted("foo").head(2) plan = ldf.explain() - # check the set sorted is above slice - assert plan.index("set_sorted") < plan.index("SLICE") + assert "SLICE" not in plan + assert ldf.collect().height == 2 def test_slice_pushdown_literal_projection_14349() -> None: @@ -197,19 +197,17 @@ def test_slice_pushdown_literal_projection_14349() -> None: # For select, slice pushdown should happen when at least 1 input column is selected q = lf.select("a", x=1).head(0) - plan = q.explain() - assert plan.index("SELECT") < plan.index("SLICE") + # slice isn't in plan if it has been pushed down to the dataframe + assert "SLICE" not in q.explain() assert q.collect().height == 0 # For with_columns, slice pushdown should happen if the input has at least 1 column q = lf.with_columns(x=1).head(0) - plan = q.explain() - assert 
plan.index("WITH_COLUMNS") < plan.index("SLICE") + assert "SLICE" not in q.explain() assert q.collect().height == 0 q = lf.with_columns(pl.col("a") + 1).head(0) - plan = q.explain() - assert plan.index("WITH_COLUMNS") < plan.index("SLICE") + assert "SLICE" not in q.explain() assert q.collect().height == 0 # This does not project any of the original columns @@ -219,8 +217,7 @@ def test_slice_pushdown_literal_projection_14349() -> None: assert q.collect().height == 0 q = lf.with_columns(b=1, c=2).head(0) - plan = q.explain() - assert plan.index("WITH_COLUMNS") < plan.index("SLICE") + assert "SLICE" not in q.explain() assert q.collect().height == 0 diff --git a/py-polars/tests/unit/series/test_getitem.py b/py-polars/tests/unit/series/test_getitem.py index 50dee3e0cf02..81607f2d7236 100644 --- a/py-polars/tests/unit/series/test_getitem.py +++ b/py-polars/tests/unit/series/test_getitem.py @@ -88,7 +88,14 @@ def test_series_getitem_multiple_indices(indices: Any) -> None: @pytest.mark.parametrize( ("input", "match"), [ - ([0.0, 1.0], "unexpected value while building Series of type Int64"), + ( + [0.0, 1.0], + "cannot select elements using Sequence with elements of type 'float'", + ), + ( + "foobar", + "cannot select elements using Sequence with elements of type 'str'", + ), ( pl.Series([[1, 2], [3, 4]]), "cannot treat Series of type List\\(Int64\\) as indices", @@ -97,7 +104,7 @@ def test_series_getitem_multiple_indices(indices: Any) -> None: (object(), "cannot select elements using key of type 'object'"), ], ) -def test_df_getitem_col_invalid_inputs(input: Any, match: str) -> None: +def test_series_getitem_col_invalid_inputs(input: Any, match: str) -> None: s = pl.Series([1, 2, 3]) with pytest.raises(TypeError, match=match): s[input] diff --git a/py-polars/tests/unit/sql/test_miscellaneous.py b/py-polars/tests/unit/sql/test_miscellaneous.py index 68d3ab0b1e68..f3979219c813 100644 --- a/py-polars/tests/unit/sql/test_miscellaneous.py +++ 
b/py-polars/tests/unit/sql/test_miscellaneous.py @@ -269,3 +269,16 @@ def test_read_csv(tmp_path: Path) -> None: match="`read_csv` expects a single file path; found 3 arguments", ): pl.sql("SELECT * FROM read_csv('a','b','c')") + + +def test_global_variable_inference_17398() -> None: + users = pl.DataFrame({"id": "1"}) + + res = pl.sql( + query=""" + WITH user_by_email AS (SELECT id FROM users) + SELECT * FROM user_by_email + """, + eager=True, + ) + assert_frame_equal(res, users)