Skip to content

Commit

Permalink
Merge branch 'main' of github.com:pola-rs/polars into main
Browse files Browse the repository at this point in the history
  • Loading branch information
Henry Harbeck committed Jul 8, 2024
2 parents 7e35551 + 36eff75 commit 41a03cd
Show file tree
Hide file tree
Showing 229 changed files with 5,419 additions and 1,950 deletions.
15 changes: 15 additions & 0 deletions .github/pr-title-checker-config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"LABEL": {
"name": "title needs formatting",
"color": "FF0000"
},
"CHECKS": {
"regexp": "^(build|chore|ci|depr|docs|feat|fix|perf|refactor|release|test)(\\((python|rust)\\!?(,(python|rust)\\!?)?\\))?\\!?\\: [A-Z].*[^\\.\\!\\?,… ]$",
"ignoreLabels": ["skip changelog"]
},
"MESSAGES": {
"success": "PR title OK!",
"failure": "Invalid PR title! Please update according to the contributing guidelines: https://docs.pola.rs/development/contributing/#pull-requests",
"notice": ""
}
}
7 changes: 6 additions & 1 deletion .github/workflows/pr-labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@ permissions:
pull-requests: write

jobs:
main:
labeler:
runs-on: ubuntu-latest
steps:
- name: Check pull request title
uses: thehanimo/pr-title-checker@v1.4.2
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Label pull request
uses: release-drafter/release-drafter@v6
with:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/test-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ concurrency:
env:
RUSTFLAGS: -C debuginfo=0 # Do not produce debug symbols to keep memory usage down
RUST_BACKTRACE: 1
PYTHONUTF8: 1

defaults:
run:
Expand Down
3 changes: 2 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ use crate::array::iterator::NonNullValuesIter;
use crate::bitmap::utils::{BitmapIter, ZipValidity};
pub type BinaryViewArray = BinaryViewArrayGeneric<[u8]>;
pub type Utf8ViewArray = BinaryViewArrayGeneric<str>;
pub use view::{View, INLINE_VIEW_SIZE};
pub use view::View;

use super::Splitable;

Expand Down
12 changes: 12 additions & 0 deletions crates/polars-arrow/src/array/binview/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,18 @@ impl<T: ViewType + ?Sized> MutableBinaryViewArray<T> {
self.views.push(value);
}

/// Appends an already-built data buffer and returns its index.
///
/// Any bytes accumulated in `in_progress_buffer` are first moved into
/// `completed_buffers` (leaving `in_progress_buffer` empty), after which `buffer`
/// is appended. The returned value is the position of `buffer` inside
/// `completed_buffers`.
#[inline]
pub fn push_buffer(&mut self, buffer: Buffer<u8>) -> u32 {
    let has_pending_bytes = !self.in_progress_buffer.is_empty();
    if has_pending_bytes {
        // Seal the partially filled buffer before appending the new one.
        let sealed = std::mem::take(&mut self.in_progress_buffer);
        self.completed_buffers.push(Buffer::from(sealed));
    }

    // The new buffer lands at the current end of the completed list.
    let idx = self.completed_buffers.len() as u32;
    self.completed_buffers.push(buffer);
    idx
}

#[inline]
pub fn push_value<V: AsRef<T>>(&mut self, value: V) {
if let Some(validity) = &mut self.validity {
Expand Down
86 changes: 68 additions & 18 deletions crates/polars-arrow/src/array/binview/view.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use std::cmp::Ordering;
use std::fmt::{Display, Formatter};
use std::fmt::{self, Display, Formatter};
use std::ops::Add;

use bytemuck::{Pod, Zeroable};
Expand All @@ -13,10 +13,12 @@ use crate::buffer::Buffer;
use crate::datatypes::PrimitiveType;
use crate::types::NativeType;

pub const INLINE_VIEW_SIZE: u32 = 12;

// We use this instead of u128 because we want alignment of <= 8 bytes.
#[derive(Debug, Copy, Clone, Default)]
/// A reference to a set of bytes.
///
/// If `length <= 12`, these bytes are inlined over the `prefix`, `buffer_idx` and `offset` fields.
/// If `length > 12`, these fields specify a slice of a buffer.
#[derive(Copy, Clone, Default)]
#[repr(C)]
pub struct View {
/// The length of the string/bytes.
Expand All @@ -29,29 +31,77 @@ pub struct View {
pub offset: u32,
}

/// Debug output that depends on how the view stores its payload.
///
/// - Inline views (`length <= View::MAX_INLINE_SIZE`) print `length` and the inlined
///   bytes as `content`.
/// - All other views print `length`, the 4 `prefix` bytes, `buffer_idx` and `offset`.
impl fmt::Debug for View {
    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.length <= Self::MAX_INLINE_SIZE {
            // SAFETY:
            // - `View` is `repr(C)` and 16 bytes large; the inline payload starts right
            //   after the 4-byte `length` field, so it occupies bytes 4..16.
            // - This branch guarantees `length <= 12`, so the slice stays inside `self`.
            // - The pointer is derived from `self` as a whole, so it has provenance over
            //   all of those bytes.
            let content = unsafe {
                std::slice::from_raw_parts(
                    (self as *const _ as *const u8).add(4),
                    self.length as usize,
                )
            };

            fmt.debug_struct("View")
                .field("length", &self.length)
                .field("content", &content)
                .finish()
        } else {
            fmt.debug_struct("View")
                .field("length", &self.length)
                // The prefix is assembled with `u32::from_le_bytes`, so `to_le_bytes`
                // yields the bytes in their original order (big-endian would print
                // them reversed).
                .field("prefix", &self.prefix.to_le_bytes())
                .field("buffer_idx", &self.buffer_idx)
                .field("offset", &self.offset)
                .finish()
        }
    }
}

impl View {
/// Largest payload length, in bytes, that is stored inline in the view itself
/// rather than as a slice of a separate buffer.
pub const MAX_INLINE_SIZE: u32 = 12;

#[inline(always)]
pub fn as_u128(self) -> u128 {
unsafe { std::mem::transmute(self) }
}

/// Create a new inline view.
///
/// The bytes are copied directly behind the `length` field, over the `prefix`,
/// `buffer_idx` and `offset` fields. Bytes that are not overwritten keep the zero
/// value they get from `Default`.
///
/// # Panics
///
/// Panics if `bytes.len() > View::MAX_INLINE_SIZE`.
#[inline]
pub fn new_inline(bytes: &[u8]) -> Self {
    // Compare in `usize`. Casting the length to `u32` before comparing would truncate
    // lengths above `u32::MAX`, so an oversized slice whose low 32 bits are <= 12 could
    // slip through and make the unchecked copy below write out of bounds.
    assert!(bytes.len() <= Self::MAX_INLINE_SIZE as usize);

    let mut view = Self {
        // Lossless: the assertion above bounds the length by `MAX_INLINE_SIZE`.
        length: bytes.len() as u32,
        ..Default::default()
    };

    let view_ptr = &mut view as *mut _ as *mut u8;

    // SAFETY:
    // - bytes length <= 12 (asserted above),
    // - size_of::<View> == 16
    // - View is laid out as [length, prefix, buffer_idx, offset] (using repr(C))
    // - By grabbing the view_ptr and adding 4, we have provenance over prefix, buffer_idx and
    //   offset. (i.e. the same could not be achieved with &mut self.prefix as *mut _ as *mut u8)
    unsafe {
        let inline_data_ptr = view_ptr.add(4);
        core::ptr::copy_nonoverlapping(bytes.as_ptr(), inline_data_ptr, bytes.len());
    }
    view
}

#[inline]
pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self {
if bytes.len() <= 12 {
let mut ret = Self {
length: bytes.len() as u32,
..Default::default()
};
let ret_ptr = &mut ret as *mut _ as *mut u8;
unsafe {
core::ptr::copy_nonoverlapping(bytes.as_ptr(), ret_ptr.add(4), bytes.len());
}
ret
debug_assert!(bytes.len() <= u32::MAX as usize);

if bytes.len() as u32 <= Self::MAX_INLINE_SIZE {
Self::new_inline(bytes)
} else {
let prefix_buf: [u8; 4] = std::array::from_fn(|i| *bytes.get(i).unwrap_or(&0));
Self {
length: bytes.len() as u32,
prefix: u32::from_le_bytes(prefix_buf),
prefix: u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
buffer_idx,
offset,
}
Expand Down Expand Up @@ -190,8 +240,8 @@ where
{
for view in views {
let len = view.length;
if len <= INLINE_VIEW_SIZE {
if len < INLINE_VIEW_SIZE && view.as_u128() >> (32 + len * 8) != 0 {
if len <= View::MAX_INLINE_SIZE {
if len < View::MAX_INLINE_SIZE && view.as_u128() >> (32 + len * 8) != 0 {
polars_bail!(ComputeError: "view contained non-zero padding in prefix");
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ mod values;
pub use binary::{BinaryArray, BinaryValueIter, MutableBinaryArray, MutableBinaryValuesArray};
pub use binview::{
BinaryViewArray, BinaryViewArrayGeneric, MutableBinaryViewArray, MutablePlBinary,
MutablePlString, Utf8ViewArray, View, ViewType, INLINE_VIEW_SIZE,
MutablePlString, Utf8ViewArray, View, ViewType,
};
pub use boolean::{BooleanArray, MutableBooleanArray};
pub use dictionary::{DictionaryArray, DictionaryKey, MutableDictionaryArray};
Expand Down
Loading

0 comments on commit 41a03cd

Please sign in to comment.