Skip to content

Commit

Permalink
fix(query): fix incorrect total_bytes_len in string view (#16877)
Browse files Browse the repository at this point in the history
* update

* update

* update

* update

* update
  • Loading branch information
sundy-li authored Nov 21, 2024
1 parent 22a8a82 commit bb5e2e6
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 40 deletions.
4 changes: 1 addition & 3 deletions src/common/column/src/binview/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,7 @@ impl<T: ViewType + ?Sized> BinaryViewColumnBuilder<T> {

self.push_value(value);
let value = self.views.pop().unwrap();

self.total_bytes_len +=
(self.total_bytes_len - old_bytes_len) * additional.saturating_sub(1);
self.total_bytes_len = old_bytes_len + value.length as usize * additional;
self.views.extend(std::iter::repeat(value).take(additional));
}

Expand Down
56 changes: 19 additions & 37 deletions src/common/column/src/binview/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ mod view;

use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;

use arrow_data::ArrayData;
Expand Down Expand Up @@ -52,8 +50,6 @@ mod private {
impl Sealed for [u8] {}
}

const UNKNOWN_LEN: u64 = u64::MAX;

pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> {
const IS_UTF8: bool;
type Owned: Debug + Clone + Sync + Send + AsRef<Self>;
Expand Down Expand Up @@ -119,7 +115,7 @@ pub struct BinaryViewColumnGeneric<T: ViewType + ?Sized> {
buffers: Arc<[Buffer<u8>]>,
phantom: PhantomData<T>,
/// Total bytes length if we would concat them all
total_bytes_len: AtomicU64,
total_bytes_len: usize,
/// Total bytes in the buffer (exclude remaining capacity)
total_buffer_len: usize,
}
Expand All @@ -131,7 +127,7 @@ impl<T: ViewType + ?Sized> Clone for BinaryViewColumnGeneric<T> {
buffers: self.buffers.clone(),

phantom: Default::default(),
total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)),
total_bytes_len: self.total_bytes_len,
total_buffer_len: self.total_buffer_len,
}
}
Expand All @@ -151,26 +147,19 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
) -> Self {
#[cfg(debug_assertions)]
{
if total_bytes_len != UNKNOWN_LEN as usize {
let total = views.iter().map(|v| v.length as usize).sum::<usize>();
assert_eq!(total, total_bytes_len);
}
let total = views.iter().map(|v| v.length as usize).sum::<usize>();
assert_eq!(total, total_bytes_len);

if total_buffer_len != UNKNOWN_LEN as usize {
let total = buffers.iter().map(|v| v.len()).sum::<usize>();
assert_eq!(total, total_buffer_len);
}
let total = buffers.iter().map(|v| v.len()).sum::<usize>();
assert_eq!(total, total_buffer_len);
}
// # Safety
// The caller must ensure
// - the data is valid utf8 (if required)
// - the offsets match the buffers.

Self {
views,
buffers,

phantom: Default::default(),
total_bytes_len: AtomicU64::new(total_bytes_len as u64),
total_bytes_len,
total_buffer_len,
}
}
Expand All @@ -181,12 +170,11 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
pub unsafe fn new_unchecked_unknown_md(
views: Buffer<View>,
buffers: Arc<[Buffer<u8>]>,

total_buffer_len: Option<usize>,
) -> Self {
let total_bytes_len = UNKNOWN_LEN as usize;
let total_buffer_len =
total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum());
total_buffer_len.unwrap_or(buffers.iter().map(|v| v.len()).sum::<usize>());
let total_bytes_len = views.iter().map(|v| v.length as usize).sum::<usize>();
Self::new_unchecked(views, buffers, total_bytes_len, total_buffer_len)
}

Expand Down Expand Up @@ -305,14 +293,7 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {

/// Get the total length of bytes that it would take to concatenate all binary/str values in this array.
pub fn total_bytes_len(&self) -> usize {
let total = self.total_bytes_len.load(Ordering::Relaxed);
if total == UNKNOWN_LEN {
let total = self.len_iter().map(|v| v as usize).sum::<usize>();
self.total_bytes_len.store(total as u64, Ordering::Relaxed);
total
} else {
total as usize
}
self.total_bytes_len
}

pub fn memory_size(&self) -> usize {
Expand Down Expand Up @@ -378,7 +359,7 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
debug_assert!(offset + length <= self.len());
self.views.slice_unchecked(offset, length);
self.total_bytes_len.store(UNKNOWN_LEN, Ordering::Relaxed)
self.total_bytes_len = self.views.iter().map(|v| v.length as usize).sum::<usize>();
}

impl_sliced!();
Expand Down Expand Up @@ -418,13 +399,14 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
pub fn make_mut(self) -> BinaryViewColumnBuilder<T> {
let views = self.views.make_mut();
let completed_buffers = self.buffers.to_vec();

BinaryViewColumnBuilder {
views,
completed_buffers,
in_progress_buffer: vec![],

phantom: Default::default(),
total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize,
total_bytes_len: self.total_bytes_len,
total_buffer_len: self.total_buffer_len,
}
}
Expand All @@ -440,19 +422,19 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
completed_buffers: self.buffers.to_vec(),
in_progress_buffer: vec![],
phantom: Default::default(),
total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize,
total_bytes_len: self.total_bytes_len,
total_buffer_len: self.total_buffer_len,
}),
(Right(views), false) => Left(Self::new_unchecked(
views.into(),
self.buffers,
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_bytes_len,
self.total_buffer_len,
)),
(Left(views), _) => Left(Self::new_unchecked(
views,
self.buffers,
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_bytes_len,
self.total_buffer_len,
)),
}
Expand Down Expand Up @@ -518,7 +500,7 @@ impl BinaryViewColumn {
Utf8ViewColumn::new_unchecked(
self.views.clone(),
self.buffers.clone(),
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_bytes_len,
self.total_buffer_len,
)
}
Expand All @@ -529,7 +511,7 @@ impl Utf8ViewColumn {
BinaryViewColumn::new_unchecked(
self.views.clone(),
self.buffers.clone(),
self.total_bytes_len.load(Ordering::Relaxed) as usize,
self.total_bytes_len,
self.total_buffer_len,
)
}
Expand Down
2 changes: 2 additions & 0 deletions src/query/expression/src/kernels/group_by_hash/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ pub fn serialize_group_columns(
}
builder.commit_row();
}
// For nulllable column it will only serialize valid row data
debug_assert!(builder.data.len() <= serialize_size);
builder.build()
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
query T
SELECT '1'::string from numbers(3)
----
1
1
1

query T
SELECT {'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232} from numbers(3);
----
{'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232}
{'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232}
{'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232}


# String concatenation
query T
SELECT 'hello' || ' ' || 'world' FROM numbers(1);
----
hello world

query T
SELECT '!@#$%^&*()'::string, '你好'::string, '🌟'::string;
----
!@#$%^&*() 你好 🌟

# String with escape sequences
query T
SELECT 'line1-line2'::string, 'tab\there'::string;
----
line1-line2 tab here

query T
SELECT UPPER('hello'), LOWER('WORLD'), LENGTH('databend') FROM numbers(1);
----
HELLO world 8

# String with JSON objects
query T
SELECT {'key': 'value'::string, 'numbers': 123::string} FROM numbers(2);
----
{'key':'value','numbers':'123'}
{'key':'value','numbers':'123'}


# String with scientific notation
query T
SELECT {'scientific': 1.23e-4, 'regular': 123.456} FROM numbers(1);
----
{'scientific':0.000123,'regular':123.456000}


# String with very long content
query T
SELECT repeat('a', 100)::string FROM numbers(1);
----
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

# String with whitespace handling
query T
SELECT TRIM(' spaced ')::string, LTRIM(' left')::string, RTRIM('right ')::string;
----
spaced left right

# String with NULL values
query T
SELECT NULL::string, COALESCE(NULL::string, 'default') FROM numbers(1);
----
NULL default

0 comments on commit bb5e2e6

Please sign in to comment.