From bb5e2e6c4cf74710c607a11e993a9f7f06059e2d Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Thu, 21 Nov 2024 15:41:21 +0800 Subject: [PATCH] fix(query): fix incorrect total_bytes_len in string view (#16877) * update * update * update * update * update --- src/common/column/src/binview/builder.rs | 4 +- src/common/column/src/binview/mod.rs | 56 +++++---------- .../src/kernels/group_by_hash/utils.rs | 2 + .../11_0002_data_type_string.test | 69 +++++++++++++++++++ 4 files changed, 91 insertions(+), 40 deletions(-) diff --git a/src/common/column/src/binview/builder.rs b/src/common/column/src/binview/builder.rs index 7ea031694a1c..1ff5df5fe84d 100644 --- a/src/common/column/src/binview/builder.rs +++ b/src/common/column/src/binview/builder.rs @@ -196,9 +196,7 @@ impl BinaryViewColumnBuilder { self.push_value(value); let value = self.views.pop().unwrap(); - - self.total_bytes_len += - (self.total_bytes_len - old_bytes_len) * additional.saturating_sub(1); + self.total_bytes_len = old_bytes_len + value.length as usize * additional; self.views.extend(std::iter::repeat(value).take(additional)); } diff --git a/src/common/column/src/binview/mod.rs b/src/common/column/src/binview/mod.rs index 58e52b234f33..f963a0ca99d7 100644 --- a/src/common/column/src/binview/mod.rs +++ b/src/common/column/src/binview/mod.rs @@ -20,8 +20,6 @@ mod view; use std::fmt::Debug; use std::marker::PhantomData; -use std::sync::atomic::AtomicU64; -use std::sync::atomic::Ordering; use std::sync::Arc; use arrow_data::ArrayData; @@ -52,8 +50,6 @@ mod private { impl Sealed for [u8] {} } -const UNKNOWN_LEN: u64 = u64::MAX; - pub trait ViewType: Sealed + 'static + PartialEq + AsRef { const IS_UTF8: bool; type Owned: Debug + Clone + Sync + Send + AsRef; @@ -119,7 +115,7 @@ pub struct BinaryViewColumnGeneric { buffers: Arc<[Buffer]>, phantom: PhantomData, /// Total bytes length if we would concat them all - total_bytes_len: AtomicU64, + total_bytes_len: usize, /// Total bytes in the buffer (exclude remaining capacity) total_buffer_len: usize, } @@ -131,7 +127,7 @@ impl Clone for BinaryViewColumnGeneric { buffers: self.buffers.clone(), phantom: Default::default(), - total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)), + total_bytes_len: self.total_bytes_len, total_buffer_len: self.total_buffer_len, } } @@ -151,26 +147,19 @@ impl BinaryViewColumnGeneric { ) -> Self { #[cfg(debug_assertions)] { - if total_bytes_len != UNKNOWN_LEN as usize { - let total = views.iter().map(|v| v.length as usize).sum::(); - assert_eq!(total, total_bytes_len); - } + let total = views.iter().map(|v| v.length as usize).sum::(); + assert_eq!(total, total_bytes_len); - if total_buffer_len != UNKNOWN_LEN as usize { - let total = buffers.iter().map(|v| v.len()).sum::(); - assert_eq!(total, total_buffer_len); - } + let total = buffers.iter().map(|v| v.len()).sum::(); + assert_eq!(total, total_buffer_len); } - // # Safety - // The caller must ensure - // - the data is valid utf8 (if required) - // - the offsets match the buffers. + Self { views, buffers, phantom: Default::default(), - total_bytes_len: AtomicU64::new(total_bytes_len as u64), + total_bytes_len, total_buffer_len, } } @@ -181,12 +170,11 @@ impl BinaryViewColumnGeneric { pub unsafe fn new_unchecked_unknown_md( views: Buffer, buffers: Arc<[Buffer]>, - total_buffer_len: Option, ) -> Self { - let total_bytes_len = UNKNOWN_LEN as usize; let total_buffer_len = - total_buffer_len.unwrap_or_else(|| buffers.iter().map(|b| b.len()).sum()); + total_buffer_len.unwrap_or(buffers.iter().map(|v| v.len()).sum::()); + let total_bytes_len = views.iter().map(|v| v.length as usize).sum::(); Self::new_unchecked(views, buffers, total_bytes_len, total_buffer_len) } @@ -305,14 +293,7 @@ impl BinaryViewColumnGeneric { /// Get the total length of bytes that it would take to concatenate all binary/str values in this array. pub fn total_bytes_len(&self) -> usize { - let total = self.total_bytes_len.load(Ordering::Relaxed); - if total == UNKNOWN_LEN { - let total = self.len_iter().map(|v| v as usize).sum::(); - self.total_bytes_len.store(total as u64, Ordering::Relaxed); - total - } else { - total as usize - } + self.total_bytes_len } pub fn memory_size(&self) -> usize { @@ -378,7 +359,7 @@ impl BinaryViewColumnGeneric { unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { debug_assert!(offset + length <= self.len()); self.views.slice_unchecked(offset, length); - self.total_bytes_len.store(UNKNOWN_LEN, Ordering::Relaxed) + self.total_bytes_len = self.views.iter().map(|v| v.length as usize).sum::(); } impl_sliced!(); @@ -418,13 +399,14 @@ impl BinaryViewColumnGeneric { pub fn make_mut(self) -> BinaryViewColumnBuilder { let views = self.views.make_mut(); let completed_buffers = self.buffers.to_vec(); + BinaryViewColumnBuilder { views, completed_buffers, in_progress_buffer: vec![], phantom: Default::default(), - total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize, + total_bytes_len: self.total_bytes_len, total_buffer_len: self.total_buffer_len, } } @@ -440,19 +422,19 @@ impl BinaryViewColumnGeneric { completed_buffers: self.buffers.to_vec(), in_progress_buffer: vec![], phantom: Default::default(), - total_bytes_len: self.total_bytes_len.load(Ordering::Relaxed) as usize, + total_bytes_len: self.total_bytes_len, total_buffer_len: self.total_buffer_len, }), (Right(views), false) => Left(Self::new_unchecked( views.into(), self.buffers, - self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_bytes_len, self.total_buffer_len, )), (Left(views), _) => Left(Self::new_unchecked( views, self.buffers, - self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_bytes_len, self.total_buffer_len, )), } @@ -518,7 +500,7 @@ impl BinaryViewColumn { Utf8ViewColumn::new_unchecked( self.views.clone(), self.buffers.clone(), - self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_bytes_len, self.total_buffer_len, ) } @@ -529,7 +511,7 @@ impl Utf8ViewColumn { BinaryViewColumn::new_unchecked( self.views.clone(), self.buffers.clone(), - self.total_bytes_len.load(Ordering::Relaxed) as usize, + self.total_bytes_len, self.total_buffer_len, ) } diff --git a/src/query/expression/src/kernels/group_by_hash/utils.rs b/src/query/expression/src/kernels/group_by_hash/utils.rs index 8b844452a798..280edbe6e850 100644 --- a/src/query/expression/src/kernels/group_by_hash/utils.rs +++ b/src/query/expression/src/kernels/group_by_hash/utils.rs @@ -40,6 +40,8 @@ pub fn serialize_group_columns( } builder.commit_row(); } + // For nulllable column it will only serialize valid row data + debug_assert!(builder.data.len() <= serialize_size); builder.build() } diff --git a/tests/sqllogictests/suites/base/11_data_type/11_0002_data_type_string.test b/tests/sqllogictests/suites/base/11_data_type/11_0002_data_type_string.test index e69de29bb2d1..0ef9b2985d30 100644 --- a/tests/sqllogictests/suites/base/11_data_type/11_0002_data_type_string.test +++ b/tests/sqllogictests/suites/base/11_data_type/11_0002_data_type_string.test @@ -0,0 +1,69 @@ +query T +SELECT '1'::string from numbers(3) +---- +1 +1 +1 + +query T +SELECT {'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232} from numbers(3); +---- +{'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232} +{'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232} +{'w1vY1t5':-15239676694.972677,'6UfU721':-4905646705.765232} + + +# String concatenation +query T +SELECT 'hello' || ' ' || 'world' FROM numbers(1); +---- +hello world + +query T +SELECT '!@#$%^&*()'::string, '你好'::string, '🌟'::string; +---- +!@#$%^&*() 你好 🌟 + +# String with escape sequences +query T +SELECT 'line1-line2'::string, 'tab\there'::string; +---- +line1-line2 tab here + +query T +SELECT UPPER('hello'), LOWER('WORLD'), LENGTH('databend') FROM numbers(1); +---- +HELLO world 8 + +# String with JSON objects +query T +SELECT {'key': 'value'::string, 'numbers': 123::string} FROM numbers(2); +---- +{'key':'value','numbers':'123'} +{'key':'value','numbers':'123'} + + +# String with scientific notation +query T +SELECT {'scientific': 1.23e-4, 'regular': 123.456} FROM numbers(1); +---- +{'scientific':0.000123,'regular':123.456000} + + +# String with very long content +query T +SELECT repeat('a', 100)::string FROM numbers(1); +---- +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + +# String with whitespace handling +query T +SELECT TRIM(' spaced ')::string, LTRIM(' left')::string, RTRIM('right ')::string; +---- +spaced left right + +# String with NULL values +query T +SELECT NULL::string, COALESCE(NULL::string, 'default') FROM numbers(1); +---- +NULL default