Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(query): add collation #8610

Merged
merged 3 commits into from
Nov 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/query/datavalues/src/columns/string/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,14 @@ impl MutableStringColumn {
self.offsets.push(self.last_size as i64);
}

/// Appends the UTF-8 encoding of every `char` yielded by `iter` to the
/// value buffer of the row currently being written.
///
/// Only `self.values` is touched here; the row is not committed.
pub fn write_from_char_iter(&mut self, iter: impl Iterator<Item = char>) {
    // One scratch buffer serves every character: a `char` encodes to at
    // most four UTF-8 bytes.
    let mut scratch = [0u8; 4];
    iter.for_each(|ch| {
        let encoded = ch.encode_utf8(&mut scratch);
        self.values.extend_from_slice(encoded.as_bytes());
    });
}

#[inline]
pub fn commit_row(&mut self) {
self.last_size = self.values.len();
Expand Down
12 changes: 12 additions & 0 deletions src/query/expression/src/types/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,18 @@ impl StringColumnBuilder {
self.data.extend_from_slice(item);
}

/// Appends the UTF-8 bytes of each `char` produced by `iter` to the data
/// buffer. The row is left open; this method does not commit it.
pub fn put_char_iter(&mut self, iter: impl Iterator<Item = char>) {
    // Four bytes is the longest UTF-8 encoding of a single `char`.
    let mut utf8 = [0u8; 4];
    for ch in iter {
        self.data
            .extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
    }
}

/// Appends the raw bytes of `item` to the data buffer without committing
/// the row.
pub fn put(&mut self, item: &[u8]) {
    // `Vec<u8>` extends from a byte slice by copying it in one go.
    self.data.extend(item);
}

pub fn write_row<T>(&mut self, f: impl FnOnce(&mut Vec<u8>) -> T) -> T {
let res = f(&mut self.data);
self.commit_row();
Expand Down
61 changes: 55 additions & 6 deletions src/query/functions-v2/src/scalars/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pub fn register(registry: &mut FunctionRegistry) {
registry.register_aliases("upper", &["ucase"]);
registry.register_aliases("lower", &["lcase"]);
registry.register_aliases("length", &["octet_length"]);
registry.register_aliases("char_length", &["character_length"]);
registry.register_aliases("char_length", &["character_length", "length_utf8"]);
registry.register_aliases("substr", &["substring", "mid"]);

registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
Expand Down Expand Up @@ -105,14 +105,19 @@ pub fn register(registry: &mut FunctionRegistry) {
|val, _| val.len() as u64,
);

registry.register_1_arg::<StringType, NumberType<u64>, _, _>(
registry.register_passthrough_nullable_1_arg::<StringType, NumberType<u64>, _, _>(
"char_length",
FunctionProperty::default(),
|_| None,
|val, _| match std::str::from_utf8(val) {
Ok(s) => s.chars().count() as u64,
Err(_) => val.len() as u64,
},
vectorize_with_builder_1_arg::<StringType, NumberType<u64>>(|s, output, _| {
match std::str::from_utf8(s) {
Ok(s) => {
output.push(s.chars().count() as u64);
Ok(())
}
Err(err) => Err(err.to_string()),
}
}),
);

registry.register_passthrough_nullable_3_arg::<StringType, NumberType<u64>, StringType, StringType, _, _>(
Expand Down Expand Up @@ -750,6 +755,30 @@ pub fn register(registry: &mut FunctionRegistry) {
Ok(())
}),
);

// Two-argument form `substr_utf8(s, pos)`: the rest of the string starting at
// character position `pos`.
registry.register_passthrough_nullable_2_arg::<StringType, NumberType<i64>, StringType, _, _>(
"substr_utf8",
FunctionProperty::default(),
|_, _| None,
vectorize_with_builder_2_arg::<StringType, NumberType<i64>, StringType>(
|s, pos, output, _| {
// Input that is not valid UTF-8 is reported as an error carrying the decoder's message.
let s = std::str::from_utf8(s).map_err(|e| e.to_string())?;
// The byte length is never smaller than the character count, so using it as
// `len` effectively means "no limit".
substr_utf8(output, s, pos, s.len() as u64);
Ok(())
},
),
);

// Three-argument form `substr_utf8(s, pos, len)`: at most `len` characters starting at
// character position `pos`.
registry.register_passthrough_nullable_3_arg::<StringType, NumberType<i64>, NumberType<u64>, StringType, _, _>(
"substr_utf8",
FunctionProperty::default(),
|_, _, _| None,
vectorize_with_builder_3_arg::<StringType, NumberType<i64>, NumberType<u64>, StringType>(|s, pos, len, output, _| {
// Input that is not valid UTF-8 is reported as an error carrying the decoder's message.
let s = std::str::from_utf8(s).map_err(|e| e.to_string())?;
substr_utf8(output, s, pos, len);
Ok(())
}),
);
}

mod soundex {
Expand Down Expand Up @@ -804,6 +833,26 @@ fn substr(str: &[u8], pos: i64, len: u64) -> &[u8] {
&str[0..0]
}

/// Writes a character-based substring of `str` into `builder` and commits
/// the row.
///
/// * `pos > 0` is a 1-based character position counted from the start.
/// * `pos < 0` counts characters from the end (`-1` is the last character);
///   a position that reaches back past the start yields an empty row.
/// * `pos == 0` or `len == 0` yields an empty row.
///
/// At most `len` characters are copied. Exactly one row is committed on
/// every path.
#[inline]
fn substr_utf8(builder: &mut StringColumnBuilder, str: &str, pos: i64, len: u64) {
    if pos == 0 || len == 0 {
        builder.commit_row();
        return;
    }

    // Clamp 64-bit arguments into `usize` instead of truncating with `as`;
    // on targets with a narrower `usize` a wrapped value would select the
    // wrong characters.
    let to_usize = |n: u64| n.min(usize::MAX as u64) as usize;

    let start = if pos > 0 {
        // `skip` simply runs dry when asked to go past the end, so the
        // string does not need to be counted first for forward positions.
        to_usize(pos.unsigned_abs() - 1)
    } else {
        // Counting from the end requires the total number of characters.
        let char_len = str.chars().count();
        char_len
            .checked_sub(to_usize(pos.unsigned_abs()))
            // Reaching back past the first character: start at the end,
            // which produces an empty result.
            .unwrap_or(char_len)
    };

    builder.put_char_iter(str.chars().skip(start).take(to_usize(len)));
    builder.commit_row();
}

/// String to String scalar function with estimated output column capacity.
fn vectorize_string_to_string(
estimate_bytes: impl Fn(&StringColumn) -> usize + Copy,
Expand Down
18 changes: 18 additions & 0 deletions src/query/functions/src/scalars/strings/length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use common_exception::Result;

use super::string2number_try::TryNumberOperator;
use super::string2number_try::TryString2NumberFunction;
use super::NumberOperator;
use super::String2NumberFunction;

/// Field-less operator type plugged into `String2NumberFunction` to form
/// `StringLengthFunction`.
#[derive(Clone, Default)]
pub struct StringLength {}

/// Field-less operator type whose `TryNumberOperator` implementation counts
/// the characters of a UTF-8 string and fails on invalid UTF-8.
#[derive(Clone, Default)]
pub struct StringUtf8Length {}

impl NumberOperator<u64> for StringLength {
const IS_DETERMINISTIC: bool = true;
const MAYBE_MONOTONIC: bool = false;
Expand All @@ -27,4 +34,15 @@ impl NumberOperator<u64> for StringLength {
}
}

impl TryNumberOperator<u64> for StringUtf8Length {
    const IS_DETERMINISTIC: bool = true;
    const MAYBE_MONOTONIC: bool = false;

    /// Returns the number of `char`s (Unicode scalar values) in `value`.
    ///
    /// # Errors
    /// Fails when `value` is not valid UTF-8.
    fn try_apply<'a>(&'a mut self, value: &'a [u8]) -> Result<u64> {
        let char_count = std::str::from_utf8(value)?.chars().count();
        Ok(char_count as u64)
    }
}

/// String-to-`u64` function built from the `StringLength` operator.
pub type StringLengthFunction = String2NumberFunction<StringLength, u64>;
/// Fallible string-to-`u64` function built from the `StringUtf8Length` operator;
/// evaluation fails on values that are not valid UTF-8.
pub type StringUtf8LengthFunction = TryString2NumberFunction<StringUtf8Length, u64>;
3 changes: 3 additions & 0 deletions src/query/functions/src/scalars/strings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,11 @@ mod space;
mod strcmp;
mod string;
mod string2number;
mod string2number_try;
mod string2string;
mod substring;
mod substring_index;
mod substring_utf8;
mod trim;
mod unhex;
mod upper;
Expand All @@ -74,6 +76,7 @@ pub use insert::InsertFunction;
pub use leftright::LeftFunction;
pub use leftright::RightFunction;
pub use length::StringLengthFunction;
pub use length::StringUtf8LengthFunction;
pub use locate::InstrFunction;
pub use locate::LocateFunction;
pub use locate::PositionFunction;
Expand Down
7 changes: 7 additions & 0 deletions src/query/functions/src/scalars/strings/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use super::substring_utf8::SubstringUtf8Function;
use super::StringUtf8LengthFunction;
use crate::scalars::AsciiFunction;
use crate::scalars::Base64DecodeFunction;
use crate::scalars::Base64EncodeFunction;
Expand Down Expand Up @@ -120,5 +122,10 @@ impl StringFunction {
factory.register("locate", LocateFunction::desc());
factory.register("position", PositionFunction::desc());
factory.register("instr", InstrFunction::desc());

// utf8 collation
factory.register("length_utf8", StringUtf8LengthFunction::desc());
factory.register("substring_utf8", SubstringUtf8Function::desc());
factory.register("substr_utf8", SubstringUtf8Function::desc());
}
}
108 changes: 108 additions & 0 deletions src/query/functions/src/scalars/strings/string2number_try.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Copyright 2021 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt;
use std::marker::PhantomData;
use std::sync::Arc;

use common_datavalues::prelude::*;
use common_exception::Result;

use crate::scalars::assert_string;
use crate::scalars::Function;
use crate::scalars::FunctionContext;
use crate::scalars::FunctionDescription;
use crate::scalars::FunctionFeatures;

/// A fallible per-value operator that turns one string value (raw bytes)
/// into a number of type `R`.
///
/// `TryString2NumberFunction` creates the operator with `Default` for each
/// `eval` call and applies it to every value of the input column.
pub trait TryNumberOperator<R>: Send + Sync + Clone + Default + 'static {
/// When `true`, `TryString2NumberFunction::desc` marks the function as deterministic.
const IS_DETERMINISTIC: bool;
/// When `true`, `TryString2NumberFunction::desc` enables the monotonicity feature.
const MAYBE_MONOTONIC: bool;

/// Converts a single string value into `R`, or returns an error that
/// aborts evaluation of the whole column.
fn try_apply<'a>(&'a mut self, _: &'a [u8]) -> Result<R>;
}

/// Scalar function that maps a string column to a numeric column through a
/// fallible operator `T` producing values of type `R`.
#[derive(Clone)]
pub struct TryString2NumberFunction<T, R> {
// Name reported by `name()` and printed by the `Display` impl.
display_name: String,
// Zero-sized markers tying the struct to its operator type and result type.
t: PhantomData<T>,
r: PhantomData<R>,
}

impl<T, R> TryString2NumberFunction<T, R>
where
    T: TryNumberOperator<R>,
    R: PrimitiveType + Clone + ToDataType,
{
    /// Builds the boxed function after checking that the first argument is
    /// a string type.
    ///
    /// # Errors
    /// Propagates the error from `assert_string` when `args[0]` is not a
    /// string type.
    pub fn try_create(display_name: &str, args: &[&DataTypeImpl]) -> Result<Box<dyn Function>> {
        assert_string(args[0])?;

        let function = Self {
            display_name: display_name.to_owned(),
            t: PhantomData,
            r: PhantomData,
        };
        Ok(Box::new(function))
    }

    /// Describes the function for registration: one argument, plus the
    /// determinism / monotonicity features advertised by the operator `T`.
    pub fn desc() -> FunctionDescription {
        let one_arg = FunctionFeatures::default().num_arguments(1);

        let with_determinism = if T::IS_DETERMINISTIC {
            one_arg.deterministic()
        } else {
            one_arg
        };

        let features = if T::MAYBE_MONOTONIC {
            with_determinism.monotonicity()
        } else {
            with_determinism
        };

        FunctionDescription::creator(Box::new(Self::try_create)).features(features)
    }
}

/// Generic template for scalar functions that turn a string column into a
/// numeric column through a fallible per-value operator,
/// e.g. `length_utf8`.
impl<T, R> Function for TryString2NumberFunction<T, R>
where
    T: TryNumberOperator<R> + Clone,
    R: PrimitiveType + Clone + ToDataType,
{
    fn name(&self) -> &str {
        self.display_name.as_str()
    }

    fn return_type(&self) -> DataTypeImpl {
        R::to_data_type()
    }

    /// Applies the operator to every value of the first argument column.
    /// The first value the operator rejects aborts evaluation with its error.
    fn eval(
        &self,
        _func_ctx: FunctionContext,
        columns: &common_datavalues::ColumnsWithField,
        input_rows: usize,
    ) -> Result<common_datavalues::ColumnRef> {
        // A fresh operator is created for each evaluation.
        let mut operator = T::default();
        let input: &StringColumn = Series::check_get(columns[0].column())?;

        let mut values = Vec::with_capacity(input_rows);
        for bytes in input.iter() {
            values.push(operator.try_apply(bytes)?);
        }

        let output = PrimitiveColumn::new_from_vec(values);
        Ok(Arc::new(output))
    }
}

impl<T, R> fmt::Display for TryString2NumberFunction<T, R> {
    /// Renders the function as its display name followed by `()`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.display_name)?;
        f.write_str("()")
    }
}
Loading