diff --git a/common/functions/src/scalars/strings/mod.rs b/common/functions/src/scalars/strings/mod.rs index 5bbe17b352c5..a0ceea1b3075 100644 --- a/common/functions/src/scalars/strings/mod.rs +++ b/common/functions/src/scalars/strings/mod.rs @@ -38,6 +38,7 @@ mod pad; mod quote; mod regexp_instr; mod regexp_like; +mod regexp_replace; mod regexp_substr; mod repeat; mod replace; @@ -84,6 +85,7 @@ pub use pad::RightPadFunction; pub use quote::QuoteFunction; pub use regexp_instr::RegexpInStrFunction; pub use regexp_like::RegexpLikeFunction; +pub use regexp_replace::RegexpReplaceFunction; pub use regexp_substr::RegexpSubStrFunction; pub use repeat::RepeatFunction; pub use replace::ReplaceFunction; diff --git a/common/functions/src/scalars/strings/regexp_instr.rs b/common/functions/src/scalars/strings/regexp_instr.rs index a1a37e37e1f9..dfb8c8f8b3e0 100644 --- a/common/functions/src/scalars/strings/regexp_instr.rs +++ b/common/functions/src/scalars/strings/regexp_instr.rs @@ -14,13 +14,12 @@ use std::collections::HashMap; use std::fmt; -use std::sync::Arc; use bstr::ByteSlice; use common_datavalues::prelude::*; use common_exception::ErrorCode; use common_exception::Result; -use itertools::izip; +use regex::bytes::Match; use regex::bytes::Regex; use crate::scalars::assert_string; @@ -39,8 +38,13 @@ pub struct RegexpInStrFunction { impl RegexpInStrFunction { pub fn try_create(display_name: &str, args: &[&DataTypePtr]) -> Result> { for (i, arg) in args.iter().enumerate() { + if arg.is_null() { + continue; + } + + let arg = remove_nullable(arg); if i < 2 || i == 5 { - assert_string(*arg)?; + assert_string(&arg)?; } else if !arg.data_type_id().is_integer() && !arg.data_type_id().is_string() { return Err(ErrorCode::IllegalDataType(format!( "Expected integer or string or null, but got {}", @@ -58,6 +62,7 @@ impl RegexpInStrFunction { FunctionDescription::creator(Box::new(Self::try_create)).features( FunctionFeatures::default() .deterministic() + .disable_passthrough_null() // 
disable passthrough null to validate the function arguments .variadic_arguments(2, 6), ) } @@ -69,7 +74,7 @@ impl Function for RegexpInStrFunction { } fn return_type(&self) -> DataTypePtr { - u64::to_data_type() + NullableType::arc(u64::to_data_type()) } // Notes: https://dev.mysql.com/doc/refman/8.0/en/regexp.html#function_regexp-instr @@ -79,6 +84,11 @@ impl Function for RegexpInStrFunction { columns: &ColumnsWithField, input_rows: usize, ) -> Result { + let has_null = columns.iter().any(|col| col.column().is_null()); + if has_null { + return Ok(NullColumn::new(input_rows).arc()); + } + let mut pos = ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc(); let mut occurrence = ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc(); let mut return_option = ConstColumn::new(Series::from_data(vec![0_i64]), input_rows).arc(); @@ -86,120 +96,130 @@ impl Function for RegexpInStrFunction { for i in 2..columns.len() { match i { - 2 => pos = cast_column_field(&columns[2], &Int64Type::arc())?, - 3 => occurrence = cast_column_field(&columns[3], &Int64Type::arc())?, - 4 => return_option = cast_column_field(&columns[4], &Int64Type::arc())?, - _ => match_type = cast_column_field(&columns[5], &StringType::arc())?, + 2 => pos = cast_column_field(&columns[2], &NullableType::arc(Int64Type::arc()))?, + 3 => { + occurrence = + cast_column_field(&columns[3], &NullableType::arc(Int64Type::arc()))? + } + 4 => { + return_option = + cast_column_field(&columns[4], &NullableType::arc(Int64Type::arc()))? + } + _ => { + match_type = + cast_column_field(&columns[5], &NullableType::arc(StringType::arc()))? 
+ } } } + let source = columns[0].column(); let pat = columns[1].column(); if pat.is_const() && match_type.is_const() { let pat_value = pat.get_string(0)?; let mt_value = match_type.get_string(0)?; + let columns = [source, &pos, &occurrence, &return_option]; - return Ok(Arc::new(self.a_regexp_instr_binary_scalar( - columns[0].column(), - &pat_value, - &pos, - &occurrence, - &return_option, - &mt_value, - )?)); + return self.a_regexp_instr_binary_scalar(&columns, &pat_value, &mt_value, input_rows); } - Ok(Arc::new(self.a_regexp_instr_binary( - columns[0].column(), - pat, - &pos, - &occurrence, - &return_option, - &match_type, - )?)) + let columns = [source, pat, &pos, &occurrence, &return_option, &match_type]; + self.a_regexp_instr_binary(&columns, input_rows) } } impl RegexpInStrFunction { fn a_regexp_instr_binary_scalar( &self, - source: &ColumnRef, + columns: &[&ColumnRef], pat: &[u8], - pos: &ColumnRef, - occurrence: &ColumnRef, - return_option: &ColumnRef, mt: &[u8], - ) -> Result { - let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(source.len()); + input_rows: usize, + ) -> Result { + let mut builder = NullableColumnBuilder::::with_capacity(columns[0].len()); - let source = Vu8::try_create_viewer(source)?; - let pos = i64::try_create_viewer(pos)?; - let occur = i64::try_create_viewer(occurrence)?; - let ro = i64::try_create_viewer(return_option)?; + let source = Vu8::try_create_viewer(columns[0])?; + let pos = i64::try_create_viewer(columns[1])?; + let occur = i64::try_create_viewer(columns[2])?; + let ro = i64::try_create_viewer(columns[3])?; let re = build_regexp_from_pattern(self.name(), pat, Some(mt))?; - let iter = izip!(source, pos, occur, ro); - for (s_value, pos_value, occur_value, ro_value) in iter { - if ro_value != 0 && ro_value != 1 { - return Err(ErrorCode::BadArguments(format!( - "Incorrect arguments to {}: return_option must be 1 or 0, but got {}", - self.name(), - ro_value - ))); + for row in 0..input_rows { + if 
source.null_at(row) || pos.null_at(row) || occur.null_at(row) || ro.null_at(row) { + builder.append_null(); + continue; } + + let s_value = source.value_at(row); + let pos_value = pos.value_at(row); + let occur_value = occur.value_at(row); + let ro_value = ro.value_at(row); + validate_regexp_arguments( + self.name(), + pos_value, + Some(occur_value), + Some(ro_value), + None, + )?; + if s_value.is_empty() || pat.is_empty() { - builder.append(0); + builder.append(0, true); continue; } let instr = regexp_instr(s_value, &re, pos_value, occur_value, ro_value); - - builder.append(instr); + builder.append(instr, true); } - Ok(builder.build_column()) + Ok(builder.build(input_rows)) } fn a_regexp_instr_binary( &self, - source: &ColumnRef, - pat: &ColumnRef, - pos: &ColumnRef, - occurrence: &ColumnRef, - return_option: &ColumnRef, - match_type: &ColumnRef, - ) -> Result { - let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(source.len()); + columns: &[&ColumnRef], + input_rows: usize, + ) -> Result { + let mut builder = NullableColumnBuilder::::with_capacity(columns[0].len()); let mut map: HashMap, Regex> = HashMap::new(); let mut key: Vec = Vec::new(); - let source = Vu8::try_create_viewer(source)?; - let pat = Vu8::try_create_viewer(pat)?; - let pos = i64::try_create_viewer(pos)?; - let occur = i64::try_create_viewer(occurrence)?; - let ro = i64::try_create_viewer(return_option)?; - let mt = Vu8::try_create_viewer(match_type)?; - - let iter = izip!(source, pat, pos, occur, ro, mt); - for (s_value, pat_value, pos_value, occur_value, ro_value, mt_value) in iter { - if ro_value != 0 && ro_value != 1 { - return Err(ErrorCode::BadArguments(format!( - "Incorrect arguments to {}: return_option must be 1 or 0, but got {}", - self.name(), - ro_value - ))); - } - if mt_value.starts_with_str("-") { - return Err(ErrorCode::BadArguments(format!( - "Incorrect arguments to {} match type: {}", - self.name(), - mt_value.to_str_lossy(), - ))); + let source = 
Vu8::try_create_viewer(columns[0])?; + let pat = Vu8::try_create_viewer(columns[1])?; + let pos = i64::try_create_viewer(columns[2])?; + let occur = i64::try_create_viewer(columns[3])?; + let ro = i64::try_create_viewer(columns[4])?; + let mt = Vu8::try_create_viewer(columns[5])?; + + for row in 0..input_rows { + if source.null_at(row) + || pat.null_at(row) + || pos.null_at(row) + || occur.null_at(row) + || ro.null_at(row) + || mt.null_at(row) + { + builder.append_null(); + continue; } + + let s_value = source.value_at(row); + let pat_value = pat.value_at(row); + let pos_value = pos.value_at(row); + let occur_value = occur.value_at(row); + let ro_value = ro.value_at(row); + let mt_value = mt.value_at(row); + validate_regexp_arguments( + self.name(), + pos_value, + Some(occur_value), + Some(ro_value), + Some(mt_value), + )?; + if s_value.is_empty() || pat_value.is_empty() { - builder.append(0); + builder.append(0, true); continue; } @@ -217,17 +237,16 @@ impl RegexpInStrFunction { let instr = regexp_instr(s_value, re, pos_value, occur_value, ro_value); - builder.append(instr); + builder.append(instr, true); } - Ok(builder.build_column()) + Ok(builder.build(input_rows)) } } #[inline] fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 { - let occur = if occur < 1 { 1 } else { occur }; - let pos = if pos < 1 { 0 } else { (pos - 1) as usize }; + let pos = (pos - 1) as usize; // set the index start from 0 // the 'pos' postion is the character index, // so we should iterate the character to find the byte index. 
@@ -236,20 +255,7 @@ fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 { None => return 0, }; - let mut i = 1_i64; - let m = loop { - let m = re.find_at(s, pos); - if i == occur || m.is_none() { - break m; - } - - i += 1; - if let Some(m) = m { - // set the start postion of 'find_at' function to the position following the matched substring - pos = m.end(); - } - }; - + let m = regexp_match_result(s, re, &mut pos, &occur); if m.is_none() { return 0; } @@ -272,6 +278,75 @@ fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 { instr as u64 } +#[inline] +pub fn regexp_match_result<'a>( + s: &'a [u8], + re: &Regex, + pos: &mut usize, + occur: &i64, +) -> Option> { + let mut i = 1_i64; + let m = loop { + let m = re.find_at(s, *pos); + if i >= *occur || m.is_none() { + break m; + } + + i += 1; + if let Some(m) = m { + // set the start postion of 'find_at' function to the position following the matched substring + *pos = m.end(); + } + }; + + m +} + +/// Validates the arguments of 'regexp_*' functions, returns error if any of arguments is invalid +/// and make the error logic the same as snowflake, since it is more reasonable and consistent +#[inline] +pub fn validate_regexp_arguments( + fn_name: &str, + pos: i64, + occur: Option, + ro: Option, + mt: Option<&[u8]>, +) -> Result<()> { + if pos < 1 { + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {}: position must be positive, but got {}", + fn_name, pos + ))); + } + if let Some(occur) = occur { + if occur < 1 { + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {}: occurrence must be positive, but got {}", + fn_name, occur + ))); + } + } + if let Some(ro) = ro { + if ro != 0 && ro != 1 { + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {}: return_option must be 1 or 0, but got {}", + fn_name, ro + ))); + } + } + if let Some(mt) = mt { + if mt.starts_with_str("-") { + return 
Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {} match type: {}", + fn_name, + mt.to_str_lossy(), + ))); + } + } + + Ok(()) +} + impl fmt::Display for RegexpInStrFunction { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.display_name) diff --git a/common/functions/src/scalars/strings/regexp_like.rs b/common/functions/src/scalars/strings/regexp_like.rs index d43928ccb907..147b244d71ce 100644 --- a/common/functions/src/scalars/strings/regexp_like.rs +++ b/common/functions/src/scalars/strings/regexp_like.rs @@ -69,36 +69,27 @@ impl Function for RegexpLikeFunction { &self, _func_ctx: FunctionContext, columns: &ColumnsWithField, - _input_rows: usize, + input_rows: usize, ) -> Result { - let col1: Result<&ConstColumn> = Series::check_get(columns[1].column()); - if let Ok(col1) = col1 { - let lhs = columns[0].column(); - let rhs = col1.get_string(0)?; - - if columns.len() == 3 { - if columns[2].column().is_const() { - let mt = columns[2].column().get_string(0)?; - return Ok(Arc::new(self.a_regexp_binary_scalar( - lhs, - &rhs, - Some(&mt), - )?)); - } - } else { - return Ok(Arc::new(self.a_regexp_binary_scalar(lhs, &rhs, None)?)); - } + let lhs = columns[0].column(); + let rhs = columns[1].column(); + let mut match_type = &ConstColumn::new(Series::from_data(vec![""]), input_rows).arc(); + if columns.len() == 3 { + match_type = columns[2].column(); } - let mut mt: Option<&ColumnRef> = None; - if columns.len() == 3 { - mt = Some(columns[2].column()) + if rhs.is_const() && match_type.is_const() { + let pat = rhs.get_string(0)?; + let mt = match_type.get_string(0)?; + + return Ok(Arc::new(self.a_regexp_binary_scalar( + lhs, + &pat, + Some(&mt), + )?)); } - Ok(Arc::new(self.a_regexp_binary( - columns[0].column(), - columns[1].column(), - mt, - )?)) + + Ok(Arc::new(self.a_regexp_binary(lhs, rhs, match_type)?)) } } @@ -131,7 +122,7 @@ impl RegexpLikeFunction { &self, lhs: &ColumnRef, rhs: &ColumnRef, - mt: Option<&ColumnRef>, + mt: 
&ColumnRef, ) -> Result { let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(lhs.len()); @@ -141,46 +132,30 @@ impl RegexpLikeFunction { let lhs = Vu8::try_create_viewer(lhs)?; let rhs = Vu8::try_create_viewer(rhs)?; - if let Some(mt) = mt { - let mt = Vu8::try_create_viewer(mt)?; - let iter = izip!(lhs, rhs, mt); - for (lhs_value, rhs_value, mt_value) in iter { - if mt_value.starts_with_str("-") { - return Err(ErrorCode::BadArguments(format!( - "Incorrect arguments to {} match type: {}", - self.name(), - mt_value.to_str_lossy(), - ))); - } - key.extend_from_slice(rhs_value); - key.extend_from_slice("-".as_bytes()); - key.extend_from_slice(mt_value); - - let pattern = if let Some(pattern) = map.get(&key) { - pattern - } else { - let re = build_regexp_from_pattern(self.name(), rhs_value, Some(mt_value))?; - map.insert(key.clone(), re); - map.get(&key).unwrap() - }; - key.clear(); - - builder.append(pattern.is_match(lhs_value)); - } - } else { - for (lhs_value, rhs_value) in lhs.zip(rhs) { - key.extend_from_slice(rhs_value); - let pattern = if let Some(pattern) = map.get(&key) { - pattern - } else { - let re = build_regexp_from_pattern(self.name(), rhs_value, None)?; - map.insert(key.clone(), re); - map.get(&key).unwrap() - }; - key.clear(); - - builder.append(pattern.is_match(lhs_value)); + let mt = Vu8::try_create_viewer(mt)?; + let iter = izip!(lhs, rhs, mt); + for (lhs_value, rhs_value, mt_value) in iter { + if mt_value.starts_with_str("-") { + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {} match type: {}", + self.name(), + mt_value.to_str_lossy(), + ))); } + key.extend_from_slice(rhs_value); + key.extend_from_slice("-".as_bytes()); + key.extend_from_slice(mt_value); + + let pattern = if let Some(pattern) = map.get(&key) { + pattern + } else { + let re = build_regexp_from_pattern(self.name(), rhs_value, Some(mt_value))?; + map.insert(key.clone(), re); + map.get(&key).unwrap() + }; + key.clear(); + + 
builder.append(pattern.is_match(lhs_value)); } Ok(builder.build_column()) diff --git a/common/functions/src/scalars/strings/regexp_replace.rs b/common/functions/src/scalars/strings/regexp_replace.rs new file mode 100644 index 000000000000..9d7cc2ca28ba --- /dev/null +++ b/common/functions/src/scalars/strings/regexp_replace.rs @@ -0,0 +1,304 @@ +// Copyright 2022 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt; + +use bstr::ByteSlice; +use common_datavalues::prelude::*; +use common_exception::ErrorCode; +use common_exception::Result; +use regex::bytes::Regex; + +use crate::scalars::assert_string; +use crate::scalars::cast_column_field; +use crate::scalars::strings::regexp_instr::regexp_match_result; +use crate::scalars::strings::regexp_instr::validate_regexp_arguments; +use crate::scalars::strings::regexp_like::build_regexp_from_pattern; +use crate::scalars::Function; +use crate::scalars::FunctionContext; +use crate::scalars::FunctionDescription; +use crate::scalars::FunctionFeatures; + +#[derive(Clone)] +pub struct RegexpReplaceFunction { + display_name: String, +} + +impl RegexpReplaceFunction { + pub fn try_create(display_name: &str, args: &[&DataTypePtr]) -> Result> { + for (i, arg) in args.iter().enumerate() { + if arg.is_null() { + continue; + } + + let arg = remove_nullable(arg); + if i < 3 || i == 5 { + assert_string(&arg)?; + } else if !arg.data_type_id().is_integer() && 
!arg.data_type_id().is_string() { + return Err(ErrorCode::IllegalDataType(format!( + "Expected integer or string or null, but got {}", + args[i].data_type_id() + ))); + } + } + + Ok(Box::new(Self { + display_name: display_name.to_string(), + })) + } + + pub fn desc() -> FunctionDescription { + FunctionDescription::creator(Box::new(Self::try_create)).features( + FunctionFeatures::default() + .deterministic() + .disable_passthrough_null() // disable passthrough null to validate the function arguments + .variadic_arguments(3, 6), + ) + } +} + +impl Function for RegexpReplaceFunction { + fn name(&self) -> &str { + &self.display_name + } + + fn return_type(&self) -> DataTypePtr { + NullableType::arc(StringType::arc()) + } + + // Notes: https://dev.mysql.com/doc/refman/8.0/en/regexp.html#function_regexp-replace + fn eval( + &self, + _func_ctx: FunctionContext, + columns: &ColumnsWithField, + input_rows: usize, + ) -> Result { + let has_null = columns.iter().any(|col| col.column().is_null()); + if has_null { + return Ok(NullColumn::new(input_rows).arc()); + } + + let mut pos = ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc(); + let mut occurrence = ConstColumn::new(Series::from_data(vec![0_i64]), input_rows).arc(); + let mut match_type = ConstColumn::new(Series::from_data(vec![""]), input_rows).arc(); + + for i in 3..columns.len() { + match i { + 3 => pos = cast_column_field(&columns[3], &NullableType::arc(Int64Type::arc()))?, + 4 => { + occurrence = + cast_column_field(&columns[4], &NullableType::arc(Int64Type::arc()))? + } + _ => { + match_type = + cast_column_field(&columns[5], &NullableType::arc(StringType::arc()))? 
+ } + } + } + + let source = columns[0].column(); + let pat = columns[1].column(); + let repl = columns[2].column(); + + if pat.is_const() && match_type.is_const() { + let pat_value = pat.get_string(0)?; + let mt_value = match_type.get_string(0)?; + let columns = [source, repl, &pos, &occurrence]; + + return self.a_regexp_replace_binary_scalar( + &columns[..], + &pat_value, + &mt_value, + input_rows, + ); + } + + let columns = [source, pat, repl, &pos, &occurrence, &match_type]; + self.a_regexp_replace_binary(&columns[..], input_rows) + } +} + +impl RegexpReplaceFunction { + fn a_regexp_replace_binary_scalar( + &self, + columns: &[&ColumnRef], + pat: &[u8], + mt: &[u8], + input_rows: usize, + ) -> Result { + let re = build_regexp_from_pattern(self.name(), pat, Some(mt))?; + + let source = Vu8::try_create_viewer(columns[0])?; + let repl = Vu8::try_create_viewer(columns[1])?; + let pos = i64::try_create_viewer(columns[2])?; + let occur = i64::try_create_viewer(columns[3])?; + + let mut_string_col = MutableStringColumn::with_values_capacity( + source.value_at(0).len() * input_rows, + input_rows + 1, + ); + let mut builder = MutableNullableColumn::new(Box::new(mut_string_col), self.return_type()); + + let mut buf = Vec::with_capacity(source.value_at(0).len()); + for row in 0..input_rows { + if source.null_at(row) || repl.null_at(row) || pos.null_at(row) || occur.null_at(row) { + builder.append_default(); + continue; + } + + let s_value = source.value_at(row); + let repl_value = repl.value_at(row); + let pos_value = pos.value_at(row); + let occur_value = occur.value_at(row); + + validate_regexp_arguments(self.name(), pos_value, None, None, None)?; + if occur_value < 0 { + // unlike the other regexp_* functions, regexp_replace accepts an occurrence of '0', + // so the occurrence value is validated here separately + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {}: occurrence must not be negative, but got {}", + self.name(), +
occur_value + ))); + } + + if s_value.is_empty() || pat.is_empty() { + builder.append_data_value(s_value.into())?; + continue; + } + + regexp_replace(s_value, &re, repl_value, pos_value, occur_value, &mut buf); + builder.append_data_value(buf.clone().into())?; + buf.clear(); + } + + Ok(builder.to_column()) + } + + fn a_regexp_replace_binary( + &self, + columns: &[&ColumnRef], + input_rows: usize, + ) -> Result { + let mut map: HashMap, Regex> = HashMap::new(); + let mut key: Vec = Vec::new(); + + let source = Vu8::try_create_viewer(columns[0])?; + let pat = Vu8::try_create_viewer(columns[1])?; + let repl = Vu8::try_create_viewer(columns[2])?; + let pos = i64::try_create_viewer(columns[3])?; + let occur = i64::try_create_viewer(columns[4])?; + let mt = Vu8::try_create_viewer(columns[5])?; + + let mut_string_col = MutableStringColumn::with_values_capacity( + source.value_at(0).len() * input_rows, + input_rows + 1, + ); + let mut builder = MutableNullableColumn::new(Box::new(mut_string_col), self.return_type()); + + let mut buf = Vec::with_capacity(source.value_at(0).len()); + for row in 0..input_rows { + if source.null_at(row) + || pat.null_at(row) + || repl.null_at(row) + || pos.null_at(row) + || occur.null_at(row) + || mt.null_at(row) + { + builder.append_default(); + continue; + } + + let s_value = source.value_at(row); + let pat_value = pat.value_at(row); + let repl_value = repl.value_at(row); + let pos_value = pos.value_at(row); + let occur_value = occur.value_at(row); + let mt_value = mt.value_at(row); + + validate_regexp_arguments(self.name(), pos_value, None, None, Some(mt_value))?; + if occur_value < 0 { + // unlike the other regexp_* functions, regexp_replace accepts an occurrence of '0', + // so the occurrence value is validated here separately + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {}: occurrence must not be negative, but got {}", + self.name(), + occur_value + ))); + } + validate_regexp_arguments(self.name(), 
pos_value, None, None, Some(mt_value))?; + + if s_value.is_empty() || pat_value.is_empty() { + builder.append_data_value(s_value.into())?; + continue; + } + + key.extend_from_slice(pat_value); + key.extend_from_slice("-".as_bytes()); + key.extend_from_slice(mt_value); + let re = if let Some(re) = map.get(&key) { + re + } else { + let re = build_regexp_from_pattern(self.name(), pat_value, Some(mt_value))?; + map.insert(key.clone(), re); + map.get(&key).unwrap() + }; + key.clear(); + + regexp_replace(s_value, re, repl_value, pos_value, occur_value, &mut buf); + builder.append_data_value(buf.clone().into())?; + buf.clear(); + } + + Ok(builder.to_column()) + } +} + +#[inline] +fn regexp_replace(s: &[u8], re: &Regex, repl: &[u8], pos: i64, occur: i64, buf: &mut Vec) { + let pos = (pos - 1) as usize; // set the index start from 0 + + // the 'pos' position is a character index, + // so we iterate over the characters to find the corresponding byte index. + let mut pos = match s.char_indices().nth(pos) { + Some((start, _, _)) => start, + None => { + buf.extend_from_slice(s); + return; + } + }; + + let m = regexp_match_result(s, re, &mut pos, &occur); + if m.is_none() { + buf.extend_from_slice(s); + return; + } + + buf.extend_from_slice(&s[..m.unwrap().start()]); + + if occur == 0 { + let s = &s[m.unwrap().start()..]; + buf.extend_from_slice(&re.replace_all(s, repl)); + } else { + buf.extend_from_slice(repl); + buf.extend_from_slice(&s[m.unwrap().end()..]) + } +} + +impl fmt::Display for RegexpReplaceFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.display_name) + } +} diff --git a/common/functions/src/scalars/strings/regexp_substr.rs b/common/functions/src/scalars/strings/regexp_substr.rs index 4c5a957099e2..d8120fe95bc0 100644 --- a/common/functions/src/scalars/strings/regexp_substr.rs +++ b/common/functions/src/scalars/strings/regexp_substr.rs @@ -19,11 +19,12 @@ use bstr::ByteSlice; use common_datavalues::prelude::*; use 
common_exception::ErrorCode; use common_exception::Result; -use itertools::izip; use regex::bytes::Regex; use crate::scalars::assert_string; use crate::scalars::cast_column_field; +use crate::scalars::strings::regexp_instr::regexp_match_result; +use crate::scalars::strings::regexp_instr::validate_regexp_arguments; use crate::scalars::strings::regexp_like::build_regexp_from_pattern; use crate::scalars::Function; use crate::scalars::FunctionContext; @@ -38,8 +39,13 @@ pub struct RegexpSubStrFunction { impl RegexpSubStrFunction { pub fn try_create(display_name: &str, args: &[&DataTypePtr]) -> Result> { for (i, arg) in args.iter().enumerate() { + if arg.is_null() { + continue; + } + + let arg = remove_nullable(arg); if i < 2 || i == 4 { - assert_string(*arg)?; + assert_string(&arg)?; } else if !arg.data_type_id().is_integer() && !arg.data_type_id().is_string() { return Err(ErrorCode::IllegalDataType(format!( "Expected integer or string or null, but got {}", @@ -57,6 +63,7 @@ impl RegexpSubStrFunction { FunctionDescription::creator(Box::new(Self::try_create)).features( FunctionFeatures::default() .deterministic() + .disable_passthrough_null() // disable passthrough null to validate the function arguments .variadic_arguments(2, 5), ) } @@ -78,15 +85,26 @@ impl Function for RegexpSubStrFunction { columns: &ColumnsWithField, input_rows: usize, ) -> Result { + let has_null = columns.iter().any(|col| col.column().is_null()); + if has_null { + return Ok(NullColumn::new(input_rows).arc()); + } + let mut pos = ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc(); let mut occurrence = ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc(); let mut match_type = ConstColumn::new(Series::from_data(vec![""]), input_rows).arc(); for i in 2..columns.len() { match i { - 2 => pos = cast_column_field(&columns[2], &Int64Type::arc())?, - 3 => occurrence = cast_column_field(&columns[3], &Int64Type::arc())?, - _ => match_type = cast_column_field(&columns[4], 
&StringType::arc())?, + 2 => pos = cast_column_field(&columns[2], &NullableType::arc(Int64Type::arc()))?, + 3 => { + occurrence = + cast_column_field(&columns[3], &NullableType::arc(Int64Type::arc()))? + } + _ => { + match_type = + cast_column_field(&columns[4], &NullableType::arc(StringType::arc()))? + } } } @@ -95,48 +113,47 @@ impl Function for RegexpSubStrFunction { if pat.is_const() && match_type.is_const() { let pat_value = pat.get_string(0)?; let mt_value = match_type.get_string(0)?; + let columns = [columns[0].column(), &pos, &occurrence]; - return self.a_regexp_substr_binary_scalar( - columns[0].column(), - &pat_value, - &pos, - &occurrence, - &mt_value, - input_rows, - ); + return self.a_regexp_substr_binary_scalar(&columns, &pat_value, &mt_value, input_rows); } - self.a_regexp_substr_binary( - columns[0].column(), - pat, - &pos, - &occurrence, - &match_type, - input_rows, - ) + let columns = [columns[0].column(), pat, &pos, &occurrence, &match_type]; + self.a_regexp_substr_binary(&columns, input_rows) + } + + fn passthrough_constant(&self) -> bool { + false } } impl RegexpSubStrFunction { fn a_regexp_substr_binary_scalar( &self, - source: &ColumnRef, + columns: &[&ColumnRef], pat: &[u8], - pos: &ColumnRef, - occurrence: &ColumnRef, mt: &[u8], input_rows: usize, ) -> Result { - let mut builder = NullableColumnBuilder::::with_capacity(source.len()); + let mut builder = NullableColumnBuilder::::with_capacity(columns[0].len()); - let source = Vu8::try_create_viewer(source)?; - let pos = i64::try_create_viewer(pos)?; - let occur = i64::try_create_viewer(occurrence)?; + let source = Vu8::try_create_viewer(columns[0])?; + let pos = i64::try_create_viewer(columns[1])?; + let occur = i64::try_create_viewer(columns[2])?; let re = build_regexp_from_pattern(self.name(), pat, Some(mt))?; - let iter = izip!(source, pos, occur); - for (s_value, pos_value, occur_value) in iter { + for row in 0..input_rows { + if source.null_at(row) || pos.null_at(row) || 
occur.null_at(row) { + builder.append_null(); + continue; + } + + let s_value = source.value_at(row); + let pos_value = pos.value_at(row); + let occur_value = occur.value_at(row); + validate_regexp_arguments(self.name(), pos_value, Some(occur_value), None, None)?; + if s_value.is_empty() || pat.is_empty() { builder.append_null(); continue; @@ -154,33 +171,44 @@ impl RegexpSubStrFunction { fn a_regexp_substr_binary( &self, - source: &ColumnRef, - pat: &ColumnRef, - pos: &ColumnRef, - occurrence: &ColumnRef, - match_type: &ColumnRef, + columns: &[&ColumnRef], input_rows: usize, ) -> Result { - let mut builder = NullableColumnBuilder::::with_capacity(source.len()); + let mut builder = NullableColumnBuilder::::with_capacity(columns[0].len()); let mut map: HashMap, Regex> = HashMap::new(); let mut key: Vec = Vec::new(); - let source = Vu8::try_create_viewer(source)?; - let pat = Vu8::try_create_viewer(pat)?; - let pos = i64::try_create_viewer(pos)?; - let occur = i64::try_create_viewer(occurrence)?; - let mt = Vu8::try_create_viewer(match_type)?; - - let iter = izip!(source, pat, pos, occur, mt); - for (s_value, pat_value, pos_value, occur_value, mt_value) in iter { - if mt_value.starts_with_str("-") { - return Err(ErrorCode::BadArguments(format!( - "Incorrect arguments to {} match type: {}", - self.name(), - mt_value.to_str_lossy(), - ))); + let source = Vu8::try_create_viewer(columns[0])?; + let pat = Vu8::try_create_viewer(columns[1])?; + let pos = i64::try_create_viewer(columns[2])?; + let occur = i64::try_create_viewer(columns[3])?; + let mt = Vu8::try_create_viewer(columns[4])?; + + for row in 0..input_rows { + if source.null_at(row) + || pat.null_at(row) + || pos.null_at(row) + || occur.null_at(row) + || mt.null_at(row) + { + builder.append_null(); + continue; } + + let s_value = source.value_at(row); + let pat_value = pat.value_at(row); + let pos_value = pos.value_at(row); + let occur_value = occur.value_at(row); + let mt_value = mt.value_at(row); + 
validate_regexp_arguments( + self.name(), + pos_value, + Some(occur_value), + None, + Some(mt_value), + )?; + if s_value.is_empty() || pat_value.is_empty() { builder.append_null(); continue; @@ -222,19 +250,7 @@ fn regexp_substr<'a>(s: &'a [u8], re: &Regex, pos: i64, occur: i64) -> Option<&' None => return None, }; - let mut i = 1_i64; - let m = loop { - let m = re.find_at(s, pos); - if i == occur || m.is_none() { - break m; - } - - i += 1; - if let Some(m) = m { - // set the start postion of 'find_at' function to the position following the matched substring - pos = m.end(); - } - }; + let m = regexp_match_result(s, re, &mut pos, &occur); m.map(|m| m.as_bytes()) } diff --git a/common/functions/src/scalars/strings/string.rs b/common/functions/src/scalars/strings/string.rs index 8a50a9a011ca..6feb6c2f757c 100644 --- a/common/functions/src/scalars/strings/string.rs +++ b/common/functions/src/scalars/strings/string.rs @@ -44,6 +44,7 @@ use crate::scalars::QuoteFunction; use crate::scalars::RTrimFunction; use crate::scalars::RegexpInStrFunction; use crate::scalars::RegexpLikeFunction; +use crate::scalars::RegexpReplaceFunction; use crate::scalars::RegexpSubStrFunction; use crate::scalars::RepeatFunction; use crate::scalars::ReplaceFunction; @@ -85,6 +86,7 @@ impl StringFunction { factory.register("length", LengthFunction::desc()); factory.register("regexp_instr", RegexpInStrFunction::desc()); factory.register("regexp_like", RegexpLikeFunction::desc()); + factory.register("regexp_replace", RegexpReplaceFunction::desc()); factory.register("regexp_substr", RegexpSubStrFunction::desc()); factory.register("bin", BinFunction::desc()); factory.register("oct", OctFunction::desc()); diff --git a/common/functions/tests/it/scalars/strings/mod.rs b/common/functions/tests/it/scalars/strings/mod.rs index 3fdd46b9c4b1..911ed1d1497e 100644 --- a/common/functions/tests/it/scalars/strings/mod.rs +++ b/common/functions/tests/it/scalars/strings/mod.rs @@ -17,6 +17,7 @@ mod locate; mod 
lower; mod regexp_instr; mod regexp_like; +mod regexp_replace; mod regexp_substr; mod substring; mod trim; diff --git a/common/functions/tests/it/scalars/strings/regexp_instr.rs b/common/functions/tests/it/scalars/strings/regexp_instr.rs index a1866e386688..3a1c0da92413 100644 --- a/common/functions/tests/it/scalars/strings/regexp_instr.rs +++ b/common/functions/tests/it/scalars/strings/regexp_instr.rs @@ -27,7 +27,7 @@ fn test_regexp_instr_function() -> Result<()> { Series::from_data(vec!["dog cat dog", "aa aaa aaaa aa aaa aaaa", ""]), Series::from_data(vec!["dog", "a{2}", ""]), ], - expect: Series::from_data(vec![1_u64, 1, 0]), + expect: Series::from_data(vec![Some(1_u64), Some(1), Some(0)]), error: "", }, ScalarFunctionTest { @@ -37,7 +37,7 @@ fn test_regexp_instr_function() -> Result<()> { Series::from_data(vec!["dog", "a{2}", ""]), Series::from_data(vec![1_i64, 2, 1]), ], - expect: Series::from_data(vec![1_u64, 4, 0]), + expect: Series::from_data(vec![Some(1_u64), Some(4), Some(0)]), error: "", }, ScalarFunctionTest { @@ -52,7 +52,7 @@ fn test_regexp_instr_function() -> Result<()> { Series::from_data(vec![1_i64, 1, 9]), Series::from_data(vec![2_i64, 3, 2]), ], - expect: Series::from_data(vec![9_u64, 7, 15]), + expect: Series::from_data(vec![Some(9_u64), Some(7), Some(15)]), error: "", }, ScalarFunctionTest { @@ -68,7 +68,7 @@ fn test_regexp_instr_function() -> Result<()> { Series::from_data(vec![2_i64, 2, 2]), Series::from_data(vec![0_i64, 1, 1]), ], - expect: Series::from_data(vec![9_u64, 10, 24]), + expect: Series::from_data(vec![Some(9_u64), Some(10), Some(24)]), error: "", }, ScalarFunctionTest { @@ -85,7 +85,7 @@ fn test_regexp_instr_function() -> Result<()> { Series::from_data(vec![0_i64, 1, 1]), Series::from_data(vec!["i", "c", "i"]), ], - expect: Series::from_data(vec![9_u64, 0, 24]), + expect: Series::from_data(vec![Some(9_u64), Some(0), Some(24)]), error: "", }, ScalarFunctionTest { @@ -102,9 +102,21 @@ fn test_regexp_instr_function() -> Result<()> { 
Series::from_data(vec![1_i64, 2, 3, 1]), Series::from_data(vec![0_i64, 1, 1, 1]), ], - expect: Series::from_data(vec![1_u64, 9, 14, 9]), + expect: Series::from_data(vec![Some(1_u64), Some(9), Some(14), Some(9)]), error: "", }, + ScalarFunctionTest { + name: "regexp-instr-position-error", + columns: vec![ + Series::from_data(vec!["dog cat dog"]), + Series::from_data(vec!["dog"]), + Series::from_data(vec![0_i64]), + Series::from_data(vec![1_i64]), + Series::from_data(vec![0_i64]), + ], + expect: Series::from_data(Vec::::new()), + error: "Incorrect arguments to regexp_instr: position must be positive, but got 0", + }, ScalarFunctionTest { name: "regexp-instr-return-option-error", columns: vec![ @@ -159,7 +171,7 @@ fn test_regexp_instr_constant_column() -> Result<()> { Series::from_data(vec![2_i64, 1, 1]), Series::from_data(vec![0_i64, 0, 1]), ], - expect: Series::from_data(vec![9_u64, 5, 8]), + expect: Series::from_data(vec![Some(9_u64), Some(5), Some(8)]), error: "", }, ScalarFunctionTest { diff --git a/common/functions/tests/it/scalars/strings/regexp_replace.rs b/common/functions/tests/it/scalars/strings/regexp_replace.rs new file mode 100644 index 000000000000..dc9a7c08642e --- /dev/null +++ b/common/functions/tests/it/scalars/strings/regexp_replace.rs @@ -0,0 +1,187 @@ +// Copyright 2022 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use common_datavalues::prelude::*; +use common_exception::Result; + +use crate::scalars::scalar_function_test::test_scalar_functions; +use crate::scalars::scalar_function_test::ScalarFunctionTest; + +#[test] +fn test_regexp_replace_function() -> Result<()> { + let tests = vec![ + ScalarFunctionTest { + name: "regexp-replace-three-column-passed", + columns: vec![ + Series::from_data(vec!["a b c", "a b c", "a b c", ""]), + Series::from_data(vec!["b", "x", "", "b"]), + Series::from_data(vec!["X", "X", "X", "X"]), + ], + expect: Series::from_data(vec![Some("a X c"), Some("a b c"), Some("a b c"), Some("")]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-replace-four-column-passed", + columns: vec![ + Series::from_data(vec![ + "abc def ghi", + "abc def ghi", + "abc def ghi", + "abc def ghi", + ]), + Series::from_data(vec!["[a-z]+", "[a-z]+", "[a-z]+", "[a-z]+"]), + Series::from_data(vec!["X", "X", "X", "X"]), + Series::from_data(vec![1, 4, 8, 12]), + ], + expect: Series::from_data(vec![ + Some("X X X"), + Some("abc X X"), + Some("abc def X"), + Some("abc def ghi"), + ]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-replace-five-column-passed", + columns: vec![ + Series::from_data(vec![ + "abc def ghi", + "abc def ghi", + "abc def ghi", + "abc def ghi", + ]), + Series::from_data(vec!["[a-z]+", "[a-z]+", "[a-z]+", "[a-z]+"]), + Series::from_data(vec!["X", "X", "X", "X"]), + Series::from_data(vec![1, 1, 4, 4]), + Series::from_data(vec![0, 1, 2, 3]), + ], + expect: Series::from_data(vec![ + Some("X X X"), + Some("X def ghi"), + Some("abc def X"), + Some("abc def ghi"), + ]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-replace-six-column-passed", + columns: vec![ + Series::from_data(vec!["abc def ghi", "abc DEF ghi", "abc DEF ghi"]), + Series::from_data(vec!["[a-z]+", "[a-z]+", "[a-z]+"]), + Series::from_data(vec!["X", "X", "X"]), + Series::from_data(vec![1, 1, 4]), + Series::from_data(vec![0, 2, 1]), + Series::from_data(vec!["", "c", 
"i"]), + ], + expect: Series::from_data(vec![Some("X X X"), Some("abc DEF X"), Some("abc X ghi")]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-replace-multi-byte-character-passed", + columns: vec![ + Series::from_data(vec![ + "周 周周 周周周 周周周周", + "周 周周 周周周 周周周周", + "周 周周 周周周 周周周周", + "周 周周 周周周 周周周周", + ]), + Series::from_data(vec!["周+", "周+", "周+", "周+"]), + Series::from_data(vec!["唐", "唐", "唐", "唐"]), + Series::from_data(vec![1, 2, 3, 5]), + Series::from_data(vec![0, 1, 3, 1]), + ], + expect: Series::from_data(vec![ + Some("唐 唐 唐 唐"), + Some("周 唐 周周周 周周周周"), + Some("周 周周 周周周 唐"), + Some("周 周周 唐 周周周周"), + ]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-replace-position-error", + columns: vec![ + Series::from_data(vec!["a b c"]), + Series::from_data(vec!["b"]), + Series::from_data(vec!["X"]), + Series::from_data(vec![0]), + ], + expect: Series::from_data(Vec::<&str>::new()), + error: "Incorrect arguments to regexp_replace: position must be positive, but got 0", + }, + ScalarFunctionTest { + name: "regexp-replace-occurrence-error", + columns: vec![ + Series::from_data(vec!["a b c"]), + Series::from_data(vec!["b"]), + Series::from_data(vec!["X"]), + Series::from_data(vec![1]), + Series::from_data(vec![-1]), + ], + expect: Series::from_data(Vec::<&str>::new()), + error: + "Incorrect arguments to regexp_replace: occurrence must not be negative, but got -1", + }, + ScalarFunctionTest { + name: "regexp-replace-match-type-error", + columns: vec![ + Series::from_data(vec!["a b c"]), + Series::from_data(vec!["b"]), + Series::from_data(vec!["X"]), + Series::from_data(vec![1]), + Series::from_data(vec![0]), + Series::from_data(vec!["-c"]), + ], + expect: Series::from_data(Vec::<&str>::new()), + error: "Incorrect arguments to regexp_replace match type: -c", + }, + ]; + + test_scalar_functions("regexp_replace", &tests) +} + +#[test] +fn test_regexp_replace_constant_column() -> Result<()> { + let data_type = DataValue::String("[a-z]+".as_bytes().into()); 
+ let data_value1 = StringType::arc().create_constant_column(&data_type, 3)?; + let data_value2 = StringType::arc().create_constant_column(&data_type, 3)?; + + let tests = vec![ + ScalarFunctionTest { + name: "regexp-repalce-const-column-passed", + columns: vec![ + Series::from_data(vec!["abc def ghi", "abc def ghi", "abc def ghi"]), + data_value1, + Series::from_data(vec!["X", "X", "X"]), + Series::from_data(vec![1, 1, 1]), + Series::from_data(vec![0, 1, 2]), + ], + expect: Series::from_data(vec![Some("X X X"), Some("X def ghi"), Some("abc X ghi")]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-instr-const-column-position-error", + columns: vec![ + Series::from_data(vec!["abc def ghi", "abc def ghi", "abc def ghi"]), + data_value2, + Series::from_data(vec!["X", "X", "X"]), + Series::from_data(vec![1, 0, -1]), + ], + expect: Series::from_data(Vec::<&str>::new()), + error: "Incorrect arguments to regexp_replace: position must be positive, but got 0", + }, + ]; + + test_scalar_functions("regexp_replace", &tests) +} diff --git a/docs/doc/100-faq/40-how-to-write-scalar-functions.md b/docs/doc/100-faq/40-how-to-write-scalar-functions.md index 964b4bd9b73e..427b32f0a125 100644 --- a/docs/doc/100-faq/40-how-to-write-scalar-functions.md +++ b/docs/doc/100-faq/40-how-to-write-scalar-functions.md @@ -26,7 +26,7 @@ Scalar functions (sometimes referred to as User-Defined Functions / UDFs) return ### Knowledge before writing the eval function -#### Logical datatypes and physical datatypes. +#### Logical datatypes and physical datatypes. Logical datatypes are the datatypes that we use in Databend, and physical datatypes are the datatypes that we use in the execution/compute engine. Such as `Date32`, it's a logical data type, but its physical is `Int32`, so its column is represented by `Int32Column`. 
@@ -34,7 +34,7 @@ Such as `Date32`, it's a logical data type, but its physical is `Int32`, so its We can get logical datatype by `data_type` function of `DataField` , and the physical datatype by `data_type` function in `ColumnRef`. `ColumnsWithField` has `data_type` function which returns the logical datatype. -#### Arrow's memory layout +#### Arrow's memory layout Databend's memory layout is based on the Arrow system, you can find Arrow's memory layout [here] (https://arrow.apache.org/docs/format/Columnar.html#format-columnar). @@ -162,7 +162,7 @@ impl Function for SqrtFunction { Float64Type::arc() } - fn eval(&self, columns: &ColumnsWithField, _input_rows: usize, _func_ctx: FunctionContext) -> Result{ + fn eval(&self, _func_ctx: FunctionContext, columns: &ColumnsWithField, _input_rows: usize) -> Result { let mut ctx = EvalContext::default(); with_match_primitive_type_id!(columns[0].data_type().data_type_id(), |$S| { let col = scalar_unary_op::<$S, f64, _>(columns[0].column(), sqrt::<$S>, &mut ctx)?; diff --git a/docs/doc/30-reference/20-functions/40-string-functions/regexp_replace.md b/docs/doc/30-reference/20-functions/40-string-functions/regexp_replace.md new file mode 100644 index 000000000000..9809e56c0570 --- /dev/null +++ b/docs/doc/30-reference/20-functions/40-string-functions/regexp_replace.md @@ -0,0 +1,51 @@ +--- +title: REGEXP_REPLACE +--- + +Replaces occurrences in the string `expr` that match the regular expression specified by the pattern `pat` with the replacement string `repl`, and returns the resulting string. If `expr`, `pat`, or `repl` is NULL, the return value is NULL. + +## Syntax + +```sql +REGEXP_REPLACE(expr, pat, repl[, pos[, occurrence[, match_type]]]) +``` + +## Arguments + +| Arguments | Description | +| ----------- | ----------- | +| expr | The string expression to be matched | +| pat | The regular expression | +| repl | The replacement string | +| pos | Optional. The position in expr at which to start the search. 
If omitted, the default is 1. | +| occurrence | Optional. Which occurrence of a match to replace. If omitted, the default is 0 (which means "replace all occurrences"). | +| match_type | Optional. A string that specifies how to perform matching. The meaning is as described for REGEXP_LIKE(). | + +## Return Type + +A String data type value. + +## Examples + +```sql +SELECT REGEXP_REPLACE('a b c', 'b', 'X'); ++-----------------------------------+ +| REGEXP_REPLACE('a b c', 'b', 'X') | ++-----------------------------------+ +| a X c | ++-----------------------------------+ + +SELECT REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 3); ++----------------------------------------------------+ +| REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 3) | ++----------------------------------------------------+ +| abc def X | ++----------------------------------------------------+ + +SELECT REGEXP_REPLACE('周 周周 周周周', '周+', 'X', 3, 2); ++-----------------------------------------------------------+ +| REGEXP_REPLACE('周 周周 周周周', '周+', 'X', 3, 2) | ++-----------------------------------------------------------+ +| 周 周周 X | ++-----------------------------------------------------------+ +``` diff --git a/tests/suites/0_stateless/02_function/02_0050_function_string_regexp_instr.sql b/tests/suites/0_stateless/02_function/02_0050_function_string_regexp_instr.sql index be972127c0de..7445b1313077 100644 --- a/tests/suites/0_stateless/02_function/02_0050_function_string_regexp_instr.sql +++ b/tests/suites/0_stateless/02_function/02_0050_function_string_regexp_instr.sql @@ -13,6 +13,5 @@ SELECT REGEXP_INSTR('周 周周 周周周 周周周周', '周+', 2, 3, 1); DROP TABLE IF EXISTS t1; CREATE TABLE t1(s String NULL, pat String NULL, pos Int64 NULL, occu Int64 NULL, ro Int64 NULL, mt String NULL) Engine = Memory; INSERT INTO t1 (s, pat, pos, occu, ro, mt) VALUES (NULL, 'dog', 1, 1, 1, ''), ('dog cat dog', 'dog', NULL, 1, 1, 'c'), ('dog cat dog', 'dog', 1, 1, 1, 'c'), ('dog cat dog', 'dog', 1, 1, 1, NULL); -select s 
from t1 where regexp_instr(s, pat, pos, occu, ro, mt) = 4; -drop table t1; - +SELECT s FROM t1 WHERE REGEXP_INSTR(s, pat, pos, occu, ro, mt) = 4; +DROP TABLE t1; diff --git a/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.result b/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.result index 1aa1f8a50d10..0032fc0a25bf 100644 --- a/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.result +++ b/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.result @@ -7,3 +7,9 @@ b 周周周周 NULL NULL +NULL +NULL +NULL +NULL +====== +abc def ghi diff --git a/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.sql b/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.sql index 72dd9db52fb9..18f2f485f023 100644 --- a/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.sql +++ b/tests/suites/0_stateless/02_function/02_0052_function_string_regexp_substr.sql @@ -5,5 +5,14 @@ SELECT REGEXP_SUBSTR('周周周周', '.*', 2); SELECT REGEXP_SUBSTR('🍣🍣b', 'b', 2); SELECT REGEXP_SUBSTR('µå周çб周周', '周+', 3, 2); SELECT REGEXP_SUBSTR('周 周周 周周周 周周周周', '周+', 2, 3); -SELECT REGEXP_SUBSTR(NULL, ''); SELECT REGEXP_SUBSTR('周 周周', '周+', 5); +SELECT REGEXP_SUBSTR(NULL, ''); +SELECT REGEXP_SUBSTR('abc def ghi', NULL); +SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+', NULL); +SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+', 1, NULL); +SELECT REGEXP_SUBSTR('abc def ghi', '[a-z]+', 1, 2, NULL); +SELECT '======'; +DROP TABLE IF EXISTS t1; +CREATE TABLE t1(s String NULL, pat String NULL, pos Int64 NULL, occu Int64 NULL, mt String NULL) Engine = Memory; +INSERT INTO t1 (s, pat, pos, occu, mt) VALUES (NULL, '[a-z]+', 1, 1, ''), ('abc def ghi', NULL, 1, 1, 'c'), ('abc def ghi', '[a-z]+', NULL, 1, 'c'), ('abc def ghi', '[a-z]+', 1, NULL, 'c'), ('abc def ghi', '[a-z]+', 1, 1, NULL), ('abc def ghi', '[a-z]+', 1, 1, 'c'); +SELECT s FROM t1 WHERE REGEXP_SUBSTR(s, pat, 
pos, occu, mt) = 'abc'; diff --git a/tests/suites/0_stateless/02_function/02_0054_function_string_regexp_replace.result b/tests/suites/0_stateless/02_function/02_0054_function_string_regexp_replace.result new file mode 100644 index 000000000000..b61686bdbd80 --- /dev/null +++ b/tests/suites/0_stateless/02_function/02_0054_function_string_regexp_replace.result @@ -0,0 +1,18 @@ +abc +ab c +a b c +a b c +abc def X +abc def ghi +abc def X +🍣🍣X +µå周çб唐 +NULL +NULL +NULL +NULL +NULL +NULL +====== +abc def ghi +====== diff --git a/tests/suites/0_stateless/02_function/02_0054_function_string_regexp_replace.sql b/tests/suites/0_stateless/02_function/02_0054_function_string_regexp_replace.sql new file mode 100644 index 000000000000..c960ad010ad6 --- /dev/null +++ b/tests/suites/0_stateless/02_function/02_0054_function_string_regexp_replace.sql @@ -0,0 +1,25 @@ +SELECT REGEXP_REPLACE('a b c', '( ){1,}', ''); +SELECT REGEXP_REPLACE('a b c', '( ){1,}', '', 1, 1); +SELECT REGEXP_REPLACE('a b c', 'x', '', 1, 1); +SELECT REGEXP_REPLACE('a b c', '( ){1,}', '', 6, 1); +SELECT REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 3); +SELECT REGEXP_REPLACE('abc def ghi', '[a-z]+', 'X', 1, 4); +SELECT REGEXP_REPLACE('abc def GHI', '[a-z]+', 'X', 1, 3, 'i'); +SELECT REGEXP_REPLACE('🍣🍣b', 'b', 'X'); +SELECT REGEXP_REPLACE('µå周çб周周', '周+', '唐', 1, 2); +SELECT REGEXP_REPLACE(NULL, 'b', 'X'); +SELECT REGEXP_REPLACE('a b c', NULL, 'X'); +SELECT REGEXP_REPLACE('a b c', 'b', NULL); +SELECT REGEXP_REPLACE('a b c', 'b', 'X', NULL); +SELECT REGEXP_REPLACE('a b c', 'b', 'X', 1, NULL); +SELECT REGEXP_REPLACE('a b c', 'b', 'X', 1, 2, NULL); +SELECT '======'; +DROP TABLE IF EXISTS t1; +CREATE TABLE t1(s String NULL, pat String NULL, repl String NULL, pos Int64 NULL, occu Int64 NULL, mt String NULL) Engine = Memory; +INSERT INTO t1 (s, pat, repl, pos, occu, mt) VALUES (NULL, 'dog', '[a-z]+', 1, 1, ''), ('abc def ghi', NULL, 'X', 1, 1, 'c'), ('abc def ghi', '[a-z]+', NULL, 1, 1, 'c'), ('abc def ghi', 
'[a-z]+', 'X', NULL, 1, 'c'), ('abc def ghi', '[a-z]+', 'X', 1, NULL, 'c'), ('abc def ghi', '[a-z]+', 'X', 1, 1, NULL), ('abc def ghi', '[a-z]+', 'X', 1, 1, 'c'); +SELECT s FROM t1 WHERE REGEXP_REPLACE(s, pat, repl, pos, occu, mt) = 'X def ghi'; +DROP TABLE t1; +SELECT '======'; +SELECT REGEXP_REPLACE('a b c', 'b', 'X', 0); -- {ErrorCode 1006} +SELECT REGEXP_REPLACE('a b c', 'b', 'X', 1, -1); -- {ErrorCode 1006} +SELECT REGEXP_REPLACE('a b c', 'b', 'X', 1, 0, '-i'); -- {ErrorCode 1006}