From d90e73ac6c8c028bdb51ac0386d08fe980bcc97d Mon Sep 17 00:00:00 2001 From: nange Date: Thu, 24 Feb 2022 21:02:02 +0800 Subject: [PATCH 1/2] support regexp_instr function --- common/functions/src/scalars/strings/mod.rs | 2 + .../src/scalars/strings/regexp_instr.rs | 261 ++++++++++++++++++ .../src/scalars/strings/regexp_like.rs | 165 ++++++----- .../functions/src/scalars/strings/string.rs | 2 + .../functions/tests/it/scalars/strings/mod.rs | 1 + .../tests/it/scalars/strings/regexp_instr.rs | 172 ++++++++++++ .../tests/it/scalars/strings/regexp_like.rs | 8 +- ...2_0049_function_string_regexp_instr.result | 9 + .../02_0049_function_string_regexp_instr.sql | 13 + 9 files changed, 554 insertions(+), 79 deletions(-) create mode 100644 common/functions/src/scalars/strings/regexp_instr.rs create mode 100644 common/functions/tests/it/scalars/strings/regexp_instr.rs create mode 100644 tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.result create mode 100644 tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.sql diff --git a/common/functions/src/scalars/strings/mod.rs b/common/functions/src/scalars/strings/mod.rs index a30a4e955879..680b56517f9d 100644 --- a/common/functions/src/scalars/strings/mod.rs +++ b/common/functions/src/scalars/strings/mod.rs @@ -36,6 +36,7 @@ mod octet_length; mod ord; mod pad; mod quote; +mod regexp_instr; mod regexp_like; mod repeat; mod replace; @@ -80,6 +81,7 @@ pub use ord::OrdFunction; pub use pad::LeftPadFunction; pub use pad::RightPadFunction; pub use quote::QuoteFunction; +pub use regexp_instr::RegexpInStrFunction; pub use regexp_like::RegexpLikeFunction; pub use repeat::RepeatFunction; pub use replace::ReplaceFunction; diff --git a/common/functions/src/scalars/strings/regexp_instr.rs b/common/functions/src/scalars/strings/regexp_instr.rs new file mode 100644 index 000000000000..8b01fd18b7ec --- /dev/null +++ b/common/functions/src/scalars/strings/regexp_instr.rs @@ -0,0 +1,261 @@ +// 
Copyright 2022 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt; +use std::sync::Arc; + +use bstr::ByteSlice; +use common_datavalues::prelude::*; +use common_exception::ErrorCode; +use common_exception::Result; +use itertools::izip; +use regex::bytes::Regex; + +use crate::scalars::assert_string; +use crate::scalars::cast_column_field; +use crate::scalars::strings::regexp_like::build_regexp_from_pattern; +use crate::scalars::Function; +use crate::scalars::FunctionDescription; +use crate::scalars::FunctionFeatures; + +#[derive(Clone)] +pub struct RegexpInStrFunction { + display_name: String, +} + +impl RegexpInStrFunction { + pub fn try_create(display_name: &str) -> Result> { + Ok(Box::new(Self { + display_name: display_name.to_string(), + })) + } + + pub fn desc() -> FunctionDescription { + FunctionDescription::creator(Box::new(Self::try_create)).features( + FunctionFeatures::default() + .deterministic() + .variadic_arguments(2, 6), + ) + } +} + +impl Function for RegexpInStrFunction { + fn name(&self) -> &str { + &self.display_name + } + + fn return_type(&self, args: &[&DataTypePtr]) -> Result { + for (i, arg) in args.iter().enumerate() { + if i < 2 || i == 5 { + assert_string(*arg)?; + } else if !arg.data_type_id().is_integer() + && !arg.data_type_id().is_string() + && !arg.data_type_id().is_null() + { + return Err(ErrorCode::IllegalDataType(format!( + "Expected integer or string or null, but 
got {}", + args[i].data_type_id() + ))); + } + } + + Ok(u64::to_data_type()) + } + // Notes: https://dev.mysql.com/doc/refman/8.0/en/regexp.html#function_regexp-instr + fn eval(&self, columns: &ColumnsWithField, input_rows: usize) -> Result { + let mut pos = ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc(); + let mut occurrence = ConstColumn::new(Series::from_data(vec![1_i64]), input_rows).arc(); + let mut return_option = ConstColumn::new(Series::from_data(vec![0_i64]), input_rows).arc(); + let mut match_type = ConstColumn::new(Series::from_data(vec![""]), input_rows).arc(); + + for i in 2..columns.len() { + match i { + 2 => pos = cast_column_field(&columns[2], &Int64Type::arc())?, + 3 => occurrence = cast_column_field(&columns[3], &Int64Type::arc())?, + 4 => return_option = cast_column_field(&columns[4], &Int64Type::arc())?, + _ => match_type = cast_column_field(&columns[5], &StringType::arc())?, + } + } + + let pat = columns[1].column(); + + if pat.is_const() && match_type.is_const() { + let pat_value = pat.get_string(0)?; + let mt_value = match_type.get_string(0)?; + + return Ok(Arc::new(self.a_regexp_instr_binary_scalar( + columns[0].column(), + &pat_value, + &pos, + &occurrence, + &return_option, + &mt_value, + )?)); + } + + Ok(Arc::new(self.a_regexp_instr_binary( + columns[0].column(), + pat, + &pos, + &occurrence, + &return_option, + &match_type, + )?)) + } +} + +impl RegexpInStrFunction { + fn a_regexp_instr_binary_scalar( + &self, + source: &ColumnRef, + pat: &[u8], + pos: &ColumnRef, + occurrence: &ColumnRef, + return_option: &ColumnRef, + mt: &[u8], + ) -> Result { + let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(source.len()); + + let source = Vu8::try_create_viewer(source)?; + let pos = i64::try_create_viewer(pos)?; + let occur = i64::try_create_viewer(occurrence)?; + let ro = i64::try_create_viewer(return_option)?; + + let re = build_regexp_from_pattern(self.name(), pat, Some(mt))?; + + let iter = izip!(source, pos, 
occur, ro); + for (s_value, pos_value, occur_value, ro_value) in iter { + if ro_value != 0 && ro_value != 1 { + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {}: return_option must be 1 or 0, but got {}", + self.name(), + ro_value + ))); + } + if s_value.is_empty() || pat.is_empty() { + builder.append(0); + continue; + } + + let instr = regexp_instr(s_value, &re, pos_value, occur_value, ro_value); + + builder.append(instr); + } + + Ok(builder.build_column()) + } + + fn a_regexp_instr_binary( + &self, + source: &ColumnRef, + pat: &ColumnRef, + pos: &ColumnRef, + occurrence: &ColumnRef, + return_option: &ColumnRef, + match_type: &ColumnRef, + ) -> Result { + let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(source.len()); + + let mut map: HashMap, Regex> = HashMap::new(); + let mut key: Vec = Vec::new(); + + let source = Vu8::try_create_viewer(source)?; + let pat = Vu8::try_create_viewer(pat)?; + let pos = i64::try_create_viewer(pos)?; + let occur = i64::try_create_viewer(occurrence)?; + let ro = i64::try_create_viewer(return_option)?; + let mt = Vu8::try_create_viewer(match_type)?; + + let iter = izip!(source, pat, pos, occur, ro, mt); + for (s_value, pat_value, pos_value, occur_value, ro_value, mt_value) in iter { + if ro_value != 0 && ro_value != 1 { + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {}: return_option must be 1 or 0, but got {}", + self.name(), + ro_value + ))); + } + if mt_value.starts_with_str("-") { + return Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {} match type: {}", + self.name(), + mt_value.to_str_lossy(), + ))); + } + if s_value.is_empty() || pat_value.is_empty() { + builder.append(0); + continue; + } + + key.extend_from_slice(pat_value); + key.extend_from_slice("-".as_bytes()); + key.extend_from_slice(mt_value); + let re = if let Some(re) = map.get(&key) { + re + } else { + let re = build_regexp_from_pattern(self.name(), pat_value, Some(mt_value))?; + 
map.insert(key.clone(), re); + map.get(&key).unwrap() + }; + key.clear(); + + let instr = regexp_instr(s_value, re, pos_value, occur_value, ro_value); + + builder.append(instr); + } + + Ok(builder.build_column()) + } +} +/// Returns the 1-based byte position of the `occur`-th match at or after `pos` (just past it if `ro` is 1), or 0. +#[inline] +fn regexp_instr(s: &[u8], re: &Regex, pos: i64, occur: i64, ro: i64) -> u64 { + let occur = if occur < 1 { 1 } else { occur }; + let mut pos = if pos < 1 { 0 } else { ((pos - 1) as usize).min(s.len()) }; + let mut i = 1_i64; + let m = loop { + let m = re.find_at(s, pos); + if i == occur || m.is_none() { + break m; + } + // `find_at` yields offsets into the whole of `s`, so resume exactly at this match's end. + i += 1; + if let Some(m) = m { + pos = m.end(); + } + if pos >= s.len() { + break None; + } + }; + + let instr = match m { + Some(m) => { + if ro == 0 { + m.start() + 1 + } else { + m.end() + 1 + } + } + None => 0, + }; + + instr as u64 +} + +impl fmt::Display for RegexpInStrFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.display_name) + } +} diff --git a/common/functions/src/scalars/strings/regexp_like.rs b/common/functions/src/scalars/strings/regexp_like.rs index df79a1aa65d6..61cb2e4082c2 100644 --- a/common/functions/src/scalars/strings/regexp_like.rs +++ b/common/functions/src/scalars/strings/regexp_like.rs @@ -73,10 +73,14 @@ impl Function for RegexpLikeFunction { if columns.len() == 3 { if columns[2].column().is_const() { let mt = columns[2].column().get_string(0)?; - return Ok(Arc::new(a_regexp_binary_scalar(lhs, &rhs, Some(&mt))?)); + return Ok(Arc::new(self.a_regexp_binary_scalar( + lhs, + &rhs, + Some(&mt), + )?)); } } else { - return Ok(Arc::new(a_regexp_binary_scalar(lhs, &rhs, None)?)); + return Ok(Arc::new(self.a_regexp_binary_scalar(lhs, &rhs, None)?)); } } @@ -84,7 +88,7 @@ impl Function for RegexpLikeFunction { if columns.len() == 3 { mt = Some(columns[2].column()) } - Ok(Arc::new(a_regexp_binary( + Ok(Arc::new(self.a_regexp_binary( columns[0].column(), columns[1].column(), mt, @@ -98,86 +102,97 @@ impl fmt::Display for RegexpLikeFunction { } } -#[inline] -fn
a_regexp_binary_scalar(lhs: &ColumnRef, rhs: &[u8], mt: Option<&[u8]>) -> Result { - let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(lhs.len()); - - let re = build_regexp_from_pattern(rhs, mt)?; +impl RegexpLikeFunction { + fn a_regexp_binary_scalar( + &self, + lhs: &ColumnRef, + rhs: &[u8], + mt: Option<&[u8]>, + ) -> Result { + let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(lhs.len()); + + let re = build_regexp_from_pattern(self.name(), rhs, mt)?; + + let lhs = Vu8::try_create_viewer(lhs)?; + for lhs_value in lhs.iter() { + builder.append(re.is_match(lhs_value)); + } - let lhs = Vu8::try_create_viewer(lhs)?; - for lhs_value in lhs.iter() { - builder.append(re.is_match(lhs_value)); + Ok(builder.build_column()) } - Ok(builder.build_column()) -} - -#[inline] -fn a_regexp_binary( - lhs: &ColumnRef, - rhs: &ColumnRef, - mt: Option<&ColumnRef>, -) -> Result { - let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(lhs.len()); - - let mut map = HashMap::new(); - let mut key: Vec = Vec::new(); - - let lhs = Vu8::try_create_viewer(lhs)?; - let rhs = Vu8::try_create_viewer(rhs)?; - - if let Some(mt) = mt { - let mt = Vu8::try_create_viewer(mt)?; - let iter = izip!(lhs, rhs, mt); - for (lhs_value, rhs_value, mt_value) in iter { - if mt_value.starts_with_str("-") { - return Err(ErrorCode::BadArguments(format!( - "Incorrect arguments to REGEXP_LIKE match type: {}", - mt_value.to_str_lossy(), - ))); + fn a_regexp_binary( + &self, + lhs: &ColumnRef, + rhs: &ColumnRef, + mt: Option<&ColumnRef>, + ) -> Result { + let mut builder: ColumnBuilder = ColumnBuilder::with_capacity(lhs.len()); + + let mut map = HashMap::new(); + let mut key: Vec = Vec::new(); + + let lhs = Vu8::try_create_viewer(lhs)?; + let rhs = Vu8::try_create_viewer(rhs)?; + + if let Some(mt) = mt { + let mt = Vu8::try_create_viewer(mt)?; + let iter = izip!(lhs, rhs, mt); + for (lhs_value, rhs_value, mt_value) in iter { + if mt_value.starts_with_str("-") { + return 
Err(ErrorCode::BadArguments(format!( + "Incorrect arguments to {} match type: {}", + self.name(), + mt_value.to_str_lossy(), + ))); + } + key.extend_from_slice(rhs_value); + key.extend_from_slice("-".as_bytes()); + key.extend_from_slice(mt_value); + + let pattern = if let Some(pattern) = map.get(&key) { + pattern + } else { + let re = build_regexp_from_pattern(self.name(), rhs_value, Some(mt_value))?; + map.insert(key.clone(), re); + map.get(&key).unwrap() + }; + key.clear(); + + builder.append(pattern.is_match(lhs_value)); + } + } else { + for (lhs_value, rhs_value) in lhs.zip(rhs) { + key.extend_from_slice(rhs_value); + let pattern = if let Some(pattern) = map.get(&key) { + pattern + } else { + let re = build_regexp_from_pattern(self.name(), rhs_value, None)?; + map.insert(key.clone(), re); + map.get(&key).unwrap() + }; + key.clear(); + + builder.append(pattern.is_match(lhs_value)); } - key.extend_from_slice(rhs_value); - key.extend_from_slice("-".as_bytes()); - key.extend_from_slice(mt_value); - - let pattern = if let Some(pattern) = map.get(&key) { - pattern - } else { - let re = build_regexp_from_pattern(rhs_value, Some(mt_value))?; - map.insert(key.clone(), re); - map.get(&key).unwrap() - }; - key.clear(); - - builder.append(pattern.is_match(lhs_value)); } - } else { - for (lhs_value, rhs_value) in lhs.zip(rhs) { - key.extend_from_slice(rhs_value); - let pattern = if let Some(pattern) = map.get(&key) { - pattern - } else { - let re = build_regexp_from_pattern(rhs_value, None)?; - map.insert(key.clone(), re); - map.get(&key).unwrap() - }; - key.clear(); - builder.append(pattern.is_match(lhs_value)); - } + Ok(builder.build_column()) } - - Ok(builder.build_column()) } #[inline] -fn build_regexp_from_pattern(pat: &[u8], mt: Option<&[u8]>) -> Result { +pub fn build_regexp_from_pattern( + fn_name: &str, + pat: &[u8], + mt: Option<&[u8]>, +) -> Result { let pattern = match pat.is_empty() { true => "^$", false => simdutf8::basic::from_utf8(pat).map_err(|e| { 
ErrorCode::BadArguments(format!( - "Unable to convert the REGEXP_LIKE pattern to string: {}", - e + "Unable to convert the {} pattern to string: {}", + fn_name, e )) })?, }; @@ -206,12 +221,12 @@ fn build_regexp_from_pattern(pat: &[u8], mt: Option<&[u8]>) -> Result Err(ErrorCode::BadArguments(format!( - "Unsupported arguments to REGEXP_LIKE match type: {}", - c, + "Unsupported arguments to {} match type: {}", + fn_name, c, ))), _ => Err(ErrorCode::BadArguments(format!( - "Incorrect arguments to REGEXP_LIKE match type: {}", - c, + "Incorrect arguments to {} match type: {}", + fn_name, c, ))), }; if let Err(e) = r { @@ -220,8 +235,8 @@ fn build_regexp_from_pattern(pat: &[u8], mt: Option<&[u8]>) -> Result Result<()> { + let tests = vec![ + ScalarFunctionTest { + name: "regexp-instr-two-column-passed", + columns: vec![ + Series::from_data(vec!["dog cat dog", "aa aaa aaaa aa aaa aaaa", ""]), + Series::from_data(vec!["dog", "a{2}", ""]), + ], + expect: Series::from_data(vec![1_u64, 1, 0]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-instr-three-column-passed", + columns: vec![ + Series::from_data(vec!["dog cat dog", "aa aaa aaaa aa aaa aaaa", ""]), + Series::from_data(vec!["dog", "a{2}", ""]), + Series::from_data(vec![1_i64, 2, 1]), + ], + expect: Series::from_data(vec![1_u64, 4, 0]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-instr-four-column-passed", + columns: vec![ + Series::from_data(vec![ + "dog cat dog", + "aa aaa aaaa aa aaa aaaa", + "aa aaa aaaa aa aaa aaaa", + ]), + Series::from_data(vec!["dog", "a{2}", "a{4}"]), + Series::from_data(vec![1_i64, 2, 1]), + Series::from_data(vec![2_i64, 2, 2]), + ], + expect: Series::from_data(vec![9_u64, 8, 20]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-instr-five-column-passed", + columns: vec![ + Series::from_data(vec![ + "dog cat dog", + "aa aaa aaaa aa aaa aaaa", + "aa aaa aaaa aa aaa aaaa", + ]), + Series::from_data(vec!["dog", "a{2}", "a{4}"]), + Series::from_data(vec![1_i64, 2, 
1]), + Series::from_data(vec![2_i64, 2, 2]), + Series::from_data(vec![0_i64, 1, 1]), + ], + expect: Series::from_data(vec![9_u64, 10, 24]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-instr-six-column-passed", + columns: vec![ + Series::from_data(vec![ + "dog cat dog", + "aa aaa aaaa aa aaa aaaa", + "aa aaa aaaa aa aaa aaaa", + ]), + Series::from_data(vec!["dog", "A{2}", "A{4}"]), + Series::from_data(vec![1_i64, 2, 1]), + Series::from_data(vec![2_i64, 2, 2]), + Series::from_data(vec![0_i64, 1, 1]), + Series::from_data(vec!["i", "c", "i"]), + ], + expect: Series::from_data(vec![9_u64, 0, 24]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-instr-return-option-error", + columns: vec![ + Series::from_data(vec![ + "dog cat dog", + "aa aaa aaaa aa aaa aaaa", + "aa aaa aaaa aa aaa aaaa", + ]), + Series::from_data(vec!["dog", "A{2}", "A{4}"]), + Series::from_data(vec![2_i64, 2, 2]), + Series::from_data(vec![1_i64, 2, 1]), + Series::from_data(vec![0_i64, 2, 1]), + ], + expect: Series::from_data(Vec::::new()), + error: "Incorrect arguments to regexp_instr: return_option must be 1 or 0, but got 2", + }, + ScalarFunctionTest { + name: "regexp-instr-match-type-error", + columns: vec![ + Series::from_data(vec![ + "dog cat dog", + "aa aaa aaaa aa aaa aaaa", + "aa aaa aaaa aa aaa aaaa", + ]), + Series::from_data(vec!["dog", "A{2}", "A{4}"]), + Series::from_data(vec![1_i64, 2, 1]), + Series::from_data(vec![2_i64, 2, 2]), + Series::from_data(vec![0_i64, 1, 1]), + Series::from_data(vec!["i", "c", "-i"]), + ], + expect: Series::from_data(Vec::::new()), + error: "Incorrect arguments to regexp_instr match type: -i", + }, + ]; + + test_scalar_functions( + RegexpInStrFunction::try_create("regexp_instr")?, + &tests, + true, + ) +} + +#[test] +fn test_regexp_instr_constant_column() -> Result<()> { + let data_type = DataValue::String("dog".as_bytes().into()); + let data_value1 = StringType::arc().create_constant_column(&data_type, 3)?; + let data_value2 = 
StringType::arc().create_constant_column(&data_type, 3)?; + + let tests = vec![ + ScalarFunctionTest { + name: "regexp-instr-const-column-passed", + columns: vec![ + Series::from_data(vec!["dog cat dog", "cat dog cat", "cat dog cat"]), + data_value1, + Series::from_data(vec![1_i64, 2, 1]), + Series::from_data(vec![2_i64, 1, 1]), + Series::from_data(vec![0_i64, 0, 1]), + ], + expect: Series::from_data(vec![9_u64, 5, 8]), + error: "", + }, + ScalarFunctionTest { + name: "regexp-instr-const-column-return-option-error", + columns: vec![ + Series::from_data(vec!["dog cat dog", "cat dog cat", "cat dog cat"]), + data_value2, + Series::from_data(vec![1_i64, 2, 1]), + Series::from_data(vec![2_i64, 1, 1]), + Series::from_data(vec![2_i64, 0, 1]), + ], + expect: Series::from_data(Vec::::new()), + error: "Incorrect arguments to regexp_instr: return_option must be 1 or 0, but got 2", + }, + ]; + + test_scalar_functions( + RegexpInStrFunction::try_create("regexp_instr")?, + &tests, + true, + ) +} diff --git a/common/functions/tests/it/scalars/strings/regexp_like.rs b/common/functions/tests/it/scalars/strings/regexp_like.rs index d592db9778e8..1b40a1427abf 100644 --- a/common/functions/tests/it/scalars/strings/regexp_like.rs +++ b/common/functions/tests/it/scalars/strings/regexp_like.rs @@ -58,7 +58,7 @@ fn test_regexp_like_function() -> Result<()> { Series::from_data(vec!["x"]), ], expect: Series::from_data(Vec::::new()), - error: "Incorrect arguments to REGEXP_LIKE match type: x", + error: "Incorrect arguments to regexp_like match type: x", }, ScalarFunctionTest { name: "regexp-like-match-type-error2", @@ -68,7 +68,7 @@ fn test_regexp_like_function() -> Result<()> { Series::from_data(vec!["u"]), ], expect: Series::from_data(Vec::::new()), - error: "Unsupported arguments to REGEXP_LIKE match type: u", + error: "Unsupported arguments to regexp_like match type: u", }, ScalarFunctionTest { name: "regexp-like-nullable-passed", @@ -96,7 +96,7 @@ fn test_regexp_like_match_type_joiner() 
-> Result<()> { Series::from_data(vec!["i", "-i"]), ], expect: Series::from_data(Vec::::new()), - error: "Incorrect arguments to REGEXP_LIKE match type: -i", + error: "Incorrect arguments to regexp_like match type: -i", }, ScalarFunctionTest { name: "regexp-like-match-type-joiner-error-2", @@ -106,7 +106,7 @@ fn test_regexp_like_match_type_joiner() -> Result<()> { Series::from_data(vec!["", "-"]), ], expect: Series::from_data(Vec::::new()), - error: "Incorrect arguments to REGEXP_LIKE match type: -", + error: "Incorrect arguments to regexp_like match type: -", }, ]; diff --git a/tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.result b/tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.result new file mode 100644 index 000000000000..44e46b21b179 --- /dev/null +++ b/tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.result @@ -0,0 +1,9 @@ +1 +9 +9 +12 +12 +0 +NULL +NULL +dog cat dog diff --git a/tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.sql b/tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.sql new file mode 100644 index 000000000000..89a07aaaae3b --- /dev/null +++ b/tests/suites/0_stateless/02_function/02_0049_function_string_regexp_instr.sql @@ -0,0 +1,13 @@ +SELECT REGEXP_INSTR('dog cat dog', 'dog'); +SELECT REGEXP_INSTR('dog cat dog', 'dog', 2); +SELECT REGEXP_INSTR('dog cat dog', 'dog', 1, 2); +SELECT REGEXP_INSTR('dog cat dog', 'dog', 1, 2, 1); +SELECT REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1); +SELECT REGEXP_INSTR('dog cat dog', 'DOG', 1, 2, 1, 'c'); +SELECT REGEXP_INSTR('dog cat dog', NULL); +SELECT REGEXP_INSTR('dog cat dog', 'dog', NULL); +-- +DROP TABLE IF EXISTS t1; +CREATE TABLE t1(s String NULL, pat String NULL, pos Int64 NULL, occu Int64 NULL, ro Int64 NULL, mt String NULL) Engine = Memory; +INSERT INTO t1 (s, pat, pos, occu, ro, mt) VALUES (NULL, 'dog', 1, 1, 1, ''), ('dog cat dog', 'dog', NULL, 1, 1, 'c'), ('dog cat dog', 
'dog', 1, 1, 1, 'c'), ('dog cat dog', 'dog', 1, 1, 1, NULL); +select s from t1 where regexp_instr(s, pat, pos, occu, ro, mt) = 4; From aeddc9723e53da9f71a9c3fc78f9ed86a7bccf1b Mon Sep 17 00:00:00 2001 From: nange Date: Wed, 30 Mar 2022 16:03:45 +0800 Subject: [PATCH 2/2] add docs for regexp_instr function --- .../04-string-functions/regexp_instr.md | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docs/doc/03-reference/02-functions/04-string-functions/regexp_instr.md diff --git a/docs/doc/03-reference/02-functions/04-string-functions/regexp_instr.md b/docs/doc/03-reference/02-functions/04-string-functions/regexp_instr.md new file mode 100644 index 000000000000..53249142f84d --- /dev/null +++ b/docs/doc/03-reference/02-functions/04-string-functions/regexp_instr.md @@ -0,0 +1,58 @@ +--- +title: REGEXP_INSTR +--- + +Returns the starting index of the substring of the string `expr` that matches the regular expression specified by the pattern `pat`, or `0` if there is no match. If `expr` or `pat` is NULL, the return value is NULL. Character indexes begin at `1`. + +## Syntax + +```sql +REGEXP_INSTR(expr, pat[, pos[, occurrence[, return_option[, match_type]]]]) +``` + +## Arguments + +| Arguments | Description | +| ----------- | ----------- | +| expr | The string expression to be matched | +| pat | The regular expression | +| pos | Optional. The position in expr at which to start the search. If omitted, the default is 1. | +| occurrence | Optional. Which occurrence of a match to search for. If omitted, the default is 1. | +| return_option | Optional. Which type of position to return. If this value is 0, REGEXP_INSTR() returns the position of the matched substring's first character. If this value is 1, REGEXP_INSTR() returns the position following the matched substring. If omitted, the default is 0. | +| match_type | Optional. A string that specifies how to perform matching. The meaning is as described for REGEXP_LIKE().
| + +## Return Type + +A number data type value. + +## Examples + +```txt +SELECT REGEXP_INSTR('dog cat dog', 'dog'); ++------------------------------------+ +| REGEXP_INSTR('dog cat dog', 'dog') | ++------------------------------------+ +| 1 | ++------------------------------------+ + +SELECT REGEXP_INSTR('dog cat dog', 'dog', 2); ++---------------------------------------+ +| REGEXP_INSTR('dog cat dog', 'dog', 2) | ++---------------------------------------+ +| 9 | ++---------------------------------------+ + +SELECT REGEXP_INSTR('aa aaa aaaa', 'a{2}'); ++-------------------------------------+ +| REGEXP_INSTR('aa aaa aaaa', 'a{2}') | ++-------------------------------------+ +| 1 | ++-------------------------------------+ + +SELECT REGEXP_INSTR('aa aaa aaaa', 'a{4}'); ++-------------------------------------+ +| REGEXP_INSTR('aa aaa aaaa', 'a{4}') | ++-------------------------------------+ +| 8 | ++-------------------------------------+ +```