Skip to content

Commit

Permalink
improve: improve performance of scalar_regex_match
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuliquan committed Dec 11, 2024
1 parent 7014ac3 commit ef66c56
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 47 deletions.
22 changes: 19 additions & 3 deletions datafusion/physical-expr/benches/scalar_regex_match.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,18 @@ fn make_record_batch(
batch_iter: usize,
batch_size: usize,
string_len: usize,
matched_str: &[&str],
schema: &Schema,
) -> Vec<RecordBatch> {
let mut rng = StdRng::from_seed([123; 32]);
let mut rng = StdRng::seed_from_u64(12345);
let mut batches = vec![];
for _ in 0..batch_iter {
let array = (0..batch_size)
let mut array = (0..batch_size)
.map(|_| Some(Alphanumeric.sample_string(&mut rng, string_len)))
.collect::<Vec<_>>();
for v in matched_str {
array.push(Some(v.to_string()));
}
let array = StringArray::from(array);
let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)])
.unwrap();
Expand Down Expand Up @@ -74,7 +78,19 @@ fn init_benchmark() -> (
(
128_usize,
4096_usize,
make_record_batch(128, 4096, 100, &schema),
make_record_batch(
128,
4096,
100,
&[
"example@email.com",
"http://example.com",
"123.4.5.6",
"1236787788",
"55555",
],
&schema,
),
),
];

Expand Down
58 changes: 14 additions & 44 deletions datafusion/physical-expr/src/expressions/scalar_regex_match.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
// under the License.

use super::Literal;
use arrow::array::ArrayData;
use arrow_array::{
Array, ArrayAccessor, BooleanArray, LargeStringArray, StringArray, StringViewArray,
};
use arrow_buffer::BooleanBufferBuilder;
use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder};
use arrow_schema::{DataType, Schema};
use datafusion_common::{Result as DFResult, ScalarValue};
use datafusion_expr::ColumnarValue;
Expand Down Expand Up @@ -144,6 +143,8 @@ impl ScalarRegexMatchExpr {
&self,
array: &Arc<dyn Array>,
) -> datafusion_common::Result<ColumnarValue> {
/// downcast_string_array downcast a [`ArrayRef`] to specific array type
/// example: [`StringArray`], [`LargeStringArray`], [`StringViewArray`]
macro_rules! downcast_string_array {
($ARRAY:expr, $ARRAY_TYPE:ident, $ERR_MSG:expr) => {
&($ARRAY
Expand All @@ -157,7 +158,7 @@ impl ScalarRegexMatchExpr {
Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)))
},
DataType::Utf8 => array_regexp_match(
downcast_string_array!(array, StringArray, "Failed to downcast StringArray"),
downcast_string_array!(array, StringArray, "Failed to downcast StringArray"),
self.compiled.as_ref().unwrap(),
self.negated,
),
Expand Down Expand Up @@ -285,26 +286,6 @@ impl PhysicalExpr for ScalarRegexMatchExpr {
Arc::clone(&children[1]),
)))
}

fn evaluate_selection(
&self,
batch: &arrow_array::RecordBatch,
selection: &BooleanArray,
) -> DFResult<ColumnarValue> {
let tmp_batch = arrow::compute::filter_record_batch(batch, selection)?;

let tmp_result = self.evaluate(&tmp_batch)?;

if batch.num_rows() == tmp_batch.num_rows() {
// All values from the `selection` filter are true.
Ok(tmp_result)
} else if let ColumnarValue::Array(a) = tmp_result {
datafusion_physical_expr_common::utils::scatter(selection, a.as_ref())
.map(ColumnarValue::Array)
} else {
Ok(tmp_result)
}
}
}

/// It is used for scalar regexp matching and copy from arrow-rs
Expand All @@ -313,31 +294,20 @@ fn array_regexp_match(
regex: &Regex,
negated: bool,
) -> DFResult<ColumnarValue> {
let null_bit_buffer = array.nulls().map(|x| x.inner().sliced());
let mut buffer_builder = BooleanBufferBuilder::new(array.len());

if regex.as_str().is_empty() {
buffer_builder.append_n(array.len(), true);
let null_buffer = array.logical_nulls();
let bool_buffer = if regex.as_str().is_empty() {
BooleanBuffer::new_set(array.len())
} else {
let mut bool_buffer_builder = BooleanBufferBuilder::new(array.len());
bool_buffer_builder.advance(array.len());
for i in 0..array.len() {
let value = array.value(i);
buffer_builder.append(regex.is_match(value));
let value = unsafe { array.value_unchecked(i) };
bool_buffer_builder.set_bit(i, regex.is_match(value));
}
}

let buffer = buffer_builder.into();
let bool_array = BooleanArray::from(unsafe {
ArrayData::new_unchecked(
DataType::Boolean,
array.len(),
None,
null_bit_buffer,
0,
vec![buffer],
vec![],
)
});
bool_buffer_builder.finish()
};

let bool_array = BooleanArray::new(bool_buffer, null_buffer);
let bool_array = if negated {
arrow::compute::kernels::boolean::not(&bool_array)
} else {
Expand Down

0 comments on commit ef66c56

Please sign in to comment.