Skip to content

Commit

Permalink
Merge pull request #1 from jiacai2050/regex_support
Browse files Browse the repository at this point in the history
Regex support
  • Loading branch information
tanruixiang authored May 30, 2023
2 parents c768c9b + f0c4ec4 commit 6f3e865
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 10 deletions.
6 changes: 5 additions & 1 deletion datafusion-examples/examples/rewrite_expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ use std::sync::Arc;
pub fn main() -> Result<()> {
// produce a logical plan using the datafusion-sql crate
let dialect = PostgreSqlDialect {};
let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32";
// let sql = "SELECT * FROM person WHERE age BETWEEN 21 AND 32";
let sql = "select * from person where name ~ '1|2';";
let statements = Parser::parse_sql(&dialect, sql)?;

// produce a logical plan using the datafusion-sql crate
Expand Down Expand Up @@ -136,8 +137,10 @@ impl OptimizerRule for MyOptimizerRule {
) -> Result<Option<LogicalPlan>> {
// recurse down and optimize children first
let optimized_plan = utils::optimize_children(self, plan, config)?;
dbg!(&optimized_plan);
match optimized_plan {
Some(LogicalPlan::Filter(filter)) => {
dbg!(&filter.predicate);
let predicate = my_rewrite(filter.predicate.clone())?;
Ok(Some(LogicalPlan::Filter(Filter::try_new(
predicate,
Expand All @@ -147,6 +150,7 @@ impl OptimizerRule for MyOptimizerRule {
Some(optimized_plan) => Ok(Some(optimized_plan)),
None => match plan {
LogicalPlan::Filter(filter) => {
println!("{:?}", filter.predicate);
let predicate = my_rewrite(filter.predicate.clone())?;
Ok(Some(LogicalPlan::Filter(Filter::try_new(
predicate,
Expand Down
10 changes: 10 additions & 0 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2448,6 +2448,16 @@ mod tests {
regex_not_match(col("c1"), lit("^foo$")),
col("c1").not_eq(lit("foo")),
);
assert_change(
regex_match(col("c1"), lit("^(foo|bar)$")),
col("c1").eq(lit("bar")).or(col("c1").eq(lit("foo"))),
);
assert_change(
regex_not_match(col("c1"), lit("^(foo|bar)$")),
col("c1")
.not_eq(lit("bar"))
.and(col("c1").not_eq(lit("foo"))),
);
assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
assert_no_change(regex_match(col("c1"), lit("^")));
Expand Down
79 changes: 72 additions & 7 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use datafusion_common::{DataFusionError, Result, ScalarValue};
use datafusion_expr::{lit, BinaryExpr, Expr, Like, Operator};
use regex_syntax::hir::{Hir, HirKind, Literal, Look};
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};

/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
Expand All @@ -33,7 +33,6 @@ pub fn simplify_regex_expr(
match regex_syntax::Parser::new().parse(pattern) {
Ok(hir) => {
let kind = hir.kind();

if let HirKind::Alternation(alts) = kind {
if alts.len() <= MAX_REGEX_ALTERNATIONS_EXPANSION {
if let Some(expr) = lower_alt(&mode, &left, alts) {
Expand Down Expand Up @@ -166,6 +165,40 @@ fn is_anchored_literal(v: &[Hir]) -> bool {
.all(|h| matches!(h.kind(), HirKind::Literal(_)))
}

/// Returns true if the elements in a `Concat` pattern are exactly:
/// - `[Look::Start, Capture(Alternation(Literals...)), Look::End]`
///
/// Every other shape returns false, including a three-element anchored
/// concat whose middle element is not a capture group, or is a capture group
/// that does not wrap an alternation. Returning false for those shapes lets
/// callers that use this function as a match guard fall through to their
/// remaining arms instead of claiming a pattern they cannot lower.
fn is_anchored_capture(v: &[Hir]) -> bool {
    if v.len() != 3 {
        return false;
    }

    // The pattern must be anchored on both sides: `^ ... $`.
    let is_anchored = matches!(v[0].kind(), HirKind::Look(Look::Start))
        && matches!(v[2].kind(), HirKind::Look(Look::End));
    if !is_anchored {
        return false;
    }

    // The middle element must be a capture group wrapping an alternation
    // whose branches are all plain literals.
    match v[1].kind() {
        HirKind::Capture(Capture { sub, .. }) => match sub.kind() {
            HirKind::Alternation(alters) => alters
                .iter()
                .all(|alt| matches!(alt.kind(), HirKind::Literal(_))),
            _ => false,
        },
        _ => false,
    }
}

/// extracts a string literal expression assuming that [`is_anchored_literal`]
/// returned true.
fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
Expand All @@ -179,6 +212,36 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
}
}

/// Extracts one literal expression per alternation branch from a pattern of
/// the form `[_, Capture(Alternation(Literals...)), _]`.
///
/// Returns `None` when the slice does not have exactly three elements, when
/// the middle element is not a capture group wrapping an alternation, or when
/// any branch is not a literal that `str_from_literal` can convert.
fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
    if v.len() != 3 {
        return None;
    }

    match v[1].kind() {
        HirKind::Capture(Capture { sub, .. }) => match sub.kind() {
            // Collecting into `Option<Vec<_>>` stops at the first branch
            // that cannot be converted and yields `None` for the whole list.
            HirKind::Alternation(branches) => branches
                .iter()
                .map(|branch| match branch.kind() {
                    HirKind::Literal(l) => str_from_literal(l).map(lit),
                    _ => None,
                })
                .collect(),
            _ => None,
        },
        _ => None,
    }
}

fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
match hir.kind() {
HirKind::Empty => {
Expand All @@ -189,10 +252,13 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
}
HirKind::Concat(inner) if is_anchored_literal(inner) => {
let right = anchored_literal_to_expr(inner)?;
return Some(
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right)),
);
return anchored_literal_to_expr(inner).map(|right| {
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right))
});
}
HirKind::Concat(inner) if is_anchored_capture(inner) => {
return anchored_alternation_to_exprs(inner)
.map(|right| left.clone().in_list(right, mode.not));
}
HirKind::Concat(inner) => {
if let Some(pattern) = collect_concat_to_like_string(inner) {
Expand All @@ -201,7 +267,6 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
}
_ => {}
}

None
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ impl OptimizerRule for SimplifyExpressions {
plan: &LogicalPlan,
config: &dyn OptimizerConfig,
) -> Result<Option<LogicalPlan>> {
println!("try_optimize: {:?}",plan);
let mut execution_props = ExecutionProps::new();
execution_props.query_execution_start_time = config.query_execution_start_time();
Ok(Some(Self::optimize_internal(plan, &execution_props)?))
Expand All @@ -63,6 +64,7 @@ impl SimplifyExpressions {
plan: &LogicalPlan,
execution_props: &ExecutionProps,
) -> Result<LogicalPlan> {
println!("now plan: {:?}", plan);
let schema = if !plan.inputs().is_empty() {
DFSchemaRef::new(merge_schema(plan.inputs()))
} else if let LogicalPlan::TableScan(_) = plan {
Expand Down Expand Up @@ -91,8 +93,9 @@ impl SimplifyExpressions {
let name = &e.display_name();

// Apply the actual simplification logic
dbg!(&e);
let new_e = simplifier.simplify(e)?;

dbg!(&new_e);
let new_name = &new_e.display_name();

if let (Ok(expr_name), Ok(new_expr_name)) = (name, new_name) {
Expand Down Expand Up @@ -193,7 +196,7 @@ mod tests {
.project(vec![col("a")])?
.filter(and(col("b").gt(lit(1)), col("b").gt(lit(1))))?
.build()?;

println!("{:?}", plan);
assert_optimized_plan_eq(
&plan,
"\
Expand Down

0 comments on commit 6f3e865

Please sign in to comment.