Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ruff] Implement unnecessary-regular-expression (RUF055) #14659

Merged
merged 35 commits into from
Nov 28, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d9d365d
add unnecessary regular expression rule
ntBre Nov 27, 2024
f59c38d
add test case
ntBre Nov 27, 2024
2538401
renumber to 055
ntBre Nov 27, 2024
c2d1256
only applies to `re` for now
ntBre Nov 27, 2024
e05d91c
match on nargs too instead of guard
ntBre Nov 27, 2024
1c871e3
lint
ntBre Nov 27, 2024
fa0651f
generate schema
ntBre Nov 27, 2024
6652fe8
restrict the context for suggestions
ntBre Nov 27, 2024
d56c079
update tests
ntBre Nov 27, 2024
9025495
use in_boolean_test
ntBre Nov 28, 2024
39391f4
update summary with more constraints
ntBre Nov 28, 2024
916fd73
generate Exprs instead of using format
ntBre Nov 28, 2024
ef1da87
rename to avoid double negation
ntBre Nov 28, 2024
fa16e9e
use locate_arg
ntBre Nov 28, 2024
ce3fb1a
check that additional arguments prevent the rule
ntBre Nov 28, 2024
0c10156
remove extra newline
ntBre Nov 28, 2024
bc04c1f
use str::contains instead of chars::any
ntBre Nov 28, 2024
57d8bc1
as_ref to &
ntBre Nov 28, 2024
c170945
retrieve pat from re_func now
ntBre Nov 28, 2024
265a4d0
inline locate_arg
ntBre Nov 28, 2024
e66bced
fmt
ntBre Nov 28, 2024
ec3346f
add brief summary and move details to the end
ntBre Nov 28, 2024
046816a
from_call_expr only needs SemanticModel, not Checker
ntBre Nov 28, 2024
6d8e9b1
move unnecessary_regular_expression to the top
ntBre Nov 28, 2024
421b30a
check safety based on comments, add test case, add docs
ntBre Nov 28, 2024
27773b8
test all metacharacters and raw strings
ntBre Nov 28, 2024
a3f7cd4
update snapshot, unsafe fix moved down in the file
ntBre Nov 28, 2024
31887d5
use with_fix
ntBre Nov 28, 2024
742a936
inline nargs
ntBre Nov 28, 2024
2639ac2
add reference
ntBre Nov 28, 2024
1095362
use Fix::applicable_edit
ntBre Nov 28, 2024
70e1150
improve docs
ntBre Nov 28, 2024
11a7f40
tidy applicable_edit
ntBre Nov 28, 2024
1ad19a8
remove }, ], and ) from metacharacters
ntBre Nov 28, 2024
cc5f75e
extra s
ntBre Nov 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions crates/ruff_linter/resources/test/fixtures/ruff/RUF055.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import re

s = "str"

# this should be replaced with s.replace("abc", "")
re.sub("abc", "", s)


# this example, adapted from https://docs.python.org/3/library/re.html#re.sub,
# should *not* be replaced because repl is a function, not a string
def dashrepl(matchobj):
if matchobj.group(0) == "-":
return " "
else:
return "-"


re.sub("-", dashrepl, "pro----gram-files")

# this one should be replaced with s.startswith("abc") because the Match is
# used in an if context for its truth value
if re.match("abc", s):
pass
if m := re.match("abc", s): # this should *not* be replaced
pass
re.match("abc", s) # this should not be replaced because match returns a Match

# this should be replaced with "abc" in s
if re.search("abc", s):
pass
re.search("abc", s) # this should not be replaced

# this should be replaced with "abc" == s
if re.fullmatch("abc", s):
pass
re.fullmatch("abc", s) # this should not be replaced

# this should be replaced with s.split("abc")
re.split("abc", s)

# these currently should not be modified because the patterns contain regex
# metacharacters
re.sub("ab[c]", "", s)
re.match("ab[c]", s)
re.search("ab[c]", s)
re.fullmatch("ab[c]", s)
re.split("ab[c]", s)
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved

# and these should not be modified because they have extra arguments
re.sub("abc", "", s, flags=re.A)
re.match("abc", s, flags=re.I)
re.search("abc", s, flags=re.L)
re.fullmatch("abc", s, flags=re.M)
re.split("abc", s, maxsplit=2)
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved
3 changes: 3 additions & 0 deletions crates/ruff_linter/src/checkers/ast/analyze/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,9 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
if checker.enabled(Rule::AirflowDagNoScheduleArgument) {
airflow::rules::dag_no_schedule_argument(checker, expr);
}
if checker.enabled(Rule::UnnecessaryRegularExpression) {
ruff::rules::unnecessary_regular_expression(checker, call);
}
}
Expr::Dict(dict) => {
if checker.any_enabled(&[
Expand Down
1 change: 1 addition & 0 deletions crates/ruff_linter/src/codes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
(Ruff, "040") => (RuleGroup::Preview, rules::ruff::rules::InvalidAssertMessageLiteralArgument),
(Ruff, "041") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryNestedLiteral),
(Ruff, "048") => (RuleGroup::Preview, rules::ruff::rules::MapIntVersionParsing),
(Ruff, "055") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryRegularExpression),
(Ruff, "100") => (RuleGroup::Stable, rules::ruff::rules::UnusedNOQA),
(Ruff, "101") => (RuleGroup::Stable, rules::ruff::rules::RedirectedNOQA),

Expand Down
1 change: 1 addition & 0 deletions crates/ruff_linter/src/rules/ruff/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,7 @@ mod tests {
#[test_case(Rule::MapIntVersionParsing, Path::new("RUF048_1.py"))]
#[test_case(Rule::UnrawRePattern, Path::new("RUF039.py"))]
#[test_case(Rule::UnrawRePattern, Path::new("RUF039_concat.py"))]
#[test_case(Rule::UnnecessaryRegularExpression, Path::new("RUF055.py"))]
fn preview_rules(rule_code: Rule, path: &Path) -> Result<()> {
let snapshot = format!(
"preview__{}_{}",
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_linter/src/rules/ruff/rules/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ pub(crate) use test_rules::*;
pub(crate) use unnecessary_iterable_allocation_for_first_element::*;
pub(crate) use unnecessary_key_check::*;
pub(crate) use unnecessary_nested_literal::*;
pub(crate) use unnecessary_regular_expression::*;
pub(crate) use unraw_re_pattern::*;
pub(crate) use unsafe_markup_use::*;
pub(crate) use unused_async::*;
Expand Down Expand Up @@ -79,6 +80,7 @@ pub(crate) mod test_rules;
mod unnecessary_iterable_allocation_for_first_element;
mod unnecessary_key_check;
mod unnecessary_nested_literal;
mod unnecessary_regular_expression;
mod unraw_re_pattern;
mod unsafe_markup_use;
mod unused_async;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, ViolationMetadata};
use ruff_python_ast::{
Arguments, CmpOp, Expr, ExprAttribute, ExprCall, ExprCompare, ExprContext, Identifier,
};
use ruff_python_semantic::Modules;
use ruff_text_size::TextRange;

use crate::checkers::ast::Checker;

/// ## What it does
///
/// Reports the following `re` calls when their first arguments are plain string
/// literals, and no additional flags are passed:
///
/// - `sub`
/// - `match`
/// - `search`
/// - `fullmatch`
/// - `split`
///
/// For `sub`, the `repl` (replacement) argument must also be a string literal,
/// not a function. For `match`, `search`, and `fullmatch`, the return value
/// must also be used only for its truth value.
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved
///
/// ## Why is this bad?
///
/// Performing checks on strings directly can make the code simpler, may require
/// less escaping, and will often be faster.
///
/// ## Example
///
/// ```python
/// re.sub("abc", "", s)
/// ```
///
/// Use instead:
///
/// ```python
/// s.replace("abc", "")
/// ```
#[derive(ViolationMetadata)]
pub(crate) struct UnnecessaryRegularExpression {
replacement: String,
}

impl AlwaysFixableViolation for UnnecessaryRegularExpression {
#[derive_message_formats]
fn message(&self) -> String {
"Plain string pattern passed to `re` function".to_string()
}

fn fix_title(&self) -> String {
format!("Replace with `{}`", self.replacement)
}
}

/// The `re` functions supported by this rule.
#[derive(Debug)]
enum ReFuncKind<'a> {
Sub { repl: &'a Expr },
Match,
Search,
Fullmatch,
Split,
}

#[derive(Debug)]
struct ReFunc<'a> {
kind: ReFuncKind<'a>,
pattern: &'a Expr,
string: &'a Expr,
}

impl<'a> ReFunc<'a> {
fn from_call_expr(
checker: &'a mut Checker,
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved
call: &'a ExprCall,
func_name: &str,
) -> Option<Self> {
let nargs = call.arguments.len();
let locate_arg = |name, position| call.arguments.find_argument(name, position);
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved

// the proposed fixes for match, search, and fullmatch rely on the
// return value only being used for its truth value
let in_if_context = checker.semantic().in_boolean_test();

match (func_name, nargs) {
// `split` is the safest of these to fix, as long as metacharacters
// have already been filtered out from the `pattern`
("split", 2) => Some(ReFunc {
kind: ReFuncKind::Split,
pattern: locate_arg("pattern", 0)?,
string: locate_arg("string", 1)?,
}),
// `sub` is only safe to fix if `repl` is a string. `re.sub` also
// allows it to be a function, which will *not* work in the str
// version
("sub", 3) => {
let repl = locate_arg("repl", 1)?;
if !repl.is_string_literal_expr() {
return None;
}
Some(ReFunc {
kind: ReFuncKind::Sub { repl },
pattern: locate_arg("pattern", 0)?,
string: locate_arg("string", 2)?,
})
}
("match", 2) if in_if_context => Some(ReFunc {
kind: ReFuncKind::Match,
pattern: locate_arg("pattern", 0)?,
string: locate_arg("string", 1)?,
}),
("search", 2) if in_if_context => Some(ReFunc {
kind: ReFuncKind::Search,
pattern: locate_arg("pattern", 0)?,
string: locate_arg("string", 1)?,
}),
("fullmatch", 2) if in_if_context => Some(ReFunc {
kind: ReFuncKind::Fullmatch,
pattern: locate_arg("pattern", 0)?,
string: locate_arg("string", 1)?,
}),
_ => None,
}
}

fn replacement(&self) -> Expr {
match self.kind {
// string.replace(pattern, repl)
ReFuncKind::Sub { repl } => {
self.method_expr("replace", vec![self.pattern.clone(), repl.clone()])
}
// string.startswith(pattern)
ReFuncKind::Match => self.method_expr("startswith", vec![self.pattern.clone()]),
// pattern in string
ReFuncKind::Search => self.compare_expr(CmpOp::In),
// string == pattern
ReFuncKind::Fullmatch => self.compare_expr(CmpOp::Eq),
// string.split(pattern)
ReFuncKind::Split => self.method_expr("split", vec![self.pattern.clone()]),
}
}

/// Return a new compare expr of the form `self.pattern op self.string`
fn compare_expr(&self, op: CmpOp) -> Expr {
Expr::Compare(ExprCompare {
left: Box::new(self.pattern.clone()),
ops: Box::new([op]),
comparators: Box::new([self.string.clone()]),
range: TextRange::default(),
})
}

/// Return a new method call expression on `self.string` with `args` like
/// `self.string.method(args...)`
fn method_expr(&self, method: &str, args: Vec<Expr>) -> Expr {
let method = Expr::Attribute(ExprAttribute {
value: Box::new(self.string.clone()),
attr: Identifier::new(method, TextRange::default()),
ctx: ExprContext::Load,
range: TextRange::default(),
});
Expr::Call(ExprCall {
func: Box::new(method),
arguments: Arguments {
args: args.into_boxed_slice(),
keywords: Box::new([]),
range: TextRange::default(),
},
range: TextRange::default(),
})
}
}

/// RUF055
pub(crate) fn unnecessary_regular_expression(checker: &mut Checker, call: &ExprCall) {
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved
// adapted from unraw_re_pattern
let semantic = checker.semantic();

if !semantic.seen_module(Modules::RE) {
return;
}

let Some(qualified_name) = semantic.resolve_qualified_name(call.func.as_ref()) else {
ntBre marked this conversation as resolved.
Show resolved Hide resolved
return;
};

let ["re", func] = qualified_name.segments() else {
return;
};

// skip calls with more than `pattern` and `string` arguments (and `repl`
// for `sub`)
let Some(re_func) = ReFunc::from_call_expr(checker, call, func) else {
return;
};

let Some(pat) = call.arguments.find_argument("pattern", 0) else {
// this should be unreachable given the checks above, so it might be
// safe to unwrap here instead
return;
};
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved

// For now, restrict this rule to string literals
let Some(string_lit) = pat.as_string_literal_expr() else {
return;
};

// For now, reject any regex metacharacters. Compare to the complete list
// from https://docs.python.org/3/howto/regex.html#matching-characters
let has_metacharacters = string_lit.value.chars().any(|c| {
matches!(
c,
'.' | '^' | '$' | '*' | '+' | '?' | '{' | '}' | '[' | ']' | '\\' | '|' | '(' | ')'
)
});
ntBre marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like how you intentionally excluded meta-characters. So consider this an extension, and I think it's totally fine to do this as a follow-up pr (or not at all).

It would be nice if the rule only skips replacement for characters that are different between regex expressions and regular strings. For example, \n matches \n in a regex and a string.


if has_metacharacters {
return;
}

// Here we know the pattern is a string literal with no metacharacters, so
// we can proceed with the str method replacement
let new_expr = re_func.replacement();

let repl = checker.generator().expr(&new_expr);
let mut diagnostic = Diagnostic::new(
UnnecessaryRegularExpression {
replacement: repl.clone(),
},
call.range,
);

diagnostic.set_fix(Fix::safe_edit(Edit::replacement(
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved
repl,
call.range.start(),
call.range.end(),
)));
AlexWaygood marked this conversation as resolved.
Show resolved Hide resolved

checker.diagnostics.push(diagnostic);
}
Loading