Replace lex usages (#11562)
## Summary

This PR replaces most usages of the `lex` function. The remaining references
are in tests, which will be removed at the end because they're all tied up
with `Stylist` and `Indexer`.
dhruvmanila authored May 29, 2024
1 parent 2805e9a commit 99fad61
Showing 4 changed files with 70 additions and 96 deletions.
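
As a rough sketch of the migration pattern (not part of the commit): callers that previously re-lexed the source with `lexer::lex` now parse once and reuse the token stream cached on the parsed program. The helper `count_eq_tokens` below is hypothetical; `parse_module`, `program.tokens()`, `token.kind()`, and `TokenKind` match the APIs used in the diffs that follow.

```rust
use ruff_python_parser::{parse_module, ParseError, TokenKind};

/// Hypothetical helper: count `==` and `!=` tokens by parsing once and
/// walking the cached token stream, rather than calling `lexer::lex` again.
fn count_eq_tokens(source: &str) -> Result<usize, ParseError> {
    let program = parse_module(source)?;
    let mut count = 0;
    for token in program.tokens() {
        if matches!(token.kind(), TokenKind::EqEqual | TokenKind::NotEqual) {
            count += 1;
        }
    }
    Ok(count)
}
```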
12 changes: 7 additions & 5 deletions crates/ruff_dev/src/print_tokens.rs
@@ -7,7 +7,7 @@ use anyhow::Result;

 use ruff_linter::source_kind::SourceKind;
 use ruff_python_ast::PySourceType;
-use ruff_python_parser::{lexer, AsMode};
+use ruff_python_parser::parse_unchecked_source;

 #[derive(clap::Args)]
 pub(crate) struct Args {
@@ -24,11 +24,13 @@ pub(crate) fn main(args: &Args) -> Result<()> {
             args.file.display()
         )
     })?;
-    for (tok, range) in lexer::lex(source_kind.source_code(), source_type.as_mode()).flatten() {
+    let program = parse_unchecked_source(source_kind.source_code(), source_type);
+    for token in program.tokens() {
         println!(
-            "{start:#?} {tok:#?} {end:#?}",
-            start = range.start(),
-            end = range.end()
+            "{start:#?} {kind:#?} {end:#?}",
+            start = token.start(),
+            end = token.end(),
+            kind = token.kind()
         );
     }
     Ok(())
@@ -4,7 +4,7 @@ use ruff_python_ast::{CmpOp, Expr};

 use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_ast::helpers;
-use ruff_python_parser::{lexer, Mode, Tok};
+use ruff_python_parser::{TokenKind, Tokens};
 use ruff_text_size::{Ranged, TextRange, TextSize};

 use crate::checkers::ast::Checker;
@@ -96,7 +96,7 @@ pub(crate) fn invalid_literal_comparison(
         {
             let mut diagnostic = Diagnostic::new(IsLiteral { cmp_op: op.into() }, expr.range());
             if lazy_located.is_none() {
-                lazy_located = Some(locate_cmp_ops(expr, checker.locator().contents()));
+                lazy_located = Some(locate_cmp_ops(expr, checker.program().tokens()));
             }
             if let Some(located_op) = lazy_located.as_ref().and_then(|located| located.get(index)) {
                 assert_eq!(located_op.op, *op);
@@ -138,102 +138,85 @@ impl From<&CmpOp> for IsCmpOp {
     }
 }

-/// Extract all [`CmpOp`] operators from an expression snippet, with appropriate
-/// ranges.
+/// Extract all [`CmpOp`] operators from an expression snippet, with appropriate ranges.
 ///
-/// `RustPython` doesn't include line and column information on [`CmpOp`] nodes.
-/// `CPython` doesn't either. This method iterates over the token stream and
-/// re-identifies [`CmpOp`] nodes, annotating them with valid ranges.
-fn locate_cmp_ops(expr: &Expr, source: &str) -> Vec<LocatedCmpOp> {
-    // If `Expr` is a multi-line expression, we need to parenthesize it to
-    // ensure that it's lexed correctly.
-    let contents = &source[expr.range()];
-    let parenthesized_contents = format!("({contents})");
-    let mut tok_iter = lexer::lex(&parenthesized_contents, Mode::Expression)
-        .flatten()
-        .skip(1)
-        .map(|(tok, range)| (tok, range - TextSize::from(1)))
-        .filter(|(tok, _)| !matches!(tok, Tok::NonLogicalNewline | Tok::Comment(_)))
+/// This method iterates over the token stream and re-identifies [`CmpOp`] nodes, annotating them
+/// with valid ranges.
+fn locate_cmp_ops(expr: &Expr, tokens: &Tokens) -> Vec<LocatedCmpOp> {
+    let mut tok_iter = tokens
+        .tokens_in_range(expr.range())
+        .iter()
+        .filter(|token| !token.is_trivia())
         .peekable();

     let mut ops: Vec<LocatedCmpOp> = vec![];

-    // Track the bracket depth.
-    let mut par_count = 0u32;
-    let mut sqb_count = 0u32;
-    let mut brace_count = 0u32;
+    // Track the nesting level.
+    let mut nesting = 0u32;

     loop {
-        let Some((tok, range)) = tok_iter.next() else {
+        let Some(token) = tok_iter.next() else {
             break;
         };

-        match tok {
-            Tok::Lpar => {
-                par_count = par_count.saturating_add(1);
-            }
-            Tok::Rpar => {
-                par_count = par_count.saturating_sub(1);
-            }
-            Tok::Lsqb => {
-                sqb_count = sqb_count.saturating_add(1);
-            }
-            Tok::Rsqb => {
-                sqb_count = sqb_count.saturating_sub(1);
-            }
-            Tok::Lbrace => {
-                brace_count = brace_count.saturating_add(1);
-            }
-            Tok::Rbrace => {
-                brace_count = brace_count.saturating_sub(1);
+        match token.kind() {
+            TokenKind::Lpar | TokenKind::Lsqb | TokenKind::Lbrace => {
+                nesting = nesting.saturating_add(1);
+            }
+            TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace => {
+                nesting = nesting.saturating_sub(1);
             }
             _ => {}
         }

-        if par_count > 0 || sqb_count > 0 || brace_count > 0 {
+        if nesting > 0 {
             continue;
         }

-        match tok {
-            Tok::Not => {
-                if let Some((_, next_range)) = tok_iter.next_if(|(tok, _)| tok.is_in()) {
+        match token.kind() {
+            TokenKind::Not => {
+                if let Some(next_token) =
+                    tok_iter.next_if(|token| token.kind() == TokenKind::In)
+                {
                     ops.push(LocatedCmpOp::new(
-                        TextRange::new(range.start(), next_range.end()),
+                        TextRange::new(token.start(), next_token.end()),
                         CmpOp::NotIn,
                     ));
                 }
             }
-            Tok::In => {
-                ops.push(LocatedCmpOp::new(range, CmpOp::In));
+            TokenKind::In => {
+                ops.push(LocatedCmpOp::new(token.range(), CmpOp::In));
             }
-            Tok::Is => {
-                let op = if let Some((_, next_range)) = tok_iter.next_if(|(tok, _)| tok.is_not()) {
+            TokenKind::Is => {
+                let op = if let Some(next_token) =
+                    tok_iter.next_if(|token| token.kind() == TokenKind::Not)
+                {
                     LocatedCmpOp::new(
-                        TextRange::new(range.start(), next_range.end()),
+                        TextRange::new(token.start(), next_token.end()),
                         CmpOp::IsNot,
                     )
                 } else {
-                    LocatedCmpOp::new(range, CmpOp::Is)
+                    LocatedCmpOp::new(token.range(), CmpOp::Is)
                 };
                 ops.push(op);
             }
-            Tok::NotEqual => {
-                ops.push(LocatedCmpOp::new(range, CmpOp::NotEq));
+            TokenKind::NotEqual => {
+                ops.push(LocatedCmpOp::new(token.range(), CmpOp::NotEq));
             }
-            Tok::EqEqual => {
-                ops.push(LocatedCmpOp::new(range, CmpOp::Eq));
+            TokenKind::EqEqual => {
+                ops.push(LocatedCmpOp::new(token.range(), CmpOp::Eq));
            }
-            Tok::GreaterEqual => {
-                ops.push(LocatedCmpOp::new(range, CmpOp::GtE));
+            TokenKind::GreaterEqual => {
+                ops.push(LocatedCmpOp::new(token.range(), CmpOp::GtE));
             }
-            Tok::Greater => {
-                ops.push(LocatedCmpOp::new(range, CmpOp::Gt));
+            TokenKind::Greater => {
+                ops.push(LocatedCmpOp::new(token.range(), CmpOp::Gt));
             }
-            Tok::LessEqual => {
-                ops.push(LocatedCmpOp::new(range, CmpOp::LtE));
+            TokenKind::LessEqual => {
+                ops.push(LocatedCmpOp::new(token.range(), CmpOp::LtE));
             }
-            Tok::Less => {
-                ops.push(LocatedCmpOp::new(range, CmpOp::Lt));
+            TokenKind::Less => {
+                ops.push(LocatedCmpOp::new(token.range(), CmpOp::Lt));
             }
             _ => {}
         }
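
The `not in` and `is not` arms above fuse two adjacent keyword tokens into a single located operator via `Peekable::next_if`. A standalone sketch of that peek-ahead, using plain string tokens rather than the parser's types:

```rust
// Standalone illustration of the `next_if` peek-ahead: when a `not` token is
// immediately followed by `in`, consume both and report one `NotIn` operator.
fn main() {
    let tokens = ["x", "not", "in", "1"];
    let mut iter = tokens.iter().peekable();
    while let Some(&tok) = iter.next() {
        if tok == "not" {
            // Only consumes the next token when it is `in`; otherwise leaves it
            // in place for the next loop iteration.
            if let Some(&next) = iter.next_if(|&&t| t == "in") {
                println!("fused `{tok} {next}` into NotIn");
            }
        }
    }
}
```

For `x not in 1`, the fused operator runs from the start of `not` to the end of `in`, which is exactly the 2..8 range asserted in the tests below.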
@@ -266,72 +249,70 @@ mod tests {

     use super::{locate_cmp_ops, LocatedCmpOp};

+    fn extract_cmp_op_locations(source: &str) -> Result<Vec<LocatedCmpOp>> {
+        let program = parse_expression(source)?;
+        Ok(locate_cmp_ops(program.expr(), program.tokens()))
+    }
+
     #[test]
-    fn extract_cmp_op_location() -> Result<()> {
+    fn test_locate_cmp_ops() -> Result<()> {
         let contents = "x == 1";
-        let expr = parse_expression(contents)?.expr();
         assert_eq!(
-            locate_cmp_ops(expr, contents),
+            extract_cmp_op_locations(contents)?,
             vec![LocatedCmpOp::new(
                 TextSize::from(2)..TextSize::from(4),
                 CmpOp::Eq
             )]
         );

         let contents = "x != 1";
-        let expr = parse_expression(contents)?.expr();
         assert_eq!(
-            locate_cmp_ops(expr, contents),
+            extract_cmp_op_locations(contents)?,
             vec![LocatedCmpOp::new(
                 TextSize::from(2)..TextSize::from(4),
                 CmpOp::NotEq
             )]
         );

         let contents = "x is 1";
-        let expr = parse_expression(contents)?.expr();
         assert_eq!(
-            locate_cmp_ops(expr, contents),
+            extract_cmp_op_locations(contents)?,
             vec![LocatedCmpOp::new(
                 TextSize::from(2)..TextSize::from(4),
                 CmpOp::Is
             )]
         );

         let contents = "x is not 1";
-        let expr = parse_expression(contents)?.expr();
         assert_eq!(
-            locate_cmp_ops(expr, contents),
+            extract_cmp_op_locations(contents)?,
             vec![LocatedCmpOp::new(
                 TextSize::from(2)..TextSize::from(8),
                 CmpOp::IsNot
             )]
         );

         let contents = "x in 1";
-        let expr = parse_expression(contents)?.expr();
         assert_eq!(
-            locate_cmp_ops(expr, contents),
+            extract_cmp_op_locations(contents)?,
             vec![LocatedCmpOp::new(
                 TextSize::from(2)..TextSize::from(4),
                 CmpOp::In
             )]
         );

         let contents = "x not in 1";
-        let expr = parse_expression(contents)?.expr();
         assert_eq!(
-            locate_cmp_ops(expr, contents),
+            extract_cmp_op_locations(contents)?,
             vec![LocatedCmpOp::new(
                 TextSize::from(2)..TextSize::from(8),
                 CmpOp::NotIn
             )]
         );

         let contents = "x != (1 is not 2)";
-        let expr = parse_expression(contents)?.expr();
         assert_eq!(
-            locate_cmp_ops(expr, contents),
+            extract_cmp_op_locations(contents)?,
             vec![LocatedCmpOp::new(
                 TextSize::from(2)..TextSize::from(4),
                 CmpOp::NotEq
@@ -1,7 +1,7 @@
 use insta::assert_debug_snapshot;

-use ruff_python_parser::lexer::lex;
-use ruff_python_parser::{Mode, Tok};
+use ruff_python_parser::{parse_module, Mode, Tok};
 use ruff_python_trivia::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
 use ruff_python_trivia::{BackwardsTokenizer, SimpleTokenKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};
@@ -23,17 +23,8 @@ impl TokenizationTestCase {
     }

     fn tokenize_reverse(&self) -> Vec<SimpleToken> {
-        let comment_ranges: Vec<_> = lex(self.source, Mode::Module)
-            .filter_map(|result| {
-                let (token, range) = result.expect("Input to be a valid python program.");
-                if matches!(token, Tok::Comment(_)) {
-                    Some(range)
-                } else {
-                    None
-                }
-            })
-            .collect();
-        BackwardsTokenizer::new(self.source, self.range, &comment_ranges).collect()
+        let program = parse_module(self.source).expect("Input to be a valid Python program");
+        BackwardsTokenizer::new(self.source, self.range, program.comment_ranges()).collect()
     }

     fn tokens(&self) -> &[SimpleToken] {
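
The same parse-once idea covers comments: instead of filtering `Tok::Comment` out of a fresh `lexer::lex` run, the parsed program hands back precomputed comment ranges. A minimal sketch, assuming (as the diff above suggests) that `program.comment_ranges()` can be iterated as a sequence of `TextRange`s:

```rust
use ruff_python_parser::parse_module;

fn print_comment_spans(source: &str) {
    // Parse once; comment ranges are collected during lexing and cached on the
    // parsed program, so no second pass over the source is required.
    let program = parse_module(source).expect("valid Python source");
    // Assumption: the returned comment ranges deref to `[TextRange]`.
    for range in program.comment_ranges().iter() {
        println!("comment at {range:?}");
    }
}
```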
4 changes: 2 additions & 2 deletions crates/ruff_wasm/src/lib.rs
@@ -263,9 +263,9 @@ impl Workspace {
     }

     pub fn tokens(&self, contents: &str) -> Result<String, Error> {
-        let tokens: Vec<_> = ruff_python_parser::lexer::lex(contents, Mode::Module).collect();
+        let program = ruff_python_parser::parse_module(contents)?;

-        Ok(format!("{tokens:#?}"))
+        Ok(format!("{:#?}", program.tokens()))
     }
 }
