diff --git a/ast/Cargo.toml b/ast/Cargo.toml
index fe869346..03a566e0 100644
--- a/ast/Cargo.toml
+++ b/ast/Cargo.toml
@@ -7,14 +7,10 @@ edition = "2021"
 repository = "https://github.com/RustPython/Parser/"
 license = "MIT"
 
-[features]
-default = ["malachite-bigint"]
-
 [dependencies]
 rustpython-parser-core = { workspace = true }
 rustpython-literal = { workspace = true, optional = true }
 is-macro = { workspace = true }
-num-bigint = { workspace = true, optional = true }
-malachite-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }
 static_assertions = "1.1.0"
diff --git a/ast/src/builtin.rs b/ast/src/builtin.rs
index b7fd3c8e..e10b8245 100644
--- a/ast/src/builtin.rs
+++ b/ast/src/builtin.rs
@@ -2,8 +2,8 @@
 
 use rustpython_parser_core::text_size::TextRange;
 
-use crate::bigint::BigInt;
 use crate::Ranged;
+use num_bigint::BigInt;
 
 pub type String = std::string::String;
diff --git a/ast/src/lib.rs b/ast/src/lib.rs
index 1b12a93e..cbb12ce2 100644
--- a/ast/src/lib.rs
+++ b/ast/src/lib.rs
@@ -20,11 +20,6 @@ mod generic;
 mod impls;
 mod ranged;
 
-#[cfg(feature = "malachite-bigint")]
-pub use malachite_bigint as bigint;
-#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))]
-pub use num_bigint as bigint;
-
 pub use builtin::*;
 pub use generic::*;
 pub use ranged::Ranged;
diff --git a/format/Cargo.toml b/format/Cargo.toml
index b11b25db..0fda5abc 100644
--- a/format/Cargo.toml
+++ b/format/Cargo.toml
@@ -13,8 +13,6 @@ rustpython-literal = { workspace = true }
 bitflags = "2.3.1"
 itertools = "0.10.5"
 num-traits = { workspace = true }
-num-bigint = { workspace = true, optional = true }
-malachite-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }
 
 [features]
-default = ["malachite-bigint"]
\ No newline at end of file
diff --git a/format/src/cformat.rs b/format/src/cformat.rs
index d835fda0..8519bbd6 100644
--- a/format/src/cformat.rs
+++ b/format/src/cformat.rs
@@ -9,7 +9,7 @@ use std::{
     str::FromStr,
 };
 
-use crate::bigint::{BigInt, Sign};
+use num_bigint::{BigInt, Sign};
 
 #[derive(Debug, PartialEq)]
 pub enum CFormatErrorType {
diff --git a/format/src/format.rs b/format/src/format.rs
index 6bc5796e..09e42b80 100644
--- a/format/src/format.rs
+++ b/format/src/format.rs
@@ -6,7 +6,7 @@ use rustpython_literal::format::Case;
 use std::ops::Deref;
 use std::{cmp, str::FromStr};
 
-use crate::bigint::{BigInt, Sign};
+use num_bigint::{BigInt, Sign};
 
 trait FormatParse {
     fn parse(text: &str) -> (Option<Self>, &str)
diff --git a/format/src/lib.rs b/format/src/lib.rs
index 61de9d55..e15074ba 100644
--- a/format/src/lib.rs
+++ b/format/src/lib.rs
@@ -1,8 +1,3 @@
-#[cfg(feature = "malachite-bigint")]
-pub use malachite_bigint as bigint;
-#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))]
-pub use num_bigint as bigint;
-
 pub use crate::format::*;
 
 pub mod cformat;
diff --git a/literal/src/escape.rs b/literal/src/escape.rs
index 082248a5..0cb07adb 100644
--- a/literal/src/escape.rs
+++ b/literal/src/escape.rs
@@ -385,7 +385,7 @@ impl<'a> Escape for AsciiEscape<'a> {
     fn layout(&self) -> &EscapeLayout {
         &self.layout
     }
-
+    #[allow(unsafe_code)]
     fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
         formatter.write_str(unsafe {
             // SAFETY: this function must be called only when source is printable ascii characters
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index b6c20ff8..f2a991ac 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -9,11 +9,7 @@ license = "MIT"
 edition = "2021"
 
 [features]
-default = ["malachite-bigint"]
 serde = ["dep:serde", "rustpython-parser-core/serde"]
-full-lexer = []
-malachite-bigint = ["dep:malachite-bigint", "rustpython-ast/malachite-bigint"]
-num-bigint = ["dep:num-bigint", "rustpython-ast/num-bigint"]
 
 [build-dependencies]
 anyhow = { workspace = true }
@@ -28,17 +24,16 @@ rustpython-parser-core = { workspace = true }
 itertools = { workspace = true }
 is-macro = { workspace = true }
 log = { workspace = true }
-malachite-bigint = { workspace = true, optional = true }
-num-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }
 num-traits = { workspace = true }
 unicode_names2 = { workspace = true }
 unic-emoji-char = "0.9.0"
 unic-ucd-ident = "0.9.0"
 lalrpop-util = { version = "0.20.0", default-features = false }
-phf = "0.11.1"
 rustc-hash = "1.1.0"
 serde = { version = "1.0.133", optional = true, default-features = false, features = ["derive"] }
+static_assertions = "1.1.0"
 
 [dev-dependencies]
 insta = { workspace = true }
diff --git a/parser/build.rs b/parser/build.rs
index e205c65f..a9bc3832 100644
--- a/parser/build.rs
+++ b/parser/build.rs
@@ -1,13 +1,10 @@
 use std::fmt::Write as _;
 use std::fs::File;
-use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::io::{BufRead, BufReader};
 use std::path::{Path, PathBuf};
 
 use tiny_keccak::{Hasher, Sha3};
 
 fn main() -> anyhow::Result<()> {
-    let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
-    gen_phf(&out_dir);
-
     const SOURCE: &str = "src/python.lalrpop";
     println!("cargo:rerun-if-changed={SOURCE}");
 
@@ -16,6 +13,7 @@ fn main() -> anyhow::Result<()> {
 
     #[cfg(feature = "lalrpop")]
     {
+        let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
         target = out_dir.join("src/python.rs");
     }
     #[cfg(not(feature = "lalrpop"))]
@@ -113,55 +111,3 @@ fn sha_equal(expected_sha3_str: &str, actual_sha3: &[u8; 32]) -> bool {
     }
     *actual_sha3 == expected_sha3
 }
-
-fn gen_phf(out_dir: &Path) {
-    let mut kwds = phf_codegen::Map::new();
-    let kwds = kwds
-        // Alphabetical keywords:
-        .entry("...", "Tok::Ellipsis")
-        .entry("False", "Tok::False")
-        .entry("None", "Tok::None")
-        .entry("True", "Tok::True")
-        // more so "standard" keywords
-        .entry("and", "Tok::And")
-        .entry("as", "Tok::As")
-        .entry("assert", "Tok::Assert")
-        .entry("async", "Tok::Async")
-        .entry("await", "Tok::Await")
-        .entry("break", "Tok::Break")
-        .entry("case", "Tok::Case")
-        .entry("class", "Tok::Class")
-        .entry("continue", "Tok::Continue")
-        .entry("def", "Tok::Def")
-        .entry("del", "Tok::Del")
-        .entry("elif", "Tok::Elif")
-        .entry("else", "Tok::Else")
-        .entry("except", "Tok::Except")
-        .entry("finally", "Tok::Finally")
-        .entry("for", "Tok::For")
-        .entry("from", "Tok::From")
-        .entry("global", "Tok::Global")
-        .entry("if", "Tok::If")
-        .entry("import", "Tok::Import")
-        .entry("in", "Tok::In")
-        .entry("is", "Tok::Is")
-        .entry("lambda", "Tok::Lambda")
-        .entry("match", "Tok::Match")
-        .entry("nonlocal", "Tok::Nonlocal")
-        .entry("not", "Tok::Not")
-        .entry("or", "Tok::Or")
-        .entry("pass", "Tok::Pass")
-        .entry("raise", "Tok::Raise")
-        .entry("return", "Tok::Return")
-        .entry("try", "Tok::Try")
-        .entry("type", "Tok::Type")
-        .entry("while", "Tok::While")
-        .entry("with", "Tok::With")
-        .entry("yield", "Tok::Yield")
-        .build();
-    writeln!(
-        BufWriter::new(File::create(out_dir.join("keywords.rs")).unwrap()),
-        "{kwds}",
-    )
-    .unwrap();
-}
diff --git a/parser/src/gen/parse.rs b/parser/src/gen/parse.rs
index fafec6a1..e56491ae 100644
--- a/parser/src/gen/parse.rs
+++ b/parser/src/gen/parse.rs
@@ -1,12 +1,10 @@
 //
This file was originally generated from asdl by a python script, but we now edit it manually impl Parse for ast::StmtFunctionDef { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -24,11 +22,8 @@ impl Parse for ast::StmtFunctionDef { } impl Parse for ast::StmtAsyncFunctionDef { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -47,11 +42,8 @@ impl Parse for ast::StmtAsyncFunctionDef { } impl Parse for ast::StmtClassDef { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -70,11 +62,8 @@ impl Parse for ast::StmtClassDef { } impl Parse for ast::StmtReturn { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -93,11 +82,8 @@ impl Parse for ast::StmtReturn { } impl Parse for ast::StmtDelete { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -116,11 +102,8 @@ impl Parse for ast::StmtDelete { } impl Parse for ast::StmtAssign { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -139,11 +122,8 @@ impl Parse for ast::StmtAssign { } impl Parse for ast::StmtTypeAlias { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -162,11 +142,8 @@ impl Parse for ast::StmtTypeAlias { } impl Parse for ast::StmtAugAssign { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -185,11 +162,8 @@ impl Parse for ast::StmtAugAssign { } impl Parse for ast::StmtAnnAssign { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -208,11 +182,8 @@ impl Parse for ast::StmtAnnAssign { } impl Parse for ast::StmtFor { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -231,11 +202,8 @@ impl Parse for ast::StmtFor { } impl Parse for ast::StmtAsyncFor { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -254,11 +222,8 @@ impl Parse for ast::StmtAsyncFor { } impl Parse for ast::StmtWhile { - fn lex_starts_at( - source: &str, - offset: TextSize, 
- ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -277,11 +242,8 @@ impl Parse for ast::StmtWhile { } impl Parse for ast::StmtIf { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -300,11 +262,8 @@ impl Parse for ast::StmtIf { } impl Parse for ast::StmtWith { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -323,11 +282,8 @@ impl Parse for ast::StmtWith { } impl Parse for ast::StmtAsyncWith { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -346,11 +302,8 @@ impl Parse for ast::StmtAsyncWith { } impl Parse for ast::StmtMatch { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -369,11 +322,8 @@ impl Parse for ast::StmtMatch { } impl Parse for ast::StmtRaise { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -392,11 +342,8 @@ impl Parse for ast::StmtRaise { } impl Parse for ast::StmtTry { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -415,11 +362,8 @@ impl Parse for ast::StmtTry { } impl Parse for ast::StmtTryStar { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -438,11 +382,8 @@ impl Parse for ast::StmtTryStar { } impl Parse for ast::StmtAssert { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -461,11 +402,8 @@ impl Parse for ast::StmtAssert { } impl Parse for ast::StmtImport { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -484,11 +422,8 @@ impl Parse for ast::StmtImport { } impl Parse for ast::StmtImportFrom { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -507,11 +442,8 @@ impl Parse for ast::StmtImportFrom { } impl Parse for ast::StmtGlobal { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -530,11 +462,8 @@ impl Parse for ast::StmtGlobal { } impl Parse for ast::StmtNonlocal { - 
fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -553,11 +482,8 @@ impl Parse for ast::StmtNonlocal { } impl Parse for ast::StmtExpr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -576,11 +502,8 @@ impl Parse for ast::StmtExpr { } impl Parse for ast::StmtPass { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -599,11 +522,8 @@ impl Parse for ast::StmtPass { } impl Parse for ast::StmtBreak { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -622,11 +542,8 @@ impl Parse for ast::StmtBreak { } impl Parse for ast::StmtContinue { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Stmt::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } fn parse_tokens( lxr: impl IntoIterator, @@ -645,11 +562,8 @@ impl Parse for ast::StmtContinue { } impl Parse for ast::ExprBoolOp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -668,11 +582,8 @@ impl Parse for ast::ExprBoolOp { } impl Parse for ast::ExprNamedExpr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -691,11 +602,8 @@ impl Parse for ast::ExprNamedExpr { } impl Parse for ast::ExprBinOp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -714,11 +622,8 @@ impl Parse for ast::ExprBinOp { } impl Parse for ast::ExprUnaryOp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -737,11 +642,8 @@ impl Parse for ast::ExprUnaryOp { } impl Parse for ast::ExprLambda { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -760,11 +662,8 @@ impl Parse for ast::ExprLambda { } impl Parse for ast::ExprIfExp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -783,11 +682,8 @@ impl Parse for ast::ExprIfExp { } impl Parse for ast::ExprDict { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ 
-806,11 +702,8 @@ impl Parse for ast::ExprDict { } impl Parse for ast::ExprSet { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -829,11 +722,8 @@ impl Parse for ast::ExprSet { } impl Parse for ast::ExprListComp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -852,11 +742,8 @@ impl Parse for ast::ExprListComp { } impl Parse for ast::ExprSetComp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -875,11 +762,8 @@ impl Parse for ast::ExprSetComp { } impl Parse for ast::ExprDictComp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -898,11 +782,8 @@ impl Parse for ast::ExprDictComp { } impl Parse for ast::ExprGeneratorExp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -921,11 +802,8 @@ impl Parse for ast::ExprGeneratorExp { } impl Parse for ast::ExprAwait { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -944,11 +822,8 @@ impl Parse for ast::ExprAwait { } impl Parse for ast::ExprYield { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -967,11 +842,8 @@ impl Parse for ast::ExprYield { } impl Parse for ast::ExprYieldFrom { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -990,11 +862,8 @@ impl Parse for ast::ExprYieldFrom { } impl Parse for ast::ExprCompare { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1013,11 +882,8 @@ impl Parse for ast::ExprCompare { } impl Parse for ast::ExprCall { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1036,11 +902,8 @@ impl Parse for ast::ExprCall { } impl Parse for ast::ExprFormattedValue { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1059,11 +922,8 @@ impl Parse for ast::ExprFormattedValue { } impl Parse for ast::ExprJoinedStr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> 
SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1082,11 +942,8 @@ impl Parse for ast::ExprJoinedStr { } impl Parse for ast::ExprConstant { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1105,11 +962,8 @@ impl Parse for ast::ExprConstant { } impl Parse for ast::ExprAttribute { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1128,11 +982,8 @@ impl Parse for ast::ExprAttribute { } impl Parse for ast::ExprSubscript { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1151,11 +1002,8 @@ impl Parse for ast::ExprSubscript { } impl Parse for ast::ExprStarred { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1174,11 +1022,8 @@ impl Parse for ast::ExprStarred { } impl Parse for ast::ExprName { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1197,11 +1042,8 @@ impl Parse for ast::ExprName { } impl Parse for ast::ExprList { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1220,11 +1062,8 @@ impl Parse for ast::ExprList { } impl Parse for ast::ExprTuple { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, @@ -1243,11 +1082,8 @@ impl Parse for ast::ExprTuple { } impl Parse for ast::ExprSlice { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } fn parse_tokens( lxr: impl IntoIterator, diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index eced33ce..1ed322a8 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -27,163 +27,46 @@ //! ``` //! //! 
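Every `Parse` impl above swaps a boilerplate `lex_starts_at` body for a one-line `mode()`. A minimal, self-contained sketch of why that is enough (stand-in types, not the crate's real `Parse`, `Mode`, or lexer): each impl only varied in which mode it lexed with, so the trait can centralize the lexing entry point once it knows the mode.

```rust
#[derive(Clone, Copy)]
enum Mode {
    Module,
    Expression,
}

// Stand-in lexer; the real one yields `LexResult`s.
fn lex(source: &str, mode: Mode) -> Vec<String> {
    let prefix = match mode {
        Mode::Module => "module",
        Mode::Expression => "expr",
    };
    source
        .split_whitespace()
        .map(|tok| format!("{prefix}:{tok}"))
        .collect()
}

trait Parse {
    fn mode() -> Mode;

    // Provided once for all implementors instead of being generated per type.
    fn lex_source(source: &str) -> Vec<String> {
        lex(source, Self::mode())
    }
}

struct StmtPass;

impl Parse for StmtPass {
    fn mode() -> Mode {
        Mode::Module
    }
}

fn main() {
    assert_eq!(StmtPass::lex_source("pass"), ["module:pass"]);
    assert_eq!(lex("1 + 2", Mode::Expression)[0], "expr:1");
}
```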
[Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html + +use std::iter::FusedIterator; +use std::{char, cmp::Ordering, str::FromStr}; + +use num_traits::{Num, Zero}; +use unic_emoji_char::is_emoji_presentation; +use unic_ucd_ident::{is_xid_continue, is_xid_start}; + +use crate::lexer::cursor::{Cursor, EOF_CHAR}; +use crate::lexer::indentation::{Character, Column, Indentation, Indentations}; + use crate::{ - ast::bigint::BigInt, soft_keywords::SoftKeywordTransformer, string::FStringErrorType, - text_size::{TextLen, TextRange, TextSize}, + text_size::{TextRange, TextSize}, token::{MagicKind, StringKind, Tok}, Mode, }; -use log::trace; -use num_traits::{Num, Zero}; -use std::{char, cmp::Ordering, ops::Index, slice::SliceIndex, str::FromStr}; -use unic_emoji_char::is_emoji_presentation; -use unic_ucd_ident::{is_xid_continue, is_xid_start}; - -// Indentations are tracked by a stack of indentation levels. IndentationLevel keeps -// track of the number of tabs and spaces at the current level. -#[derive(Clone, Copy, PartialEq, Debug, Default)] -struct IndentationLevel { - tabs: u32, - spaces: u32, -} - -impl IndentationLevel { - fn compare_strict( - &self, - other: &IndentationLevel, - location: TextSize, - ) -> Result { - // We only know for sure that we're smaller or bigger if tabs - // and spaces both differ in the same direction. Otherwise we're - // dependent on the size of tabs. - match self.tabs.cmp(&other.tabs) { - Ordering::Less => { - if self.spaces <= other.spaces { - Ok(Ordering::Less) - } else { - Err(LexicalError { - location, - error: LexicalErrorType::TabError, - }) - } - } - Ordering::Greater => { - if self.spaces >= other.spaces { - Ok(Ordering::Greater) - } else { - Err(LexicalError { - location, - error: LexicalErrorType::TabError, - }) - } - } - Ordering::Equal => Ok(self.spaces.cmp(&other.spaces)), - } - } -} - -// The indentations stack is used to keep track of the current indentation level. -// Similar to the CPython implementation, the Indentations stack always has at -// least one level which is never popped. See Reference 2.1.8. -#[derive(Debug)] -struct Indentations { - indent_stack: Vec, -} - -impl Indentations { - fn is_empty(&self) -> bool { - self.indent_stack.len() == 1 - } - - fn push(&mut self, indent: IndentationLevel) { - self.indent_stack.push(indent); - } - - fn pop(&mut self) -> Option { - if self.is_empty() { - return None; - } - self.indent_stack.pop() - } +use num_bigint::BigInt; - fn current(&self) -> &IndentationLevel { - self.indent_stack - .last() - .expect("Indentations must have at least one level") - } -} - -impl Default for Indentations { - fn default() -> Self { - Self { - indent_stack: vec![IndentationLevel::default()], - } - } -} - -// A CharWindow is a sliding window over an iterator of chars. It is used to -// allow for look-ahead when scanning tokens from the source code. -struct CharWindow, const N: usize> { - source: T, - window: [Option; N], -} - -impl CharWindow -where - T: Iterator, -{ - fn new(source: T) -> Self { - Self { - source, - window: [None; N], - } - } - - fn slide(&mut self) -> Option { - self.window.rotate_left(1); - let next = self.source.next(); - *self.window.last_mut().expect("never empty") = next; - next - } -} - -impl Index for CharWindow -where - T: Iterator, - Idx: SliceIndex<[Option]>, -{ - type Output = Idx::Output; - - fn index(&self, index: Idx) -> &Self::Output { - &self.window[index] - } -} +mod cursor; +mod indentation; /// A lexer for Python source code. 
-pub struct Lexer> { +pub struct Lexer<'source> { // Contains the source code to be lexed. - window: CharWindow, + cursor: Cursor<'source>, + source: &'source str, + // Are we at the beginning of a line? at_begin_of_line: bool, // Amount of parenthesis. nesting: usize, // Indentation levels. indentations: Indentations, - // Pending list of tokens to be returned. - pending: Vec, - // The current location. - location: TextSize, + pending_indentation: Option, // Lexer mode. mode: Mode, } -// generated in build.rs, in gen_phf() -/// A map of keywords to their tokens. -pub static KEYWORDS: phf::Map<&'static str, Tok> = - include!(concat!(env!("OUT_DIR"), "/keywords.rs")); - /// Contains a Token along with its `range`. pub type Spanned = (Tok, TextRange); /// The result of lexing a token. @@ -204,8 +87,43 @@ pub type LexResult = Result; /// } /// ``` #[inline] -pub fn lex(source: &str, mode: Mode) -> impl Iterator + '_ { - lex_starts_at(source, mode, TextSize::default()) +pub fn lex(source: &str, mode: Mode) -> SoftKeywordTransformer { + SoftKeywordTransformer::new(Lexer::new(source, mode), mode) +} + +pub struct LexStartsAtIterator { + start_offset: TextSize, + inner: I, +} + +impl Iterator for LexStartsAtIterator +where + I: Iterator, +{ + type Item = LexResult; + + #[inline] + fn next(&mut self) -> Option { + let result = match self.inner.next()? { + Ok((tok, range)) => Ok((tok, range + self.start_offset)), + Err(error) => Err(LexicalError { + location: error.location + self.start_offset, + ..error + }), + }; + + Some(result) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +impl FusedIterator for LexStartsAtIterator where I: Iterator + FusedIterator {} +impl ExactSizeIterator for LexStartsAtIterator where + I: Iterator + ExactSizeIterator +{ } /// Create a new lexer from a source string, starting at a given location. @@ -214,193 +132,219 @@ pub fn lex_starts_at( source: &str, mode: Mode, start_offset: TextSize, -) -> SoftKeywordTransformer>> { - SoftKeywordTransformer::new(Lexer::new(source.chars(), mode, start_offset), mode) +) -> LexStartsAtIterator> { + LexStartsAtIterator { + start_offset, + inner: lex(source, mode), + } } -impl Lexer -where - T: Iterator, -{ +impl<'source> Lexer<'source> { /// Create a new lexer from T and a starting location. You probably want to use /// [`lex`] instead. - pub fn new(input: T, mode: Mode, start: TextSize) -> Self { + pub fn new(input: &'source str, mode: Mode) -> Self { let mut lxr = Lexer { at_begin_of_line: true, nesting: 0, indentations: Indentations::default(), - // Usually we have less than 5 tokens pending. - pending: Vec::with_capacity(5), - location: start, - window: CharWindow::new(input), + pending_indentation: None, + + source: input, + cursor: Cursor::new(input), mode, }; - // Fill the window. - lxr.window.slide(); - lxr.window.slide(); - lxr.window.slide(); // TODO: Handle possible mismatch between BOM and explicit encoding declaration. // spell-checker:ignore feff - if let Some('\u{feff}') = lxr.window[0] { - lxr.window.slide(); - lxr.location += '\u{feff}'.text_len(); - } + lxr.cursor.eat_char('\u{feff}'); + lxr } /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier(&mut self) -> LexResult { + fn lex_identifier(&mut self, first: char) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' - match self.window[..3] { - [Some(c), Some('"' | '\''), ..] 
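The new `lex_starts_at` above no longer threads a start offset through the lexer itself; it wraps the ordinary token stream and shifts every range after the fact. A self-contained miniature of that wrapper, with stand-in token and range types in place of the crate's `LexResult` and `TextRange`:

```rust
struct StartsAt<I> {
    start: u32,
    inner: I,
}

impl<I: Iterator<Item = (&'static str, std::ops::Range<u32>)>> Iterator for StartsAt<I> {
    type Item = (&'static str, std::ops::Range<u32>);

    fn next(&mut self) -> Option<Self::Item> {
        // Shift each token's range by the requested start offset.
        let (tok, r) = self.inner.next()?;
        Some((tok, r.start + self.start..r.end + self.start))
    }
}

fn main() {
    let toks = vec![("def", 0u32..3), ("f", 4..5)].into_iter();
    let shifted: Vec<_> = StartsAt { start: 10, inner: toks }.collect();
    assert_eq!(shifted, vec![("def", 10..13), ("f", 14..15)]);
}
```

The payoff is that the lexer always starts at offset zero, and the error path (shown above) shifts `LexicalError` locations the same way.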
=> { - if let Ok(kind) = StringKind::try_from(c) { - return self.lex_string(kind); + match self.cursor.first() { + quote @ ('\'' | '"') => { + if let Ok(string_kind) = StringKind::try_from(first) { + self.cursor.bump(); + return self.lex_string(string_kind, quote); } } - [Some(c1), Some(c2), Some('"' | '\'')] => { - if let Ok(kind) = StringKind::try_from([c1, c2]) { - return self.lex_string(kind); + second @ ('f' | 'F' | 'r' | 'R' | 'b' | 'B') if is_quote(self.cursor.second()) => { + self.cursor.bump(); + + if let Ok(string_kind) = StringKind::try_from([first, second]) { + let quote = self.cursor.bump().unwrap(); + return self.lex_string(string_kind, quote); } } _ => {} - }; - - let start_pos = self.get_pos(); - let mut name = String::with_capacity(8); - while self.is_identifier_continuation() { - name.push(self.next_char().unwrap()); } - let end_pos = self.get_pos(); - if let Some(tok) = KEYWORDS.get(&name) { - Ok((tok.clone(), TextRange::new(start_pos, end_pos))) - } else { - Ok((Tok::Name { name }, TextRange::new(start_pos, end_pos))) - } + self.cursor.eat_while(is_identifier_continuation); + + let text = self.token_text(); + + let keyword = match text { + "False" => Tok::False, + "None" => Tok::None, + "True" => Tok::True, + "and" => Tok::And, + "as" => Tok::As, + "assert" => Tok::Assert, + "async" => Tok::Async, + "await" => Tok::Await, + "break" => Tok::Break, + "case" => Tok::Case, + "class" => Tok::Class, + "continue" => Tok::Continue, + "def" => Tok::Def, + "del" => Tok::Del, + "elif" => Tok::Elif, + "else" => Tok::Else, + "except" => Tok::Except, + "finally" => Tok::Finally, + "for" => Tok::For, + "from" => Tok::From, + "global" => Tok::Global, + "if" => Tok::If, + "import" => Tok::Import, + "in" => Tok::In, + "is" => Tok::Is, + "lambda" => Tok::Lambda, + "match" => Tok::Match, + "nonlocal" => Tok::Nonlocal, + "not" => Tok::Not, + "or" => Tok::Or, + "pass" => Tok::Pass, + "raise" => Tok::Raise, + "return" => Tok::Return, + "try" => Tok::Try, + "type" => Tok::Type, + "while" => Tok::While, + "with" => Tok::With, + "yield" => Tok::Yield, + _ => { + return Ok(Tok::Name { + name: text.to_string(), + }) + } + }; + + Ok(keyword) } /// Numeric lexing. The feast can start! - fn lex_number(&mut self) -> LexResult { - let start_pos = self.get_pos(); - match self.window[..2] { - [Some('0'), Some('x' | 'X')] => { - // Hex! (0xdeadbeef) - self.next_char(); - self.next_char(); - self.lex_number_radix(start_pos, 16) - } - [Some('0'), Some('o' | 'O')] => { - // Octal style! (0o377) - self.next_char(); - self.next_char(); - self.lex_number_radix(start_pos, 8) - } - [Some('0'), Some('b' | 'B')] => { - // Binary! (0b_1110_0101) - self.next_char(); - self.next_char(); - self.lex_number_radix(start_pos, 2) - } - _ => self.lex_normal_number(), + fn lex_number(&mut self, first: char) -> Result { + if first == '0' { + if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() { + self.lex_number_radix(first, Radix::Hex) + } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() { + self.lex_number_radix(first, Radix::Octal) + } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() { + self.lex_number_radix(first, Radix::Binary) + } else { + self.lex_decimal_number(first) + } + } else { + self.lex_decimal_number(first) } } /// Lex a hex/octal/decimal/binary number without a decimal point. 
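The `match` on the token text above replaces the build-time `phf` map that `gen_phf` used to emit (see the `build.rs` hunk earlier in this diff). A reduced sketch of the before/after shape, with a stand-in `Tok` enum; the compiler lowers such string matches to length-then-content comparisons, so no build step or `phf` dependency is needed:

```rust
#[derive(Debug, Clone, PartialEq)]
enum Tok {
    If,
    Else,
    While,
    Name(String),
}

// Before: `KEYWORDS.get(text).cloned()` against a generated phf::Map.
// After: a plain match, as in `lex_identifier` above.
fn keyword_or_name(text: &str) -> Tok {
    match text {
        "if" => Tok::If,
        "else" => Tok::Else,
        "while" => Tok::While,
        _ => Tok::Name(text.to_string()),
    }
}

fn main() {
    assert_eq!(keyword_or_name("while"), Tok::While);
    assert_eq!(keyword_or_name("foo"), Tok::Name("foo".into()));
}
```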
- fn lex_number_radix(&mut self, start_pos: TextSize, radix: u32) -> LexResult { - let value_text = self.radix_run(radix); - let end_pos = self.get_pos(); - let value = BigInt::from_str_radix(&value_text, radix).map_err(|e| LexicalError { - error: LexicalErrorType::OtherError(format!("{e:?}")), - location: start_pos, - })?; - Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos))) + fn lex_number_radix(&mut self, first: char, radix: Radix) -> Result { + #[cfg(debug_assertions)] + debug_assert!(matches!( + self.cursor.previous().to_ascii_lowercase(), + 'x' | 'o' | 'b' + )); + + let value_text = self.radix_run(Some(first), radix); + let value = + BigInt::from_str_radix(&value_text, radix.as_u32()).map_err(|e| LexicalError { + error: LexicalErrorType::OtherError(format!("{e:?}")), + location: self.token_range().start(), + })?; + Ok(Tok::Int { value }) } /// Lex a normal number, that is, no octal, hex or binary number. - fn lex_normal_number(&mut self) -> LexResult { - let start_pos = self.get_pos(); - let start_is_zero = self.window[0] == Some('0'); - // Normal number: - let mut value_text = self.radix_run(10); + fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> Result { + #[cfg(debug_assertions)] + debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.'); + let start_is_zero = first_digit_or_dot == '0'; - // If float: - if self.window[0] == Some('.') || self.at_exponent() { - // Take '.': - if self.window[0] == Some('.') { - if self.window[1] == Some('_') { - return Err(LexicalError { - error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()), - location: self.get_pos(), - }); - } - value_text.push(self.next_char().unwrap()); - value_text.push_str(&self.radix_run(10)); + let mut value_text = if first_digit_or_dot == '.' { + String::new() + } else { + self.radix_run(Some(first_digit_or_dot), Radix::Decimal) + }; + + let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') { + value_text.push('.'); + + if self.cursor.eat_char('_') { + return Err(LexicalError { + error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()), + location: self.offset() - TextSize::new(1), + }); } - // 1e6 for example: - if let Some('e' | 'E') = self.window[0] { - if self.window[1] == Some('_') { - return Err(LexicalError { - error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()), - location: self.get_pos(), - }); - } - value_text.push(self.next_char().unwrap().to_ascii_lowercase()); - // Optional +/- - if matches!(self.window[0], Some('-' | '+')) { - if self.window[1] == Some('_') { - return Err(LexicalError { - error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()), - location: self.get_pos(), - }); - } - value_text.push(self.next_char().unwrap()); + value_text.push_str(&self.radix_run(None, Radix::Decimal)); + true + } else { + // Normal number: + false + }; + + let is_float = match self.cursor.rest().as_bytes() { + [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] 
=> { + value_text.push('e'); + self.cursor.bump(); // e | E + + if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) { + value_text.push(sign); } - value_text.push_str(&self.radix_run(10)); + value_text.push_str(&self.radix_run(None, Radix::Decimal)); + + true } + _ => is_float, + }; + // If float: + if is_float { + // Improvement: Use `Cow` instead of pushing to value text let value = f64::from_str(&value_text).map_err(|_| LexicalError { error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()), - location: self.get_pos(), + location: self.offset(), })?; // Parse trailing 'j': - if matches!(self.window[0], Some('j' | 'J')) { - self.next_char(); - let end_pos = self.get_pos(); - Ok(( - Tok::Complex { - real: 0.0, - imag: value, - }, - TextRange::new(start_pos, end_pos), - )) + if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { + Ok(Tok::Complex { + real: 0.0, + imag: value, + }) } else { - let end_pos = self.get_pos(); - Ok((Tok::Float { value }, TextRange::new(start_pos, end_pos))) + Ok(Tok::Float { value }) } } else { // Parse trailing 'j': - if matches!(self.window[0], Some('j' | 'J')) { - self.next_char(); - let end_pos = self.get_pos(); + if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { let imag = f64::from_str(&value_text).unwrap(); - Ok(( - Tok::Complex { real: 0.0, imag }, - TextRange::new(start_pos, end_pos), - )) + Ok(Tok::Complex { real: 0.0, imag }) } else { - let end_pos = self.get_pos(); let value = value_text.parse::().unwrap(); if start_is_zero && !value.is_zero() { // leading zeros in decimal integer literals are not permitted return Err(LexicalError { error: LexicalErrorType::OtherError("Invalid Token".to_owned()), - location: self.get_pos(), + location: self.token_range().start(), }); } - Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos))) + Ok(Tok::Int { value }) } } } @@ -408,105 +352,39 @@ where /// Consume a sequence of numbers with the given radix, /// the digits can be decorated with underscores /// like this: '1_2_3_4' == '1234' - fn radix_run(&mut self, radix: u32) -> String { - let mut value_text = String::new(); + fn radix_run(&mut self, first: Option, radix: Radix) -> String { + let mut value_text = first.map_or(String::new(), |c| c.to_string()); loop { - if let Some(c) = self.take_number(radix) { + if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { value_text.push(c); - } else if self.window[0] == Some('_') - && Lexer::::is_digit_of_radix(self.window[1], radix) - { - self.next_char(); + } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { + self.cursor.bump(); } else { break; } } - value_text - } - - /// Consume a single character with the given radix. - fn take_number(&mut self, radix: u32) -> Option { - let take_char = Lexer::::is_digit_of_radix(self.window[0], radix); - - take_char.then(|| self.next_char().unwrap()) - } - - /// Test if a digit is of a certain radix. 
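Two ingredients of the rewritten number paths above, combined into one runnable sketch: `num_bigint::BigInt::from_str_radix` (the crate is now an unconditional dependency, per the manifest changes at the top of this diff) and the byte-slice pattern that detects an exponent without the old three-character window:

```rust
use num_bigint::BigInt;
use num_traits::Num; // brings `from_str_radix` into scope

// Mirrors the `self.cursor.rest().as_bytes()` match above: `e`/`E`,
// an optional sign, then a digit, all checked in one slice pattern.
fn at_exponent(rest: &str) -> bool {
    matches!(
        rest.as_bytes(),
        [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..]
    )
}

// Mirrors how `lex_number_radix` parses 0x/0o/0b literals.
fn parse_radix_literal(digits: &str, radix: u32) -> Option<BigInt> {
    BigInt::from_str_radix(digits, radix).ok()
}

fn main() {
    assert_eq!(parse_radix_literal("ff", 16), Some(BigInt::from(255)));
    assert!(at_exponent("e10") && at_exponent("E-3"));
    assert!(!at_exponent("e_1")); // underscore right after the exponent is invalid
}
```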
- fn is_digit_of_radix(c: Option, radix: u32) -> bool { - match radix { - 2 => matches!(c, Some('0'..='1')), - 8 => matches!(c, Some('0'..='7')), - 10 => matches!(c, Some('0'..='9')), - 16 => matches!(c, Some('0'..='9') | Some('a'..='f') | Some('A'..='F')), - other => unimplemented!("Radix not implemented: {}", other), - } - } - /// Test if we face '[eE][-+]?[0-9]+' - fn at_exponent(&self) -> bool { - match self.window[..2] { - [Some('e' | 'E'), Some('+' | '-')] => matches!(self.window[2], Some('0'..='9')), - [Some('e' | 'E'), Some('0'..='9')] => true, - _ => false, - } + value_text } /// Lex a single comment. - #[cfg(feature = "full-lexer")] - fn lex_comment(&mut self) -> LexResult { - let start_pos = self.get_pos(); - let mut value = String::new(); - loop { - match self.window[0] { - Some('\n' | '\r') | None => { - let end_pos = self.get_pos(); - return Ok((Tok::Comment(value), TextRange::new(start_pos, end_pos))); - } - Some(_) => {} - } - value.push(self.next_char().unwrap()); - } - } + fn lex_comment(&mut self) -> Result { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), '#'); - #[cfg(feature = "full-lexer")] - fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> { - let comment = self.lex_comment()?; - self.emit(comment); - Ok(()) - } + self.cursor.eat_while(|c| !matches!(c, '\n' | '\r')); - /// Discard comment if full-lexer is not enabled. - #[cfg(not(feature = "full-lexer"))] - fn lex_comment(&mut self) { - loop { - match self.window[0] { - Some('\n' | '\r') | None => { - return; - } - Some(_) => {} - } - self.next_char().unwrap(); - } - } - - #[cfg(not(feature = "full-lexer"))] - #[inline] - fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> { - self.lex_comment(); - Ok(()) + return Ok(Tok::Comment(self.token_text().to_string())); } /// Lex a single magic command. - fn lex_magic_command(&mut self, kind: MagicKind) -> (Tok, TextRange) { - let start_pos = self.get_pos(); - for _ in 0..u32::from(kind.prefix_len()) { - self.next_char(); - } + fn lex_magic_command(&mut self, kind: MagicKind) -> Tok { let mut value = String::new(); + loop { - match self.window[0] { - Some('\\') => { + match self.cursor.first() { + '\\' => { // Only skip the line continuation if it is followed by a newline // otherwise it is a normal backslash which is part of the magic command: // @@ -516,94 +394,78 @@ where // && ls -a | sed 's/^/\\ /' // ^^ // Don't skip these backslashes - if matches!(self.window[1], Some('\n' | '\r')) { - self.next_char(); - self.next_char(); + if self.cursor.second() == '\r' { + self.cursor.bump(); + self.cursor.bump(); + self.cursor.eat_char('\n'); continue; + } else if self.cursor.second() == '\n' { + self.cursor.bump(); + self.cursor.bump(); + continue; + } else { + self.cursor.bump(); + value.push('\\'); } } - Some('\n' | '\r') | None => { - let end_pos = self.get_pos(); - return ( - Tok::MagicCommand { kind, value }, - TextRange::new(start_pos, end_pos), - ); + '\n' | '\r' | EOF_CHAR => { + return Tok::MagicCommand { kind, value }; + } + c => { + self.cursor.bump(); + value.push(c); } - Some(_) => {} - } - value.push(self.next_char().unwrap()); - } - } - - fn lex_and_emit_magic_command(&mut self) { - let kind = match self.window[..2] { - [Some(c1), Some(c2)] => { - MagicKind::try_from([c1, c2]).map_or_else(|_| MagicKind::try_from(c1), Ok) } - // When the escape character is the last character of the file. 
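The new `lex_comment` above is a two-liner: `eat_while` to end-of-line, then slice the token text back out of the source. A stand-alone equivalent over a plain `&str` (a hypothetical helper for illustration, not the crate's `Cursor`):

```rust
// Returns the comment and the remainder of the input.
fn lex_comment(source: &str) -> (&str, &str) {
    let end = source
        .find(|c| matches!(c, '\n' | '\r'))
        .unwrap_or(source.len());
    (&source[..end], &source[end..])
}

fn main() {
    assert_eq!(lex_comment("# hi\nx = 1"), ("# hi", "\nx = 1"));
}
```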
- [Some(c), None] => MagicKind::try_from(c), - _ => return, - }; - if let Ok(kind) = kind { - let magic_command = self.lex_magic_command(kind); - self.emit(magic_command); } } /// Lex a string literal. - fn lex_string(&mut self, kind: StringKind) -> LexResult { - let start_pos = self.get_pos(); - for _ in 0..u32::from(kind.prefix_len()) { - self.next_char(); - } - let quote_char = self.next_char().unwrap(); - let mut string_content = String::with_capacity(5); + fn lex_string(&mut self, kind: StringKind, quote: char) -> Result { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), quote); // If the next two characters are also the quote character, then we have a triple-quoted // string; consume those two characters and ensure that we require a triple-quote to close - let triple_quoted = if self.window[..2] == [Some(quote_char); 2] { - self.next_char(); - self.next_char(); + let triple_quoted = if self.cursor.first() == quote && self.cursor.second() == quote { + self.cursor.bump(); + self.cursor.bump(); true } else { false }; - loop { - match self.next_char() { - Some(c) => { - if c == '\\' { - if let Some(next_c) = self.next_char() { - string_content.push('\\'); - string_content.push(next_c); - continue; - } - } - if c == '\n' && !triple_quoted { - return Err(LexicalError { - error: LexicalErrorType::OtherError( - "EOL while scanning string literal".to_owned(), - ), - location: self.get_pos(), - }); - } + let value_start = self.offset(); - if c == quote_char { - if triple_quoted { - // Look ahead at the next two characters; if we have two more - // quote_chars, it's the end of the string; consume the remaining - // closing quotes and break the loop - if self.window[..2] == [Some(quote_char); 2] { - self.next_char(); - self.next_char(); - break; - } - } else { - break; + let value_end = loop { + match self.cursor.bump() { + Some('\\') => { + if self.cursor.eat_char('\r') { + self.cursor.eat_char('\n'); + } else { + self.cursor.bump(); + } + } + Some('\r' | '\n') if !triple_quoted => { + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "EOL while scanning string literal".to_owned(), + ), + location: self.offset() - TextSize::new(1), + }); + } + Some(c) if c == quote => { + if triple_quoted { + if self.cursor.first() == quote && self.cursor.second() == quote { + self.cursor.bump(); + self.cursor.bump(); + break self.offset() - TextSize::new(3); } + } else { + break self.offset() - TextSize::new(1); } - string_content.push(c); } + + Some(_) => {} None => { return Err(LexicalError { error: if triple_quoted { @@ -611,708 +473,460 @@ where } else { LexicalErrorType::StringError }, - location: self.get_pos(), + location: self.offset(), }); } } - } - let end_pos = self.get_pos(); + }; + let tok = Tok::String { - value: string_content, + value: self.source[TextRange::new(value_start, value_end)].to_string(), kind, triple_quoted, }; - Ok((tok, TextRange::new(start_pos, end_pos))) - } - - // Checks if the character c is a valid starting character as described - // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers - fn is_identifier_start(&self, c: char) -> bool { - match c { - 'a'..='z' | 'A'..='Z' | '_' => true, - _ => is_xid_start(c), - } - } - - // Checks if the character c is a valid continuation character as described - // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers - fn is_identifier_continuation(&self) -> bool { - match self.window[0] { - Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true, - Some(c) => 
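The rewritten `lex_string` above no longer copies characters into a buffer one at a time; it records the value's start and end offsets and slices the source once at the end. A simplified stand-alone version for single-line, non-triple-quoted strings, keeping the same escape-skipping shape as the real loop:

```rust
fn scan_string_body(source: &str) -> Option<&str> {
    let mut chars = source.char_indices();
    let (_, quote) = chars.next()?;
    if quote != '\'' && quote != '"' {
        return None;
    }
    while let Some((i, c)) = chars.next() {
        match c {
            // Skip the escaped character; escapes stay unprocessed in the
            // token value, just as in the lexer above.
            '\\' => {
                chars.next();
            }
            c if c == quote => return Some(&source[1..i]),
            _ => {}
        }
    }
    None // unterminated: the real lexer reports a LexicalError here
}

fn main() {
    assert_eq!(scan_string_body("'abc'"), Some("abc"));
    assert_eq!(scan_string_body(r#"'it\'s'"#), Some(r"it\'s"));
    assert_eq!(scan_string_body("'oops"), None);
}
```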
is_xid_continue(c), - _ => false, - } + Ok(tok) } // This is the main entry point. Call this function to retrieve the next token. // This function is used by the iterator implementation. - fn inner_next(&mut self) -> LexResult { + pub fn next_token(&mut self) -> LexResult { // top loop, keep on processing, until we have something pending. - while self.pending.is_empty() { - // Detect indentation levels - if self.at_begin_of_line { - self.handle_indentations()?; - } - - self.consume_normal()?; - } - - Ok(self.pending.remove(0)) - } - - // Given we are at the start of a line, count the number of spaces and/or tabs until the first character. - fn eat_indentation(&mut self) -> Result { - // Determine indentation: - let mut spaces: u32 = 0; - let mut tabs: u32 = 0; loop { - match self.window[0] { - Some(' ') => { - /* - if tabs != 0 { - // Don't allow spaces after tabs as part of indentation. - // This is technically stricter than python3 but spaces after - // tabs is even more insane than mixing spaces and tabs. - return Some(Err(LexicalError { - error: LexicalErrorType::OtherError("Spaces not allowed as part of indentation after tabs".to_owned()), - location: self.get_pos(), - })); - } - */ - self.next_char(); - spaces += 1; - } - Some('\t') => { - if spaces != 0 { - // Don't allow tabs after spaces as part of indentation. - // This is technically stricter than python3 but spaces before - // tabs is even more insane than mixing spaces and tabs. - return Err(LexicalError { - error: LexicalErrorType::TabsAfterSpaces, - location: self.get_pos(), - }); + if let Some(indentation) = self.pending_indentation.take() { + match self.indentations.current().try_compare(&indentation) { + Ok(Ordering::Greater) => { + self.pending_indentation = Some(indentation); + self.indentations.pop(); + return Ok((Tok::Dedent, TextRange::empty(self.offset()))); } - self.next_char(); - tabs += 1; + // TODO re-add assertion + _ => {} } - Some('#') => { - self.lex_and_emit_comment()?; - spaces = 0; - tabs = 0; - } - // https://github.com/ipython/ipython/blob/635815e8f1ded5b764d66cacc80bbe25e9e2587f/IPython/core/inputtransformer2.py#L345 - Some('%' | '!' | '?' | '/' | ';' | ',') if self.mode == Mode::Jupyter => { - self.lex_and_emit_magic_command(); - } - Some('\x0C') => { - // Form feed character! - // Reset indentation for the Emacs user. - self.next_char(); - spaces = 0; - tabs = 0; - } - Some('\n' | '\r') => { - // Empty line! - #[cfg(feature = "full-lexer")] - let tok_start = self.get_pos(); - self.next_char(); - #[cfg(feature = "full-lexer")] - let tok_end = self.get_pos(); - #[cfg(feature = "full-lexer")] - self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end))); - spaces = 0; - tabs = 0; - } - None => { - spaces = 0; - tabs = 0; - break; + } + + if self.at_begin_of_line && self.nesting == 0 { + if let Some(trivia) = self.eat_logical_line_trivia()? { + break Ok(trivia); } - _ => { - self.at_begin_of_line = false; - break; + } + + self.cursor.start_token(); + if let Some(c) = self.cursor.bump() { + if let Some(normal) = self.consume_normal(c)? { + break Ok(normal); } + } else { + break self.consume_end(); } } - - Ok(IndentationLevel { tabs, spaces }) } - // Push/pop indents/dedents based on the current indentation level. 
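`next_token` above leans on `Indentation::try_compare`, whose module is added outside this excerpt. The partial order it needs is the one the removed `compare_strict` implemented: two levels are only comparable when tabs and spaces agree in direction; otherwise the answer would depend on the tab size. A self-contained version of that comparison:

```rust
use std::cmp::Ordering;

#[derive(Clone, Copy)]
struct Level {
    tabs: u32,
    spaces: u32,
}

fn try_compare(a: Level, b: Level) -> Result<Ordering, &'static str> {
    match a.tabs.cmp(&b.tabs) {
        // Only comparable when spaces move the same way as tabs.
        Ordering::Less if a.spaces <= b.spaces => Ok(Ordering::Less),
        Ordering::Greater if a.spaces >= b.spaces => Ok(Ordering::Greater),
        Ordering::Equal => Ok(a.spaces.cmp(&b.spaces)),
        _ => Err("ambiguous mix of tabs and spaces"),
    }
}

fn main() {
    let two_spaces = Level { tabs: 0, spaces: 2 };
    let one_tab = Level { tabs: 1, spaces: 0 };
    assert!(try_compare(two_spaces, two_spaces) == Ok(Ordering::Equal));
    assert!(try_compare(two_spaces, one_tab).is_err()); // tab-size dependent
}
```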
- fn handle_indentations(&mut self) -> Result<(), LexicalError> { - let indentation_level = self.eat_indentation()?; + fn eat_logical_line_trivia(&mut self) -> Result, LexicalError> { + let mut column = 0u32; + let mut character = 0u32; + + // Eat over any leading whitespace + self.cursor.start_token(); + self.cursor.eat_while(|c| { + if c == ' ' { + column += 1; + character += 1; + true + } else if c == '\t' { + character += 1; + column = (column / 2 + 1) * 2; + true + } else if c == '\x0C' { + column = 0; + true + } else { + false + } + }); - if self.nesting != 0 { - return Ok(()); - } + let token = match self.cursor.first() { + c @ ('%' | '!' | '?' | '/' | ';' | ',') if self.mode == Mode::Jupyter => { + self.cursor.start_token(); + self.cursor.bump(); + let kind = if let Ok(kind) = MagicKind::try_from([c, self.cursor.first()]) { + self.cursor.bump(); + kind + } else { + MagicKind::try_from(c).unwrap() + }; - // Determine indent or dedent: - let current_indentation = self.indentations.current(); - let ordering = indentation_level.compare_strict(current_indentation, self.get_pos())?; - match ordering { - Ordering::Equal => { - // Same same - } - Ordering::Greater => { - // New indentation level: - self.indentations.push(indentation_level); - let tok_pos = self.get_pos(); - self.emit(( - Tok::Indent, - TextRange::new( - tok_pos - - TextSize::new(indentation_level.spaces) - - TextSize::new(indentation_level.tabs), - tok_pos, - ), - )); + self.lex_magic_command(kind) } - Ordering::Less => { - // One or more dedentations - // Pop off other levels until col is found: - - loop { - let current_indentation = self.indentations.current(); - let ordering = - indentation_level.compare_strict(current_indentation, self.get_pos())?; - match ordering { - Ordering::Less => { - self.indentations.pop(); - let tok_pos = self.get_pos(); - self.emit((Tok::Dedent, TextRange::empty(tok_pos))); - } - Ordering::Equal => { - // We arrived at proper level of indentation. - break; - } - Ordering::Greater => { - return Err(LexicalError { - error: LexicalErrorType::IndentationError, - location: self.get_pos(), - }); - } - } - } + + '#' => { + self.cursor.start_token(); + self.cursor.bump(); + + self.lex_comment()? + } + + '\n' => { + self.cursor.start_token(); + self.cursor.bump(); + Tok::NonLogicalNewline + } + // `\r` or `\r\n` + '\r' => { + self.cursor.start_token(); + self.cursor.bump(); + self.cursor.eat_char('\n'); + Tok::NonLogicalNewline } - } - Ok(()) + EOF_CHAR => { + // handled by consume end of line + return Ok(None); + } + + _ => { + self.at_begin_of_line = false; + + return self.handle_indentation(Indentation::new( + Column::new(column), + Character::new(character), + )); + } + }; + + Ok(Some((token, self.token_range()))) } - // Take a look at the next character, if any, and decide upon the next steps. - fn consume_normal(&mut self) -> Result<(), LexicalError> { - if let Some(c) = self.window[0] { - // Identifiers are the most common case. - if self.is_identifier_start(c) { - let identifier = self.lex_identifier()?; - self.emit(identifier); - } else { - self.consume_character(c)?; + fn handle_indentation( + &mut self, + indentation: Indentation, + ) -> Result, LexicalError> { + let token = match self.indentations.current().try_compare(&indentation) { + // Dedent + Ok(Ordering::Greater) => { + self.indentations.pop(); + self.pending_indentation = Some(indentation); + + Some((Tok::Dedent, TextRange::empty(self.offset()))) } - } else { - // We reached end of file. 
- let tok_pos = self.get_pos(); - // First of all, we need all nestings to be finished. - if self.nesting > 0 { + Ok(Ordering::Equal) => None, + + // Indent + Ok(Ordering::Less) => { + self.indentations.push(indentation); + Some((Tok::Indent, self.token_range())) + } + Err(_) => { return Err(LexicalError { - error: LexicalErrorType::Eof, - location: tok_pos, + error: LexicalErrorType::IndentationError, + location: self.offset(), }); } + }; - // Next, insert a trailing newline, if required. - if !self.at_begin_of_line { - self.at_begin_of_line = true; - self.emit((Tok::Newline, TextRange::empty(tok_pos))); - } + Ok(token) + } - // Next, flush the indentation stack to zero. - while !self.indentations.is_empty() { - self.indentations.pop(); - self.emit((Tok::Dedent, TextRange::empty(tok_pos))); - } + // Take a look at the next character, if any, and decide upon the next steps. + fn consume_normal(&mut self, first: char) -> Result, LexicalError> { + if first.is_ascii() { + self.consume_ascii_character(first) + } else if is_unicode_identifier_start(first) { + let identifier = self.lex_identifier(first)?; + Ok(Some((identifier, self.token_range()))) + } else if is_emoji_presentation(first) { + Ok(Some(( + Tok::Name { + name: first.to_string(), + }, + self.token_range(), + ))) + } else { + Err(LexicalError { + error: LexicalErrorType::UnrecognizedToken { tok: first }, + location: self.token_start(), + }) + } + } - self.emit((Tok::EndOfFile, TextRange::empty(tok_pos))); + fn consume_end(&mut self) -> Result { + // We reached end of file. + // First of all, we need all nestings to be finished. + if self.nesting > 0 { + return Err(LexicalError { + error: LexicalErrorType::Eof, + location: self.offset(), + }); } - Ok(()) + // Next, insert a trailing newline, if required. + if !self.at_begin_of_line { + self.at_begin_of_line = true; + Ok((Tok::Newline, TextRange::empty(self.offset()))) + } + // Next, flush the indentation stack to zero. + else if self.indentations.pop().is_some() { + Ok((Tok::Dedent, TextRange::empty(self.offset()))) + } else { + Ok((Tok::EndOfFile, TextRange::empty(self.offset()))) + } } // Dispatch based on the given character. 
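Because the rewritten lexer returns one token per call (there is no `pending` vector anymore), `consume_end` above drains end-of-file incrementally: a final `Newline` if the last line was unterminated, then one `Dedent` per still-open indentation level, then `EndOfFile`. A self-contained miniature of that state machine:

```rust
#[derive(Debug, PartialEq)]
enum Tok {
    Newline,
    Dedent,
    EndOfFile,
}

struct Eof {
    newline_done: bool,
    open_indents: u32,
}

impl Eof {
    // Each call emits exactly one token, like `consume_end` above.
    fn next_token(&mut self) -> Tok {
        if !self.newline_done {
            self.newline_done = true;
            Tok::Newline
        } else if self.open_indents > 0 {
            self.open_indents -= 1;
            Tok::Dedent
        } else {
            Tok::EndOfFile
        }
    }
}

fn main() {
    let mut eof = Eof { newline_done: false, open_indents: 2 };
    let drained: Vec<_> = (0..4).map(|_| eof.next_token()).collect();
    assert_eq!(drained, [Tok::Newline, Tok::Dedent, Tok::Dedent, Tok::EndOfFile]);
}
```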
- fn consume_character(&mut self, c: char) -> Result<(), LexicalError> { - match c { - '0'..='9' => { - let number = self.lex_number()?; - self.emit(number); - } - '#' => { - self.lex_and_emit_comment()?; - } - '"' | '\'' => { - let string = self.lex_string(StringKind::String)?; - self.emit(string); - } + fn consume_ascii_character(&mut self, c: char) -> Result, LexicalError> { + let token = match c { + c if is_ascii_identifier_start(c) => self.lex_identifier(c)?, + '0'..='9' => self.lex_number(c)?, + '#' => self.lex_comment()?, + '"' | '\'' => self.lex_string(StringKind::String, c)?, '=' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::EqEqual, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Equal, TextRange::new(tok_start, tok_end))); - } + if self.cursor.eat_char('=') { + Tok::EqEqual + } else { + Tok::Equal } } '+' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::PlusEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::PlusEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Plus, TextRange::new(tok_start, tok_end))); + Tok::Plus } } '*' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::StarEqual, TextRange::new(tok_start, tok_end))); - } - Some('*') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::DoubleStarEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::DoubleStar, TextRange::new(tok_start, tok_end))); - } - } - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Star, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::StarEqual + } else if self.cursor.eat_char('*') { + if self.cursor.eat_char('=') { + Tok::DoubleStarEqual + } else { + Tok::DoubleStar } + } else { + Tok::Star } } '/' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::SlashEqual, TextRange::new(tok_start, tok_end))); - } - Some('/') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::DoubleSlashEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::DoubleSlash, TextRange::new(tok_start, tok_end))); - } - } - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Slash, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::SlashEqual + } else if self.cursor.eat_char('/') { + if self.cursor.eat_char('=') { + Tok::DoubleSlashEqual + } else { + Tok::DoubleSlash } + } else { + Tok::Slash } } '%' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::PercentEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::PercentEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Percent, TextRange::new(tok_start, tok_end))); + Tok::Percent } } '|' => { 
             '|' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                if let Some('=') = self.window[0] {
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::VbarEqual, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('=') {
+                    Tok::VbarEqual
                 } else {
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::Vbar, TextRange::new(tok_start, tok_end)));
+                    Tok::Vbar
                 }
             }
             '^' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                if let Some('=') = self.window[0] {
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::CircumflexEqual, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('=') {
+                    Tok::CircumflexEqual
                 } else {
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::CircumFlex, TextRange::new(tok_start, tok_end)));
+                    Tok::CircumFlex
                 }
             }
             '&' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                if let Some('=') = self.window[0] {
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::AmperEqual, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('=') {
+                    Tok::AmperEqual
                 } else {
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::Amper, TextRange::new(tok_start, tok_end)));
+                    Tok::Amper
                 }
             }
             '-' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                match self.window[0] {
-                    Some('=') => {
-                        self.next_char();
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::MinusEqual, TextRange::new(tok_start, tok_end)));
-                    }
-                    Some('>') => {
-                        self.next_char();
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::Rarrow, TextRange::new(tok_start, tok_end)));
-                    }
-                    _ => {
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::Minus, TextRange::new(tok_start, tok_end)));
-                    }
+                if self.cursor.eat_char('=') {
+                    Tok::MinusEqual
+                } else if self.cursor.eat_char('>') {
+                    Tok::Rarrow
+                } else {
+                    Tok::Minus
                 }
             }
             '@' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                if let Some('=') = self.window[0] {
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::AtEqual, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('=') {
+                    Tok::AtEqual
                 } else {
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::At, TextRange::new(tok_start, tok_end)));
+                    Tok::At
                 }
             }
             '!' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                if let Some('=') = self.window[0] {
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::NotEqual, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('=') {
+                    Tok::NotEqual
                 } else {
                     return Err(LexicalError {
                         error: LexicalErrorType::UnrecognizedToken { tok: '!' },
-                        location: tok_start,
+                        location: self.token_start(),
                     });
                 }
             }
-            '~' => {
-                self.eat_single_char(Tok::Tilde);
-            }
+            '~' => Tok::Tilde,
             '(' => {
-                self.eat_single_char(Tok::Lpar);
-                self.nesting += 1;
+                self.nesting = self.nesting.saturating_add(1);
+                Tok::Lpar
             }
             ')' => {
-                self.eat_single_char(Tok::Rpar);
-                if self.nesting == 0 {
-                    return Err(LexicalError {
-                        error: LexicalErrorType::NestingError,
-                        location: self.get_pos(),
-                    });
-                }
-                self.nesting -= 1;
+                self.nesting = self.nesting.saturating_sub(1);
+                Tok::Rpar
             }
             '[' => {
-                self.eat_single_char(Tok::Lsqb);
-                self.nesting += 1;
+                self.nesting = self.nesting.saturating_add(1);
+                Tok::Lsqb
             }
             ']' => {
-                self.eat_single_char(Tok::Rsqb);
-                if self.nesting == 0 {
-                    return Err(LexicalError {
-                        error: LexicalErrorType::NestingError,
-                        location: self.get_pos(),
-                    });
-                }
-                self.nesting -= 1;
+                self.nesting = self.nesting.saturating_sub(1);
+                Tok::Rsqb
             }
             '{' => {
-                self.eat_single_char(Tok::Lbrace);
-                self.nesting += 1;
+                self.nesting = self.nesting.saturating_add(1);
+                Tok::Lbrace
             }
             '}' => {
-                self.eat_single_char(Tok::Rbrace);
-                if self.nesting == 0 {
-                    return Err(LexicalError {
-                        error: LexicalErrorType::NestingError,
-                        location: self.get_pos(),
-                    });
-                }
-                self.nesting -= 1;
+                self.nesting = self.nesting.saturating_sub(1);
+                Tok::Rbrace
             }
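Replacing the `NestingError` check with `saturating_sub` changes the lexer's contract for unbalanced brackets: a stray closer no longer aborts lexing, it clamps the depth at zero and hands the `Rpar`/`Rsqb`/`Rbrace` token to the parser, which can report the syntax error with proper context. A small sketch of the clamping behaviour:

```rust
fn main() {
    let mut nesting: u32 = 0;

    nesting = nesting.saturating_add(1); // '('
    nesting = nesting.saturating_sub(1); // ')'
    // Stray ')': the old code returned LexicalErrorType::NestingError here;
    // saturating arithmetic clamps at 0 instead of underflowing the u32.
    nesting = nesting.saturating_sub(1);
    assert_eq!(nesting, 0);

    // Depth 0 also means a following '\n' lexes as a logical Newline
    // rather than a NonLogicalNewline, so recovery stays line-oriented.
}
```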
             ':' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                if let Some('=') = self.window[0] {
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::ColonEqual, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('=') {
+                    Tok::ColonEqual
                 } else {
-                    let tok_end = self.get_pos();
-                    self.emit((Tok::Colon, TextRange::new(tok_start, tok_end)));
+                    Tok::Colon
                 }
             }
-            ';' => {
-                self.eat_single_char(Tok::Semi);
-            }
+            ';' => Tok::Semi,
             '<' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                match self.window[0] {
-                    Some('<') => {
-                        self.next_char();
-                        match self.window[0] {
-                            Some('=') => {
-                                self.next_char();
-                                let tok_end = self.get_pos();
-                                self.emit((
-                                    Tok::LeftShiftEqual,
-                                    TextRange::new(tok_start, tok_end),
-                                ));
-                            }
-                            _ => {
-                                let tok_end = self.get_pos();
-                                self.emit((Tok::LeftShift, TextRange::new(tok_start, tok_end)));
-                            }
-                        }
-                    }
-                    Some('=') => {
-                        self.next_char();
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::LessEqual, TextRange::new(tok_start, tok_end)));
-                    }
-                    _ => {
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::Less, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('<') {
+                    if self.cursor.eat_char('=') {
+                        Tok::LeftShiftEqual
+                    } else {
+                        Tok::LeftShift
                     }
+                } else if self.cursor.eat_char('=') {
+                    Tok::LessEqual
+                } else {
+                    Tok::Less
                 }
             }
             '>' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                match self.window[0] {
-                    Some('>') => {
-                        self.next_char();
-                        match self.window[0] {
-                            Some('=') => {
-                                self.next_char();
-                                let tok_end = self.get_pos();
-                                self.emit((
-                                    Tok::RightShiftEqual,
-                                    TextRange::new(tok_start, tok_end),
-                                ));
-                            }
-                            _ => {
-                                let tok_end = self.get_pos();
-                                self.emit((Tok::RightShift, TextRange::new(tok_start, tok_end)));
-                            }
-                        }
-                    }
-                    Some('=') => {
-                        self.next_char();
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::GreaterEqual, TextRange::new(tok_start, tok_end)));
-                    }
-                    _ => {
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::Greater, TextRange::new(tok_start, tok_end)));
+                if self.cursor.eat_char('>') {
+                    if self.cursor.eat_char('=') {
+                        Tok::RightShiftEqual
+                    } else {
+                        Tok::RightShift
                     }
+                } else if self.cursor.eat_char('=') {
+                    Tok::GreaterEqual
+                } else {
+                    Tok::Greater
                 }
             }
-            ',' => {
-                self.eat_single_char(Tok::Comma);
-            }
+            ',' => Tok::Comma,
             '.' => {
-                if let Some('0'..='9') = self.window[1] {
-                    let number = self.lex_number()?;
-                    self.emit(number);
+                if self.cursor.first().is_ascii_digit() {
+                    self.lex_decimal_number('.')?
+                } else if self.cursor.first() == '.' && self.cursor.second() == '.' {
+                    self.cursor.bump();
+                    self.cursor.bump();
+                    Tok::Ellipsis
                 } else {
-                    let tok_start = self.get_pos();
-                    self.next_char();
-                    if self.window[..2] == [Some('.'); 2] {
-                        self.next_char();
-                        self.next_char();
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::Ellipsis, TextRange::new(tok_start, tok_end)));
-                    } else {
-                        let tok_end = self.get_pos();
-                        self.emit((Tok::Dot, TextRange::new(tok_start, tok_end)));
-                    }
+                    Tok::Dot
                 }
             }
-            '\n' | '\r' => {
-                let tok_start = self.get_pos();
-                self.next_char();
-                let tok_end = self.get_pos();
+            '\n' => {
+                if self.nesting == 0 {
+                    self.at_begin_of_line = true;
+                    Tok::Newline
+                } else {
+                    Tok::NonLogicalNewline
+                }
+            }
+            '\r' => {
+                self.cursor.eat_char('\n');
 
-                // Depending on the nesting level, we emit a logical or
-                // non-logical newline:
                 if self.nesting == 0 {
                     self.at_begin_of_line = true;
-                    self.emit((Tok::Newline, TextRange::new(tok_start, tok_end)));
+                    Tok::Newline
                 } else {
-                    #[cfg(feature = "full-lexer")]
-                    self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end)));
+                    Tok::NonLogicalNewline
                 }
             }
             ' ' | '\t' | '\x0C' => {
-                // Skip white-spaces
-                self.next_char();
-                while let Some(' ' | '\t' | '\x0C') = self.window[0] {
-                    self.next_char();
-                }
+                self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
+                return Ok(None);
             }
-            '\\' => {
-                self.next_char();
-                match self.window[0] {
-                    Some('\n' | '\r') => {
-                        self.next_char();
-                    }
-                    _ => {
-                        return Err(LexicalError {
-                            error: LexicalErrorType::LineContinuationError,
-                            location: self.get_pos(),
-                        });
-                    }
-                }
-                if self.window[0].is_none() {
+            '\\' => {
+                if self.cursor.eat_char('\r') {
+                    self.cursor.eat_char('\n');
+                } else if self.cursor.is_eof() {
                     return Err(LexicalError {
                         error: LexicalErrorType::Eof,
-                        location: self.get_pos(),
+                        location: self.token_start(),
                     });
-                }
-            }
-            _ => {
-                if is_emoji_presentation(c) {
-                    let tok_start = self.get_pos();
-                    self.next_char();
-                    let tok_end = self.get_pos();
-                    self.emit((
-                        Tok::Name {
-                            name: c.to_string(),
-                        },
-                        TextRange::new(tok_start, tok_end),
-                    ));
-                } else {
-                    let c = self.next_char();
+                } else if !self.cursor.eat_char('\n') {
                     return Err(LexicalError {
-                        error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() },
-                        location: self.get_pos(),
+                        error: LexicalErrorType::LineContinuationError,
+                        location: self.token_start(),
                     });
                 }
+                return Ok(None);
             }
-        }
-        Ok(())
+            _ => {
+                return Err(LexicalError {
+                    error: LexicalErrorType::UnrecognizedToken { tok: c },
+                    location: self.token_start(),
+                });
+            }
+        };
+
+        Ok(Some((token, self.token_range())))
     }
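The `Ok(None)` returns above are the new trivia convention: whitespace and line continuations consume input but produce no token, and the caller simply asks again. A toy model of that driver loop, with hypothetical stand-in names rather than the crate's actual types:

```rust
#[derive(Debug, PartialEq)]
enum Tok {
    Name,
}

// Ok(Some(..)) = produced a token, Ok(None) = consumed trivia only.
fn consume(input: &mut std::str::Chars) -> Result<Option<Tok>, String> {
    match input.next() {
        Some(' ' | '\t' | '\x0C') => Ok(None),
        Some(c) if c.is_ascii_alphabetic() => Ok(Some(Tok::Name)),
        Some(c) => Err(format!("unrecognized token {c:?}")),
        None => Err("end of file".to_string()),
    }
}

fn next_token(input: &mut std::str::Chars) -> Result<Tok, String> {
    loop {
        // Trivia made progress but yielded nothing; loop until a token appears.
        if let Some(tok) = consume(input)? {
            return Ok(tok);
        }
    }
}

fn main() {
    assert_eq!(next_token(&mut "   x".chars()), Ok(Tok::Name));
}
```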
 
-    // Used by single character tokens to advance the window and emit the correct token.
-    fn eat_single_char(&mut self, ty: Tok) {
-        let tok_start = self.get_pos();
-        self.next_char().unwrap_or_else(|| unsafe {
-            // SAFETY: eat_single_char has been called only after a character has been read
-            // from the window, so the window is guaranteed to be non-empty.
-            std::hint::unreachable_unchecked()
-        });
-        let tok_end = self.get_pos();
-        self.emit((ty, TextRange::new(tok_start, tok_end)));
-    }
-
-    // Helper function to go to the next character coming up.
-    fn next_char(&mut self) -> Option<char> {
-        let mut c = self.window[0];
-        self.window.slide();
-        match c {
-            Some('\r') => {
-                if self.window[0] == Some('\n') {
-                    self.location += TextSize::from(1);
-                    self.window.slide();
-                }
+    #[inline]
+    fn token_range(&self) -> TextRange {
+        let end = self.offset();
+        let len = self.cursor.token_len();
 
-                self.location += TextSize::from(1);
-                c = Some('\n');
-            }
-            #[allow(unused_variables)]
-            Some(c) => {
-                self.location += c.text_len();
-            }
-            _ => {}
-        }
-        c
+        TextRange::at(end - len, len)
+    }
+
+    #[inline]
+    fn token_text(&self) -> &'source str {
+        &self.source[self.token_range()]
     }
 
-    // Helper function to retrieve the current position.
-    fn get_pos(&self) -> TextSize {
-        self.location
+    #[inline]
+    fn offset(&self) -> TextSize {
+        TextSize::new(self.source.len() as u32) - self.cursor.text_len()
     }
 
-    // Helper function to emit a lexed token to the queue of tokens.
-    fn emit(&mut self, spanned: Spanned) {
-        self.pending.push(spanned);
+    #[inline]
+    fn token_start(&self) -> TextSize {
+        self.token_range().start()
     }
 }
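Note that `token_range` stores no start offset at all; both ends are derived from lengths. `offset()` is the total source length minus what the cursor still holds, and `token_len()` is how much of the consumed text belongs to the current token. The arithmetic, checked with plain `u32`s:

```rust
// Model of the offset arithmetic: `source_len` and the cursor's remaining
// length are enough to recover the current token's (start, end) range.
fn token_range(source_len: u32, rest_len: u32, token_len: u32) -> (u32, u32) {
    let end = source_len - rest_len; // offset(): everything consumed so far
    (end - token_len, end)           // TextRange::at(end - len, len)
}

fn main() {
    // Lexing "def foo" and having just consumed "def": 7 bytes total,
    // 4 bytes remaining (" foo"), token length 3.
    assert_eq!(token_range(7, 4, 3), (0, 3));
    // After start_token() resets the token length, consuming "foo"
    // (0 bytes remaining) yields the trailing range.
    assert_eq!(token_range(7, 0, 3), (4, 7));
}
```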
#[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::ShCap, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help2, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Paren, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Quote, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), @@ -1605,61 +1265,51 @@ mod tests { value: "foo".to_string(), kind: MagicKind::Help, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo".to_string(), kind: MagicKind::Help2, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "timeit a = b".to_string(), kind: MagicKind::Magic, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "timeit a % 3".to_string(), kind: MagicKind::Magic, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "matplotlib --inline".to_string(), kind: MagicKind::Magic, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "pwd && ls -a | sed 's/^/\\\\ /'".to_string(), kind: MagicKind::Shell, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "cd /Users/foo/Library/Application\\ Support/".to_string(), kind: MagicKind::ShCap, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Paren, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote2, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "ls".to_string(), @@ -1714,7 +1364,7 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!(r"99232 # {}", $eol); let tokens = lex_source(&source); @@ -1735,7 +1385,7 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("123 # Foo{}456", $eol); let tokens = lex_source(&source); @@ -1791,7 +1441,7 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{} return 99{}{}", $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1829,7 +1479,7 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{} if x:{}{} return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1870,7 +1520,7 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1923,7 +1573,7 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = r"x = [ @@ -1986,7 +1636,7 @@ mod tests { } 
 
     #[test]
-    #[cfg(feature = "full-lexer")]
+
     fn test_non_logical_newline_in_string_continuation() {
         let source = r"(
     'a'
@@ -2016,7 +1666,7 @@ mod tests {
     }
 
     #[test]
-    #[cfg(feature = "full-lexer")]
+
     fn test_logical_newline_line_comment() {
         let source = "#Hello\n#World\n";
         let tokens = lex_source(source);
diff --git a/parser/src/lexer/cursor.rs b/parser/src/lexer/cursor.rs
new file mode 100644
index 00000000..90f9f7b2
--- /dev/null
+++ b/parser/src/lexer/cursor.rs
@@ -0,0 +1,108 @@
+use crate::text_size::{TextLen, TextSize};
+use std::str::Chars;
+
+pub(crate) const EOF_CHAR: char = '\0';
+
+#[derive(Clone, Debug)]
+pub(super) struct Cursor<'a> {
+    chars: Chars<'a>,
+    source_length: TextSize,
+    #[cfg(debug_assertions)]
+    prev_char: char,
+}
+
+impl<'a> Cursor<'a> {
+    pub fn new(source: &'a str) -> Self {
+        Self {
+            source_length: source.text_len(),
+            chars: source.chars(),
+            #[cfg(debug_assertions)]
+            prev_char: EOF_CHAR,
+        }
+    }
+
+    /// Returns the previously consumed character. Useful for debug assertions.
+    #[cfg(debug_assertions)]
+    pub(super) const fn previous(&self) -> char {
+        self.prev_char
+    }
+
+    /// Peeks the next character from the input stream without consuming it.
+    /// Returns [EOF_CHAR] if the cursor is at the end of the file.
+    pub(super) fn first(&self) -> char {
+        self.chars.clone().next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Peeks the second character from the input stream without consuming it.
+    /// Returns [EOF_CHAR] if the position is past the end of the file.
+    pub(super) fn second(&self) -> char {
+        let mut chars = self.chars.clone();
+        chars.next();
+        chars.next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Returns the remaining, not yet consumed input as a string slice.
+    pub(super) fn rest(&self) -> &'a str {
+        self.chars.as_str()
+    }
+
+    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
+    #[allow(clippy::cast_possible_truncation)]
+    pub(super) fn text_len(&self) -> TextSize {
+        TextSize::new(self.chars.as_str().len() as u32)
+    }
+
+    pub(super) fn token_len(&self) -> TextSize {
+        self.source_length - self.text_len()
+    }
+
+    pub(super) fn start_token(&mut self) {
+        self.source_length = self.text_len()
+    }
+
+    pub(super) fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Consumes the next character
+    pub(super) fn bump(&mut self) -> Option<char> {
+        let prev = self.chars.next()?;
+
+        #[cfg(debug_assertions)]
+        {
+            self.prev_char = prev;
+        }
+
+        Some(prev)
+    }
+
+    pub(super) fn eat_char(&mut self, c: char) -> bool {
+        if self.first() == c {
+            self.bump();
+            true
+        } else {
+            false
+        }
+    }
+
+    pub(super) fn eat_if<F>(&mut self, mut predicate: F) -> Option<char>
+    where
+        F: FnMut(char) -> bool,
+    {
+        if predicate(self.first()) && !self.is_eof() {
+            self.bump()
+        } else {
+            None
+        }
+    }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // An optimized version of this was tried for e.g. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
+    }
+}
diff --git a/parser/src/lexer/indentation.rs b/parser/src/lexer/indentation.rs
new file mode 100644
index 00000000..b85e314c
--- /dev/null
+++ b/parser/src/lexer/indentation.rs
@@ -0,0 +1,124 @@
+use static_assertions::assert_eq_size;
+use std::cmp::Ordering;
+use std::fmt::Debug;
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Column(u32);
+
+impl Column {
+    pub(super) const fn new(column: u32) -> Self {
+        Self(column)
+    }
+}
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Character(u32);
+
+impl Character {
+    pub(super) const fn new(characters: u32) -> Self {
+        Self(characters)
+    }
+}
+
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Default)]
+pub(super) struct Indentation {
+    column: Column,
+    character: Character,
+}
+
+impl Indentation {
+    pub(super) const fn root() -> Self {
+        Self {
+            column: Column::new(0),
+            character: Character::new(0),
+        }
+    }
+
+    pub(super) const fn new(column: Column, character: Character) -> Self {
+        Self { character, column }
+    }
+
+    pub(super) const fn column(&self) -> Column {
+        self.column
+    }
+
+    pub(super) const fn character(&self) -> Character {
+        self.character
+    }
+
+    pub(super) fn try_compare(
+        &self,
+        other: &Indentation,
+    ) -> Result<Ordering, UnexpectedIndentation> {
+        let column_ordering = self.column.cmp(&other.column);
+        let character_ordering = self.character.cmp(&other.character);
+
+        if column_ordering == character_ordering {
+            Ok(column_ordering)
+        } else {
+            Err(UnexpectedIndentation)
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub(super) struct UnexpectedIndentation;
+
+// The indentation stack is used to keep track of the current indentation level.
+// Similar to the CPython implementation, the stack always holds at least one
+// level, which is never popped. See Reference 2.1.8.
+#[derive(Debug, Clone)]
+pub(super) struct Indentations {
+    stack: Vec<Indentation>,
+}
+
+impl Indentations {
+    pub fn is_empty(&self) -> bool {
+        self.stack.len() == 1
+    }
+
+    pub fn push(&mut self, indent: Indentation) {
+        debug_assert_eq!(self.current().try_compare(&indent), Ok(Ordering::Less));
+
+        self.stack.push(indent);
+    }
+
+    pub fn pop(&mut self) -> Option<Indentation> {
+        if self.is_empty() {
+            None
+        } else {
+            self.stack.pop()
+        }
+    }
+
+    pub fn current(&self) -> &Indentation {
+        self.stack.last().expect("Expected indentation")
+    }
+}
+
+impl Default for Indentations {
+    fn default() -> Self {
+        Self {
+            stack: vec![Indentation::root()],
+        }
+    }
+}
+
+assert_eq_size!(Indentation, u64);
+
+#[cfg(test)]
+mod tests {
+    use super::{Character, Column, Indentation};
+    use std::cmp::Ordering;
+
+    #[test]
+    fn indentation_try_compare() {
+        let tab = Indentation::new(Column::new(8), Character::new(1));
+
+        assert_eq!(tab.try_compare(&tab), Ok(Ordering::Equal));
+
+        let two_tabs = Indentation::new(Column::new(16), Character::new(2));
+        assert_eq!(two_tabs.try_compare(&tab), Ok(Ordering::Greater));
+        assert_eq!(tab.try_compare(&two_tabs), Ok(Ordering::Less));
+    }
+}
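The `(Column, Character)` pair is how `try_compare` rejects ambiguous tab/space mixes: column counts tab-expanded width while character counts raw characters, and the two orderings must agree or the relative indentation depth is undefined. The rule, restated over plain tuples:

```rust
use std::cmp::Ordering;

// (column, characters): column counts tab-expanded width, characters
// counts raw characters, mirroring indentation.rs above.
fn try_compare(a: (u32, u32), b: (u32, u32)) -> Result<Ordering, &'static str> {
    let column_ordering = a.0.cmp(&b.0);
    let character_ordering = a.1.cmp(&b.1);
    if column_ordering == character_ordering {
        Ok(column_ordering)
    } else {
        Err("ambiguous tab/space mix")
    }
}

fn main() {
    // One tab (column 8, one character) vs. four spaces (column 4, four
    // characters): deeper by column, shallower by character count, so
    // there is no consistent answer and the lexer reports an error.
    assert!(try_compare((8, 1), (4, 4)).is_err());

    // Eight spaces vs. four spaces is unambiguous.
    assert_eq!(try_compare((8, 8), (4, 4)), Ok(Ordering::Greater));
}
```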
diff --git a/parser/src/parser.rs b/parser/src/parser.rs
index aed39489..993d56fd 100644
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@@ -12,6 +12,12 @@
 //! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
 //! [`Mode`]: crate::mode
 
+use std::iter;
+
+use itertools::Itertools;
+pub(super) use lalrpop_util::ParseError as LalrpopError;
+
+use crate::lexer::{lex, lex_starts_at};
 use crate::{
     ast::{self, Ranged},
     lexer::{self, LexResult, LexicalError, LexicalErrorType},
@@ -20,11 +26,6 @@ use crate::{
     token::Tok,
     Mode,
 };
-use itertools::Itertools;
-use std::iter;
-
-use crate::{lexer::Lexer, soft_keywords::SoftKeywordTransformer};
-pub(super) use lalrpop_util::ParseError as LalrpopError;
 
 /// Parse Python code string to implementor's type.
 ///
@@ -57,44 +58,43 @@ where
     Self: Sized,
 {
     fn parse(source: &str, source_path: &str) -> Result<Self, ParseError> {
-        Self::parse_starts_at(source, source_path, TextSize::default())
+        let tokens = lex(source, Self::mode());
+
+        Self::parse_tokens(tokens, source_path)
     }
+
     fn parse_without_path(source: &str) -> Result<Self, ParseError> {
         Self::parse(source, "<unknown>")
     }
+
     fn parse_starts_at(
         source: &str,
         source_path: &str,
         offset: TextSize,
     ) -> Result<Self, ParseError> {
-        let lxr = Self::lex_starts_at(source, offset);
-        #[cfg(feature = "full-lexer")]
-        let lxr =
-            lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
-        Self::parse_tokens(lxr, source_path)
+        let tokens = lex_starts_at(source, Self::mode(), offset);
+
+        Self::parse_tokens(tokens, source_path)
     }
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>>;
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
     ) -> Result<Self, ParseError>;
+
+    fn mode() -> Mode;
 }
 
 impl Parse for ast::ModModule {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        lexer::lex_starts_at(source, Mode::Module, offset)
+    fn mode() -> Mode {
+        Mode::Module
    }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
     ) -> Result<Self, ParseError> {
-        match parse_filtered_tokens(lxr, Mode::Module, source_path)? {
+        match parse_tokens(lxr, Mode::Module, source_path)? {
             ast::Mod::Module(m) => Ok(m),
             _ => unreachable!("Mode::Module doesn't return other variant"),
         }
@@ -102,17 +102,15 @@ impl Parse for ast::ModModule {
 }
 
 impl Parse for ast::ModExpression {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        lexer::lex_starts_at(source, Mode::Expression, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
     ) -> Result<Self, ParseError> {
-        match parse_filtered_tokens(lxr, Mode::Expression, source_path)? {
+        match parse_tokens(lxr, Mode::Expression, source_path)? {
             ast::Mod::Expression(m) => Ok(m),
             _ => unreachable!("Mode::Module doesn't return other variant"),
         }
@@ -120,17 +118,14 @@ impl Parse for ast::ModExpression {
 }
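After this change an implementor only supplies `mode()` and `parse_tokens`; `parse`, `parse_without_path`, and `parse_starts_at` all derive from them. Hypothetical usage, assuming the crate is consumed as `rustpython_parser` (the file name string is arbitrary):

```rust
use rustpython_parser::{ast, Parse};

fn main() {
    // Parse::parse lexes with Self::mode() and feeds parse_tokens.
    let suite = ast::Suite::parse("x = 1\n", "<embedded>").unwrap();
    assert_eq!(suite.len(), 1);

    // Same trait, but this implementor picks Mode::Expression.
    let expr = ast::Expr::parse("1 + 2", "<embedded>").unwrap();
    assert!(matches!(expr, ast::Expr::BinOp(_)));
}
```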
 
 impl Parse for ast::ModInteractive {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        lexer::lex_starts_at(source, Mode::Interactive, offset)
+    fn mode() -> Mode {
+        Mode::Interactive
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
     ) -> Result<Self, ParseError> {
-        match parse_filtered_tokens(lxr, Mode::Interactive, source_path)? {
+        match parse_tokens(lxr, Mode::Interactive, source_path)? {
             ast::Mod::Interactive(m) => Ok(m),
             _ => unreachable!("Mode::Module doesn't return other variant"),
         }
@@ -138,12 +133,10 @@ impl Parse for ast::ModInteractive {
 }
 
 impl Parse for ast::Suite {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        ast::ModModule::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -153,12 +146,10 @@ impl Parse for ast::Suite {
 }
 
 impl Parse for ast::Stmt {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        ast::ModModule::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -186,12 +177,10 @@ impl Parse for ast::Stmt {
 }
 
 impl Parse for ast::Expr {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        ast::ModExpression::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -201,12 +190,10 @@ impl Parse for ast::Expr {
 }
 
 impl Parse for ast::Identifier {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -227,12 +214,10 @@ impl Parse for ast::Identifier {
 }
 
 impl Parse for ast::Constant {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -429,7 +414,7 @@ pub fn parse_tokens(
     source_path: &str,
 ) -> Result<ast::Mod, ParseError> {
     let lxr = lxr.into_iter();
-    #[cfg(feature = "full-lexer")]
+
     let lxr =
         lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
 
     if mode == Mode::Jupyter {
@@ -578,8 +563,10 @@ include!("gen/parse.rs");
 
 #[cfg(test)]
 mod tests {
-    use super::*;
     use crate::{ast, Parse};
+    use insta::assert_debug_snapshot;
+
+    use super::*;
 
     #[test]
     fn test_parse_empty() {
@@ -663,7 +650,6 @@ class Foo(A, B):
     }
 
     #[test]
-    #[cfg(feature = "all-nodes-with-ranges")]
     fn test_parse_class_generic_types() {
         let source = "\
 # TypeVar
@@ -694,7 +680,6 @@ class Foo[X, Y: str, *U, **P]():
         insta::assert_debug_snapshot!(ast::Suite::parse(source, "<test>").unwrap());
     }
 
     #[test]
-    #[cfg(feature = "all-nodes-with-ranges")]
     fn test_parse_function_definition() {
         let source = "\
 def func(a):
@@ -992,6 +977,57 @@ x = type = 1
         insta::assert_debug_snapshot!(ast::Suite::parse(source, "<test>").unwrap());
     }
 
+    #[test]
+    fn numeric_literals() {
+        let source = r#"x = 123456789
+x = 123456
+x = .1
+x = 1.
+x = 1E+1
+x = 1E-1
+x = 1.000_000_01
+x = 123456789.123456789
+x = 123456789.123456789E123456789
+x = 123456789E123456789
+x = 123456789J
+x = 123456789.123456789J
+x = 0XB1ACC
+x = 0B1011
+x = 0O777
+x = 0.000000006
+x = 10000
+x = 133333
+"#;
+
+        insta::assert_debug_snapshot!(ast::Suite::parse(source, "<test>").unwrap());
+    }
+
+    #[test]
+    fn numeric_literals_attribute_access() {
+        let source = r#"x = .1.is_integer()
+x = 1. 
.imag +x = 1E+1.imag +x = 1E-1.real +x = 123456789.123456789.hex() +x = 123456789.123456789E123456789 .real +x = 123456789E123456789 .conjugate() +x = 123456789J.real +x = 123456789.123456789J.__add__(0b1011.bit_length()) +x = 0XB1ACC.conjugate() +x = 0B1011 .conjugate() +x = 0O777 .real +x = 0.000000006 .hex() +x = -100.0000J + +if 10 .real: + ... + +y = 100[no] +y = 100(no) +"#; + assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()) + } + #[test] fn test_match_as_identifier() { let source = r#"\ diff --git a/parser/src/python.lalrpop b/parser/src/python.lalrpop index a9db8231..df827fe8 100644 --- a/parser/src/python.lalrpop +++ b/parser/src/python.lalrpop @@ -3,8 +3,9 @@ // See also: file:///usr/share/doc/python/html/reference/compound_stmts.html#function-definitions // See also: https://greentreesnakes.readthedocs.io/en/latest/nodes.html#keyword +use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt}, + ast::{self as ast, Ranged}, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, context::set_context, diff --git a/parser/src/python.rs b/parser/src/python.rs index f8b1aa34..5adacebc 100644 --- a/parser/src/python.rs +++ b/parser/src/python.rs @@ -1,7 +1,8 @@ // auto-generated: "lalrpop 0.20.0" -// sha3: e7613070cb5214ab021650015eb34d0a63631e3e1e61e1b26445cf55a4919727 +// sha3: 66283a7af6ea156e14559f5f1cc40e7df0127f8865083f7ab4a226187bfb4c53 +use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt}, + ast::{self as ast, Ranged}, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, context::set_context, @@ -20,8 +21,9 @@ extern crate alloc; #[allow(non_snake_case, non_camel_case_types, unused_mut, unused_variables, unused_imports, unused_parens, clippy::all)] mod __parse__Top { + use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt}, + ast::{self as ast, Ranged}, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, context::set_context, diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap.new b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap.new new file mode 100644 index 00000000..f4c708cc --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap.new @@ -0,0 +1,441 @@ +--- +source: parser/src/parser.rs +assertion_line: 1001 +expression: "ast::Suite::parse(source, \"\").unwrap()" +--- +[ + Assign( + StmtAssign { + range: 0..13, + targets: [ + Name( + ExprName { + range: 0..1, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 4..13, + value: Int( + 123456789, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 14..24, + targets: [ + Name( + ExprName { + range: 14..15, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 18..24, + value: Int( + 123456, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 25..31, + targets: [ + Name( + ExprName { + range: 25..26, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 29..31, + value: Int( + 1, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 32..38, + targets: [ + Name( + ExprName { + range: 32..33, + id: "x", + ctx: Store, + 
}, + ), + ], + value: Constant( + ExprConstant { + range: 36..38, + value: Float( + 1.0, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 39..47, + targets: [ + Name( + ExprName { + range: 39..40, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 43..47, + value: Float( + 10.0, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 48..56, + targets: [ + Name( + ExprName { + range: 48..49, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 52..56, + value: Float( + 0.1, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 57..73, + targets: [ + Name( + ExprName { + range: 57..58, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 61..73, + value: Float( + 1.00000001, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 74..97, + targets: [ + Name( + ExprName { + range: 74..75, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 78..97, + value: Float( + 123456789.12345679, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 98..131, + targets: [ + Name( + ExprName { + range: 98..99, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 102..131, + value: Float( + inf, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 132..155, + targets: [ + Name( + ExprName { + range: 132..133, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 136..155, + value: Float( + inf, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 156..170, + targets: [ + Name( + ExprName { + range: 156..157, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 160..170, + value: Complex { + real: 0.0, + imag: 123456789.0, + }, + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 171..195, + targets: [ + Name( + ExprName { + range: 171..172, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 175..195, + value: Complex { + real: 0.0, + imag: 123456789.12345679, + }, + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 196..207, + targets: [ + Name( + ExprName { + range: 196..197, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 200..207, + value: Int( + 727756, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 208..218, + targets: [ + Name( + ExprName { + range: 208..209, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 212..218, + value: Int( + 11, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 219..228, + targets: [ + Name( + ExprName { + range: 219..220, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 223..228, + value: Int( + 511, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 229..244, + targets: [ + Name( + ExprName { + range: 229..230, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 233..244, + value: Float( + 6e-9, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + 
range: 245..254, + targets: [ + Name( + ExprName { + range: 245..246, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 249..254, + value: Int( + 10000, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 255..265, + targets: [ + Name( + ExprName { + range: 255..256, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 259..265, + value: Int( + 133333, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), +] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap.new b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap.new new file mode 100644 index 00000000..c1339b7f --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap.new @@ -0,0 +1,673 @@ +--- +source: parser/src/parser.rs +assertion_line: 1028 +expression: "ast::Suite::parse(source, \"\").unwrap()" +--- +[ + Assign( + StmtAssign { + range: 0..19, + targets: [ + Name( + ExprName { + range: 0..1, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 4..19, + func: Attribute( + ExprAttribute { + range: 4..17, + value: Constant( + ExprConstant { + range: 4..6, + value: Float( + 0.1, + ), + kind: None, + }, + ), + attr: Identifier { + id: "is_integer", + range: 7..17, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 20..32, + targets: [ + Name( + ExprName { + range: 20..21, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 24..32, + value: Constant( + ExprConstant { + range: 24..26, + value: Float( + 1.0, + ), + kind: None, + }, + ), + attr: Identifier { + id: "imag", + range: 28..32, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 33..46, + targets: [ + Name( + ExprName { + range: 33..34, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 37..46, + value: Constant( + ExprConstant { + range: 37..41, + value: Float( + 10.0, + ), + kind: None, + }, + ), + attr: Identifier { + id: "imag", + range: 42..46, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 47..60, + targets: [ + Name( + ExprName { + range: 47..48, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 51..60, + value: Constant( + ExprConstant { + range: 51..55, + value: Float( + 0.1, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 56..60, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 61..90, + targets: [ + Name( + ExprName { + range: 61..62, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 65..90, + func: Attribute( + ExprAttribute { + range: 65..88, + value: Constant( + ExprConstant { + range: 65..84, + value: Float( + 123456789.12345679, + ), + kind: None, + }, + ), + attr: Identifier { + id: "hex", + range: 85..88, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 91..130, + targets: [ + Name( + ExprName { + range: 91..92, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 95..130, + value: Constant( + ExprConstant { + range: 95..124, + value: Float( + inf, + ), + kind: None, + }, + ), + attr: 
Identifier { + id: "real", + range: 126..130, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 131..167, + targets: [ + Name( + ExprName { + range: 131..132, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 135..167, + func: Attribute( + ExprAttribute { + range: 135..165, + value: Constant( + ExprConstant { + range: 135..154, + value: Float( + inf, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 156..165, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 168..187, + targets: [ + Name( + ExprName { + range: 168..169, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 172..187, + value: Constant( + ExprConstant { + range: 172..182, + value: Complex { + real: 0.0, + imag: 123456789.0, + }, + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 183..187, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 188..241, + targets: [ + Name( + ExprName { + range: 188..189, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 192..241, + func: Attribute( + ExprAttribute { + range: 192..220, + value: Constant( + ExprConstant { + range: 192..212, + value: Complex { + real: 0.0, + imag: 123456789.12345679, + }, + kind: None, + }, + ), + attr: Identifier { + id: "__add__", + range: 213..220, + }, + ctx: Load, + }, + ), + args: [ + Call( + ExprCall { + range: 221..240, + func: Attribute( + ExprAttribute { + range: 221..238, + value: Constant( + ExprConstant { + range: 221..227, + value: Int( + 11, + ), + kind: None, + }, + ), + attr: Identifier { + id: "bit_length", + range: 228..238, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + ], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 242..265, + targets: [ + Name( + ExprName { + range: 242..243, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 246..265, + func: Attribute( + ExprAttribute { + range: 246..263, + value: Constant( + ExprConstant { + range: 246..253, + value: Int( + 727756, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 254..263, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 266..289, + targets: [ + Name( + ExprName { + range: 266..267, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 270..289, + func: Attribute( + ExprAttribute { + range: 270..287, + value: Constant( + ExprConstant { + range: 270..276, + value: Int( + 11, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 278..287, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 290..305, + targets: [ + Name( + ExprName { + range: 290..291, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 294..305, + value: Constant( + ExprConstant { + range: 294..299, + value: Int( + 511, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 301..305, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 306..329, + targets: [ + Name( + ExprName { + range: 306..307, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 310..329, + func: Attribute( + 
ExprAttribute { + range: 310..327, + value: Constant( + ExprConstant { + range: 310..321, + value: Float( + 6e-9, + ), + kind: None, + }, + ), + attr: Identifier { + id: "hex", + range: 324..327, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 330..344, + targets: [ + Name( + ExprName { + range: 330..331, + id: "x", + ctx: Store, + }, + ), + ], + value: UnaryOp( + ExprUnaryOp { + range: 334..344, + op: USub, + operand: Constant( + ExprConstant { + range: 335..344, + value: Complex { + real: 0.0, + imag: 100.0, + }, + kind: None, + }, + ), + }, + ), + type_comment: None, + }, + ), + If( + StmtIf { + range: 346..366, + test: Attribute( + ExprAttribute { + range: 349..357, + value: Constant( + ExprConstant { + range: 349..351, + value: Int( + 10, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 353..357, + }, + ctx: Load, + }, + ), + body: [ + Expr( + StmtExpr { + range: 363..366, + value: Constant( + ExprConstant { + range: 363..366, + value: Ellipsis, + kind: None, + }, + ), + }, + ), + ], + elif_else_clauses: [], + }, + ), + Assign( + StmtAssign { + range: 368..379, + targets: [ + Name( + ExprName { + range: 368..369, + id: "y", + ctx: Store, + }, + ), + ], + value: Subscript( + ExprSubscript { + range: 372..379, + value: Constant( + ExprConstant { + range: 372..375, + value: Int( + 100, + ), + kind: None, + }, + ), + slice: Name( + ExprName { + range: 376..378, + id: "no", + ctx: Load, + }, + ), + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 380..391, + targets: [ + Name( + ExprName { + range: 380..381, + id: "y", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 384..391, + func: Constant( + ExprConstant { + range: 384..387, + value: Int( + 100, + ), + kind: None, + }, + ), + args: [ + Name( + ExprName { + range: 388..390, + id: "no", + ctx: Load, + }, + ), + ], + keywords: [], + }, + ), + type_comment: None, + }, + ), +] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap index c48429b1..672b6230 100644 --- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap +++ b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap @@ -6,9 +6,10 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ClassDef( StmtClassDef { range: 10..29, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 16..19, + }, bases: [], keywords: [], body: [ @@ -25,26 +26,28 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 20..21, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 20..21, + }, bound: None, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 52..76, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 58..61, + }, bases: [], keywords: [], body: [ @@ -61,21 +64,19 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 62..68, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 62..63, + }, bound: Some( Name( ExprName { range: 65..68, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -83,14 +84,16 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" 
}, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 105..138, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 111..114, + }, bases: [], keywords: [], body: [ @@ -107,14 +110,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 115..130, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 115..116, + }, bound: Some( Tuple( ExprTuple { @@ -123,18 +126,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 119..122, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), Name( ExprName { range: 124..129, - id: Identifier( - "bytes", - ), + id: "bytes", ctx: Load, }, ), @@ -146,14 +145,16 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 159..181, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 165..168, + }, bases: [], keywords: [], body: [ @@ -170,35 +171,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 169..170, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 169..170, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 172..173, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 172..173, + }, bound: None, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 200..223, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 206..209, + }, bases: [], keywords: [], body: [ @@ -215,35 +219,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 210..211, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 210..211, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 213..214, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 213..214, + }, bound: None, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 240..261, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 246..249, + }, bases: [], keywords: [], body: [ @@ -260,25 +267,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVarTuple( TypeParamTypeVarTuple { range: 250..253, - name: Identifier( - "Ts", - ), + name: Identifier { + id: "Ts", + range: 251..253, + }, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 275..296, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 281..284, + }, bases: [], keywords: [], body: [ @@ -295,25 +304,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ ParamSpec( TypeParamParamSpec { range: 285..288, - name: Identifier( - "P", - ), + name: Identifier { + id: "P", + range: 287..288, + }, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 312..351, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 318..321, + }, bases: [], keywords: [], body: [ @@ -323,30 +334,29 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 322..323, - name: Identifier( - "X", - ), + name: Identifier { + id: "X", + range: 322..323, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 325..331, - 
name: Identifier( - "Y", - ), + name: Identifier { + id: "Y", + range: 325..326, + }, bound: Some( Name( ExprName { range: 328..331, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -356,20 +366,23 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" TypeVarTuple( TypeParamTypeVarTuple { range: 333..335, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 334..335, + }, }, ), ParamSpec( TypeParamParamSpec { range: 337..340, - name: Identifier( - "P", - ), + name: Identifier { + id: "P", + range: 339..340, + }, }, ), ], + decorator_list: [], }, ), ] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap index 2d65a64e..f84851f8 100644 --- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap +++ b/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap @@ -6,20 +6,22 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" FunctionDef( StmtFunctionDef { range: 0..20, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 4..8, + }, args: Arguments { - range: 9..10, + range: 8..11, posonlyargs: [], args: [ ArgWithDefault { range: 9..10, def: Arg { range: 9..10, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 9..10, + }, annotation: None, type_comment: None, }, @@ -46,34 +48,34 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 22..53, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 26..30, + }, args: Arguments { - range: 34..38, + range: 33..39, posonlyargs: [], args: [ ArgWithDefault { range: 34..38, def: Arg { range: 34..38, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 34..35, + }, annotation: Some( Name( ExprName { range: 37..38, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), @@ -106,51 +108,50 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 43..44, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), ), - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 31..32, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 31..32, + }, bound: None, }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 55..91, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 59..63, + }, args: Arguments { - range: 72..76, + range: 71..77, posonlyargs: [], args: [ ArgWithDefault { range: 72..76, def: Arg { range: 72..76, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 72..73, + }, annotation: Some( Name( ExprName { range: 75..76, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), @@ -183,28 +184,24 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 81..82, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), ), - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 64..70, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 64..65, + }, bound: Some( Name( ExprName { range: 67..70, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -212,32 +209,33 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 
93..138, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 97..101, + }, args: Arguments { - range: 119..123, + range: 118..124, posonlyargs: [], args: [ ArgWithDefault { range: 119..123, def: Arg { range: 119..123, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 119..120, + }, annotation: Some( Name( ExprName { range: 122..123, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), @@ -270,21 +268,19 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 128..129, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), ), - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 102..117, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 102..103, + }, bound: Some( Tuple( ExprTuple { @@ -293,18 +289,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 106..109, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), Name( ExprName { range: 111..116, - id: Identifier( - "bytes", - ), + id: "bytes", ctx: Load, }, ), @@ -316,24 +308,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 140..171, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 144..148, + }, args: Arguments { - range: 154..161, + range: 153..162, posonlyargs: [], args: [], vararg: Some( Arg { range: 155..161, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 155..156, + }, annotation: Some( Starred( ExprStarred { @@ -341,9 +336,7 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" value: Name( ExprName { range: 159..161, - id: Identifier( - "Ts", - ), + id: "Ts", ctx: Load, }, ), @@ -373,35 +366,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [ TypeVarTuple( TypeParamTypeVarTuple { range: 149..152, - name: Identifier( - "Ts", - ), + name: Identifier { + id: "Ts", + range: 150..152, + }, }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 173..230, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 177..181, + }, args: Arguments { - range: 187..220, + range: 186..221, posonlyargs: [], args: [], vararg: Some( Arg { range: 188..200, - arg: Identifier( - "args", - ), + arg: Identifier { + id: "args", + range: 188..192, + }, annotation: Some( Attribute( ExprAttribute { @@ -409,15 +405,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" value: Name( ExprName { range: 194..195, - id: Identifier( - "P", - ), + id: "P", ctx: Load, }, ), - attr: Identifier( - "args", - ), + attr: Identifier { + id: "args", + range: 196..200, + }, ctx: Load, }, ), @@ -429,9 +424,10 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" kwarg: Some( Arg { range: 204..220, - arg: Identifier( - "kwargs", - ), + arg: Identifier { + id: "kwargs", + range: 204..210, + }, annotation: Some( Attribute( ExprAttribute { @@ -439,15 +435,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" value: Name( ExprName { range: 212..213, - id: Identifier( - "P", - ), + id: "P", ctx: Load, }, ), - attr: Identifier( - "kwargs", - ), + attr: Identifier { + id: "kwargs", + range: 214..220, + }, ctx: Load, }, ), @@ -472,25 +467,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [ ParamSpec( TypeParamParamSpec { range: 182..185, - name: Identifier( - 
"P", - ), + name: Identifier { + id: "P", + range: 184..185, + }, }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 232..273, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 236..240, + }, args: Arguments { range: 261..263, posonlyargs: [], @@ -508,30 +505,29 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 241..242, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 241..242, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 244..250, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 244..245, + }, bound: Some( Name( ExprName { range: 247..250, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -541,20 +537,23 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" TypeVarTuple( TypeParamTypeVarTuple { range: 252..255, - name: Identifier( - "Ts", - ), + name: Identifier { + id: "Ts", + range: 253..255, + }, }, ), ParamSpec( TypeParamParamSpec { range: 257..260, - name: Identifier( - "P", - ), + name: Identifier { + id: "P", + range: 259..260, + }, }, ), ], + type_comment: None, }, ), ] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__regression.snap.new b/parser/src/snapshots/rustpython_parser__parser__tests__regression.snap.new new file mode 100644 index 00000000..79f4d24e --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__parser__tests__regression.snap.new @@ -0,0 +1,220 @@ +--- +source: parser/src/parser.rs +assertion_line: 992 +expression: "ast::Suite::parse(source, \"\").unwrap()" +--- +[ + FunctionDef( + StmtFunctionDef { + range: 0..157, + name: Identifier { + id: "f", + range: 4..5, + }, + args: Arguments { + range: 5..7, + posonlyargs: [], + args: [], + vararg: None, + kwonlyargs: [], + kwarg: None, + }, + body: [ + With( + StmtWith { + range: 13..43, + items: [ + WithItem { + range: 18..29, + context_expr: Call( + ExprCall { + range: 18..23, + func: Name( + ExprName { + range: 18..21, + id: "foo", + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + optional_vars: Some( + Name( + ExprName { + range: 27..29, + id: "x1", + ctx: Store, + }, + ), + ), + }, + ], + body: [ + Pass( + StmtPass { + range: 39..43, + }, + ), + ], + type_comment: None, + }, + ), + With( + StmtWith { + range: 53..89, + items: [ + WithItem { + range: 58..75, + context_expr: Call( + ExprCall { + range: 58..63, + func: Name( + ExprName { + range: 58..61, + id: "foo", + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + optional_vars: Some( + Tuple( + ExprTuple { + range: 67..75, + elts: [ + Name( + ExprName { + range: 68..70, + id: "x2", + ctx: Store, + }, + ), + Name( + ExprName { + range: 72..74, + id: "y2", + ctx: Store, + }, + ), + ], + ctx: Store, + }, + ), + ), + }, + ], + body: [ + Pass( + StmtPass { + range: 85..89, + }, + ), + ], + type_comment: None, + }, + ), + With( + StmtWith { + range: 99..157, + items: [ + WithItem { + range: 105..116, + context_expr: Call( + ExprCall { + range: 105..110, + func: Name( + ExprName { + range: 105..108, + id: "foo", + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + optional_vars: Some( + Name( + ExprName { + range: 114..116, + id: "x3", + ctx: Store, + }, + ), + ), + }, + WithItem { + range: 118..129, + context_expr: Call( + ExprCall { + range: 118..123, + func: Name( + ExprName { + range: 118..121, + id: "foo", + ctx: Load, + }, + ), + args: [], + keywords: [], + 
}, + ), + optional_vars: Some( + Name( + ExprName { + range: 127..129, + id: "y3", + ctx: Store, + }, + ), + ), + }, + WithItem { + range: 131..142, + context_expr: Call( + ExprCall { + range: 131..136, + func: Name( + ExprName { + range: 131..134, + id: "foo", + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + optional_vars: Some( + Name( + ExprName { + range: 140..142, + id: "z3", + ctx: Store, + }, + ), + ), + }, + ], + body: [ + Pass( + StmtPass { + range: 153..157, + }, + ), + ], + type_comment: None, + }, + ), + ], + decorator_list: [], + returns: None, + type_params: [], + type_comment: None, + }, + ), +] diff --git a/parser/src/soft_keywords.rs b/parser/src/soft_keywords.rs index 9abcd395..51278a46 100644 --- a/parser/src/soft_keywords.rs +++ b/parser/src/soft_keywords.rs @@ -134,7 +134,6 @@ where self.start_of_line = next.as_ref().map_or(false, |lex_result| { lex_result.as_ref().map_or(false, |(tok, _)| { - #[cfg(feature = "full-lexer")] if matches!(tok, Tok::NonLogicalNewline | Tok::Comment { .. }) { return self.start_of_line; } diff --git a/parser/src/token.rs b/parser/src/token.rs index 1fd78251..46586aae 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -4,8 +4,8 @@ //! loosely based on the token definitions found in the [CPython source]. //! //! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h -use crate::ast::bigint::BigInt; use crate::{text_size::TextSize, Mode}; +use num_bigint::BigInt; use std::fmt; /// The set of tokens the Python source code can be tokenized in. @@ -51,13 +51,11 @@ pub enum Tok { kind: MagicKind, }, /// Token value for a comment. These are filtered out of the token stream prior to parsing. - #[cfg(feature = "full-lexer")] Comment(String), /// Token value for a newline. Newline, /// Token value for a newline that is not a logical line break. These are filtered out of /// the token stream prior to parsing. - #[cfg(feature = "full-lexer")] NonLogicalNewline, /// Token value for an indent. Indent, @@ -235,7 +233,7 @@ impl fmt::Display for Tok { } MagicCommand { kind, value } => write!(f, "{kind}{value}"), Newline => f.write_str("Newline"), - #[cfg(feature = "full-lexer")] + NonLogicalNewline => f.write_str("NonLogicalNewline"), Indent => f.write_str("Indent"), Dedent => f.write_str("Dedent"), @@ -249,7 +247,7 @@ impl fmt::Display for Tok { Rsqb => f.write_str("']'"), Colon => f.write_str("':'"), Comma => f.write_str("','"), - #[cfg(feature = "full-lexer")] + Comment(value) => f.write_str(value), Semi => f.write_str("';'"), Plus => f.write_str("'+'"),
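
Two of the changes in this patch are easiest to see from the consumer side. The snapshot updates show `Identifier` changing from a plain tuple wrapper, `Identifier("func")`, to a struct that carries its own source range, `Identifier { id: "func", range: 97..101 }`. Below is a minimal sketch of reading that range after parsing; the `id` and `range` field names are assumed from the Debug output in the snapshots, and the entry point is the crate's documented `ast::Suite::parse`:

    use rustpython_parser::{ast, Parse};

    fn main() {
        let source = "def func(a): pass";
        // `Parse` must be in scope for `Suite::parse` to resolve.
        let suite = ast::Suite::parse(source, "<test>").unwrap();
        if let Some(ast::Stmt::FunctionDef(def)) = suite.first() {
            // Before this patch only the identifier text was stored; now the
            // node also records where it sits in the source (assumed fields).
            println!("name {:?} spans {:?}", def.name.id, def.name.range);
        }
    }

The `type_comment` lines moving after `type_params` in the snapshots reflect a field reorder in `StmtFunctionDef`'s Debug output, not a behavioral change.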
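
With the `malachite-bigint` and `full-lexer` features removed, `BigInt` always means `num_bigint::BigInt`, and the `Comment` and `NonLogicalNewline` token variants exist unconditionally, which is what lets `soft_keywords.rs` drop its `#[cfg(feature = "full-lexer")]` guard above. A sketch of a downstream token-stream consumer, under the assumption that the public `lexer::lex(source, mode)` iterator and the `Tok::Int { value }` variant are otherwise unchanged by this patch:

    use rustpython_parser::{lexer::lex, Mode, Tok};

    fn main() {
        let source = "x = 12345678901234567890  # big\n";
        for result in lex(source, Mode::Module) {
            let (tok, range) = result.expect("lexing failed");
            match tok {
                // `value` is a `num_bigint::BigInt`; the `malachite-bigint`
                // alternative and its feature flag no longer exist.
                Tok::Int { value } => println!("{range:?}: int {value}"),
                // These arms previously compiled only with the now-removed
                // `full-lexer` feature; after this patch they always exist.
                Tok::Comment(text) => println!("{range:?}: comment {text:?}"),
                Tok::NonLogicalNewline => println!("{range:?}: non-logical newline"),
                other => println!("{range:?}: {other}"),
            }
        }
    }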