diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 989f80eb..fa5f8a8f 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -37,10 +37,8 @@ jobs:
       - uses: Swatinem/rust-cache@v2
-      - name: run tests with num-bigint
-        run: cargo test --all --no-default-features --features num-bigint
-      - name: run tests with malachite-bigint and all features
-        run: cargo test --all --features malachite-bigint,full-lexer,serde
+      - name: run tests
+        run: cargo test --all --all-features

   lint:
     name: Check Rust code with rustfmt and clippy
@@ -53,9 +51,7 @@
       - name: run rustfmt
         run: cargo fmt --all -- --check
       - name: run clippy
-        run: cargo clippy --all --no-default-features --features num-bigint
-      - name: run clippy
-        run: cargo clippy --all --features malachite-bigint,full-lexer,serde -- -Dwarnings
+        run: cargo clippy --all --all-features -- -Dwarnings
       - uses: actions/setup-python@v4
         with:
diff --git a/Cargo.toml b/Cargo.toml
index 219221e6..7949185b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,23 +21,17 @@ rustpython-literal = { path = "literal" }
 rustpython-format = { path = "format" }
 rustpython-parser = { path = "parser", default-features = false }

-ahash = "0.7.6"
 anyhow = "1.0.45"
 cfg-if = "1.0"
 insta = "1.14.0"
 itertools = "0.10.3"
 is-macro = "0.2.2"
-log = "0.4.16"
 num-complex = "0.4.0"
 num-bigint = "0.4.3"
 num-traits = "0.2"
-pyo3 = { version = "0.19.0" }
-malachite-bigint = { version = "0.1.0" }
-memchr = "2.5.0"
 rand = "0.8.5"
 serde = "1.0"
 static_assertions = "1.1"
-once_cell = "1.17.1"
 unicode_names2 = { version = "0.6.0", git = "https://github.com/youknowone/unicode_names2.git", rev = "4ce16aa85cbcdd9cc830410f1a72ef9a235f2fde" }

 [profile.dev.package."*"]
diff --git a/ast/Cargo.toml b/ast/Cargo.toml
index fe869346..03a566e0 100644
--- a/ast/Cargo.toml
+++ b/ast/Cargo.toml
@@ -7,14 +7,10 @@ edition = "2021"
 repository = "https://github.com/RustPython/Parser/"
 license = "MIT"

-[features]
-default = ["malachite-bigint"]
-
 [dependencies]
 rustpython-parser-core = { workspace = true }
 rustpython-literal = { workspace = true, optional = true }
 is-macro = { workspace = true }
-num-bigint = { workspace = true, optional = true }
-malachite-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }

 static_assertions = "1.1.0"
diff --git a/ast/src/builtin.rs b/ast/src/builtin.rs
index b7fd3c8e..e10b8245 100644
--- a/ast/src/builtin.rs
+++ b/ast/src/builtin.rs
@@ -2,8 +2,8 @@

 use rustpython_parser_core::text_size::TextRange;

-use crate::bigint::BigInt;
 use crate::Ranged;
+use num_bigint::BigInt;

 pub type String = std::string::String;
diff --git a/ast/src/generic.rs b/ast/src/generic.rs
index db255fca..df9575da 100644
--- a/ast/src/generic.rs
+++ b/ast/src/generic.rs
@@ -1,6 +1,6 @@
 #![allow(clippy::derive_partial_eq_without_eq)]
-use crate::text_size::TextRange;
-pub use crate::{builtin::*, text_size::TextSize, ConversionFlag, Node};
+use crate::text_size::{TextRange, TextSize};
+pub(crate) use crate::{builtin::*, ConversionFlag, Node};
 use std::fmt::{self, Debug};

 // This file was originally generated from asdl by a python script, but we now edit it manually
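The hunks above collapse the dual `num-bigint`/`malachite-bigint` feature matrix into a single `num-bigint` backend, which is why CI shrinks to one test and one clippy invocation. For downstream code the visible change is the import path: the feature-gated `bigint` alias re-export disappears. A minimal sketch of a hypothetical consumer (`Constant::Int` is the real AST variant; the helper function is illustrative only):

```rust
// Before this change, consumers went through the feature-dependent alias:
//     use rustpython_ast::bigint::BigInt;
// Afterwards there is exactly one backend, imported directly:
use num_bigint::BigInt;
use rustpython_ast::Constant;

// Illustrative helper: build an AST integer constant from a host integer.
fn int_constant(value: i64) -> Constant {
    Constant::Int(BigInt::from(value))
}
```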
diff --git a/ast/src/lib.rs b/ast/src/lib.rs
index 1b12a93e..cbb12ce2 100644
--- a/ast/src/lib.rs
+++ b/ast/src/lib.rs
@@ -20,11 +20,6 @@ mod generic;
 mod impls;
 mod ranged;

-#[cfg(feature = "malachite-bigint")]
-pub use malachite_bigint as bigint;
-#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))]
-pub use num_bigint as bigint;
-
 pub use builtin::*;
 pub use generic::*;
 pub use ranged::Ranged;
diff --git a/ast/src/ranged.rs b/ast/src/ranged.rs
index f1d08b91..1893fd1c 100644
--- a/ast/src/ranged.rs
+++ b/ast/src/ranged.rs
@@ -2,8 +2,6 @@

 use crate::text_size::{TextRange, TextSize};

-pub use crate::builtin::*;
-
 pub trait Ranged {
     fn range(&self) -> TextRange;

diff --git a/core/Cargo.toml b/core/Cargo.toml
index 82e1cae5..2c477757 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -13,7 +13,6 @@ ruff_text_size = { path = "../ruff_text_size" }
 serde = { version = "1.0.133", optional = true, default-features = false, features = ["derive"] }

 is-macro.workspace = true
-memchr.workspace = true

 [features]
 default = []
diff --git a/format/Cargo.toml b/format/Cargo.toml
index b11b25db..0fda5abc 100644
--- a/format/Cargo.toml
+++ b/format/Cargo.toml
@@ -13,8 +13,6 @@ rustpython-literal = { workspace = true }
 bitflags = "2.3.1"
 itertools = "0.10.5"
 num-traits = { workspace = true }
-num-bigint = { workspace = true, optional = true }
-malachite-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }

 [features]
-default = ["malachite-bigint"]
\ No newline at end of file
diff --git a/format/src/cformat.rs b/format/src/cformat.rs
index d835fda0..8519bbd6 100644
--- a/format/src/cformat.rs
+++ b/format/src/cformat.rs
@@ -9,7 +9,7 @@ use std::{
     str::FromStr,
 };

-use crate::bigint::{BigInt, Sign};
+use num_bigint::{BigInt, Sign};

 #[derive(Debug, PartialEq)]
 pub enum CFormatErrorType {
diff --git a/format/src/format.rs b/format/src/format.rs
index 6bc5796e..09e42b80 100644
--- a/format/src/format.rs
+++ b/format/src/format.rs
@@ -6,7 +6,7 @@ use rustpython_literal::format::Case;
 use std::ops::Deref;
 use std::{cmp, str::FromStr};

-use crate::bigint::{BigInt, Sign};
+use num_bigint::{BigInt, Sign};

 trait FormatParse {
     fn parse(text: &str) -> (Option<Self>, &str)
diff --git a/format/src/lib.rs b/format/src/lib.rs
index 61de9d55..e15074ba 100644
--- a/format/src/lib.rs
+++ b/format/src/lib.rs
@@ -1,8 +1,3 @@
-#[cfg(feature = "malachite-bigint")]
-pub use malachite_bigint as bigint;
-#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))]
-pub use num_bigint as bigint;
-
 pub use crate::format::*;

 pub mod cformat;
diff --git a/literal/src/escape.rs b/literal/src/escape.rs
index 082248a5..0cb07adb 100644
--- a/literal/src/escape.rs
+++ b/literal/src/escape.rs
@@ -385,7 +385,7 @@ impl<'a> Escape for AsciiEscape<'a> {
     fn layout(&self) -> &EscapeLayout {
         &self.layout
     }
-
+    #[allow(unsafe_code)]
     fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
         formatter.write_str(unsafe {
             // SAFETY: this function must be called only when source is printable ascii characters
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index b6c20ff8..129db9f8 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -9,16 +9,11 @@ license = "MIT"
 edition = "2021"

 [features]
-default = ["malachite-bigint"]
 serde = ["dep:serde", "rustpython-parser-core/serde"]
-full-lexer = []
-malachite-bigint = ["dep:malachite-bigint", "rustpython-ast/malachite-bigint"]
-num-bigint = ["dep:num-bigint", "rustpython-ast/num-bigint"]

 [build-dependencies]
 anyhow = { workspace = true }
 lalrpop = { version = "0.20.0", default-features = false, optional = true }
-phf_codegen = "0.11.1"
 tiny-keccak = { version = "2", features = ["sha3"] }

 [dependencies]
@@ -27,18 +22,16 @@ rustpython-parser-core = { workspace = true }

 itertools = { workspace = true }
 is-macro = { workspace = true }
-log = { workspace = true }
-malachite-bigint = { workspace = true, optional = true }
-num-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }
 num-traits = { workspace = true }
 unicode_names2 = { workspace = true }
 unic-emoji-char = "0.9.0"
 unic-ucd-ident = "0.9.0"

 lalrpop-util = { version = "0.20.0", default-features = false }
-phf = "0.11.1"
 rustc-hash = "1.1.0"
 serde = { version = "1.0.133", optional = true, default-features = false, features = ["derive"] }
+static_assertions = "1.1.0"

 [dev-dependencies]
 insta = { workspace = true }
diff --git a/parser/build.rs b/parser/build.rs
index e205c65f..a9bc3832 100644
--- a/parser/build.rs
+++ b/parser/build.rs
@@ -1,13 +1,10 @@
 use std::fmt::Write as _;
 use std::fs::File;
-use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::io::{BufRead, BufReader};
 use std::path::{Path, PathBuf};
 use tiny_keccak::{Hasher, Sha3};

 fn main() -> anyhow::Result<()> {
-    let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
-    gen_phf(&out_dir);
-
     const SOURCE: &str = "src/python.lalrpop";
     println!("cargo:rerun-if-changed={SOURCE}");

@@ -16,6 +13,7 @@ fn main() -> anyhow::Result<()> {

     #[cfg(feature = "lalrpop")]
     {
+        let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
         target = out_dir.join("src/python.rs");
     }
     #[cfg(not(feature = "lalrpop"))]
@@ -113,55 +111,3 @@ fn sha_equal(expected_sha3_str: &str, actual_sha3: &[u8; 32]) -> bool {
     }
     *actual_sha3 == expected_sha3
 }
-
-fn gen_phf(out_dir: &Path) {
-    let mut kwds = phf_codegen::Map::new();
-    let kwds = kwds
-        // Alphabetical keywords:
-        .entry("...", "Tok::Ellipsis")
-        .entry("False", "Tok::False")
-        .entry("None", "Tok::None")
-        .entry("True", "Tok::True")
-        // more so "standard" keywords
-        .entry("and", "Tok::And")
-        .entry("as", "Tok::As")
-        .entry("assert", "Tok::Assert")
-        .entry("async", "Tok::Async")
-        .entry("await", "Tok::Await")
-        .entry("break", "Tok::Break")
-        .entry("case", "Tok::Case")
-        .entry("class", "Tok::Class")
-        .entry("continue", "Tok::Continue")
-        .entry("def", "Tok::Def")
-        .entry("del", "Tok::Del")
-        .entry("elif", "Tok::Elif")
-        .entry("else", "Tok::Else")
-        .entry("except", "Tok::Except")
-        .entry("finally", "Tok::Finally")
-        .entry("for", "Tok::For")
-        .entry("from", "Tok::From")
-        .entry("global", "Tok::Global")
-        .entry("if", "Tok::If")
-        .entry("import", "Tok::Import")
-        .entry("in", "Tok::In")
-        .entry("is", "Tok::Is")
-        .entry("lambda", "Tok::Lambda")
-        .entry("match", "Tok::Match")
-        .entry("nonlocal", "Tok::Nonlocal")
-        .entry("not", "Tok::Not")
-        .entry("or", "Tok::Or")
-        .entry("pass", "Tok::Pass")
-        .entry("raise", "Tok::Raise")
-        .entry("return", "Tok::Return")
-        .entry("try", "Tok::Try")
-        .entry("type", "Tok::Type")
-        .entry("while", "Tok::While")
-        .entry("with", "Tok::With")
-        .entry("yield", "Tok::Yield")
-        .build();
-    writeln!(
-        BufWriter::new(File::create(out_dir.join("keywords.rs")).unwrap()),
-        "{kwds}",
-    )
-    .unwrap();
-}
diff --git a/parser/src/function.rs b/parser/src/function.rs
index 67749ea3..1f8215ec 100644
--- a/parser/src/function.rs
+++ b/parser/src/function.rs
@@ -10,8 +10,8 @@ use rustc_hash::FxHashSet;
 use rustpython_ast::Ranged;

 pub(crate) struct ArgumentList {
-    pub args: Vec<ast::Expr>,
-    pub keywords: Vec<ast::Keyword>,
+    pub(crate) args: Vec<ast::Expr>,
+    pub(crate) keywords: Vec<ast::Keyword>,
 }

 // Perform validation of function/lambda arguments in a function definition.
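With `gen_phf` gone, the build script no longer emits a perfect-hash keyword table into `OUT_DIR`, so the `phf`/`phf_codegen` dependencies can be dropped; keyword recognition moves into a plain `match` inside the lexer's `lex_identifier` (visible in the lexer.rs hunks further down). A sketch of the two lookup styles side by side — the old `KEYWORDS` static is quoted from this diff, while the `as_keyword` helper is hypothetical, standing in for the inline table:

```rust
// Old style: a phf::Map generated at build time and included from OUT_DIR.
//     pub static KEYWORDS: phf::Map<&'static str, Tok> =
//         include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
//     let tok = KEYWORDS.get(&name).cloned();

// New style: an ordinary match; rustc compiles string matches into
// comparably efficient code with no build-time codegen step.
fn as_keyword(name: &str) -> Option<Tok> {
    Some(match name {
        "False" => Tok::False,
        "None" => Tok::None,
        "True" => Tok::True,
        "def" => Tok::Def,
        // ... remaining keywords elided; the full table is in lex_identifier ...
        _ => return None,
    })
}
```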
diff --git a/parser/src/gen/parse.rs b/parser/src/gen/parse.rs
index fafec6a1..6c659c1b 100644
--- a/parser/src/gen/parse.rs
+++ b/parser/src/gen/parse.rs
@@ -1,12 +1,8 @@
 // This file was originally generated from asdl by a python script, but we now edit it manually

 impl Parse for ast::StmtFunctionDef {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -24,12 +20,7 @@
 }

 impl Parse for ast::StmtAsyncFunctionDef {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -47,12 +38,7 @@
 }

 impl Parse for ast::StmtClassDef {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -70,12 +56,7 @@
 }

 impl Parse for ast::StmtReturn {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -93,12 +74,7 @@
 }

 impl Parse for ast::StmtDelete {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -116,12 +92,7 @@
 }

 impl Parse for ast::StmtAssign {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -139,12 +110,7 @@
 }

 impl Parse for ast::StmtTypeAlias {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -162,12 +128,7 @@
 }

 impl Parse for ast::StmtAugAssign {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -185,12 +146,7 @@
 }

 impl Parse for ast::StmtAnnAssign {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -208,12 +164,7 @@
 }

 impl Parse for ast::StmtFor {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -231,12 +182,7 @@
 }

 impl Parse for ast::StmtAsyncFor {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -254,12 +200,7 @@
 }

 impl Parse for ast::StmtWhile {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -277,12 +218,7 @@
 }

 impl Parse for ast::StmtIf {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -300,12 +236,7 @@
 }

 impl Parse for ast::StmtWith {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -323,12 +254,7 @@
 }

 impl Parse for ast::StmtAsyncWith {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -346,12 +272,7 @@
 }

 impl Parse for ast::StmtMatch {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -369,12 +290,7 @@
 }

 impl Parse for ast::StmtRaise {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -392,12 +308,7 @@
 }

 impl Parse for ast::StmtTry {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -415,12 +326,7 @@
 }

 impl Parse for ast::StmtTryStar {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -438,12 +344,7 @@
 }

 impl Parse for ast::StmtAssert {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -461,12 +362,7 @@
 }

 impl Parse for ast::StmtImport {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -484,12 +380,7 @@
 }

 impl Parse for ast::StmtImportFrom {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -507,12 +398,7 @@
 }

 impl Parse for ast::StmtGlobal {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -530,12 +416,7 @@
 }

 impl Parse for ast::StmtNonlocal {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -553,12 +434,7 @@
 }

 impl Parse for ast::StmtExpr {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -576,12 +452,7 @@
 }

 impl Parse for ast::StmtPass {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -599,12 +470,7 @@
 }

 impl Parse for ast::StmtBreak {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -622,12 +488,7 @@
 }

 impl Parse for ast::StmtContinue {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Module;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -645,12 +506,7 @@
 }

 impl Parse for ast::ExprBoolOp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -668,12 +524,7 @@
 }

 impl Parse for ast::ExprNamedExpr {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -691,12 +542,7 @@
 }

 impl Parse for ast::ExprBinOp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -714,12 +560,7 @@
 }

 impl Parse for ast::ExprUnaryOp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -737,12 +578,7 @@
 }

 impl Parse for ast::ExprLambda {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -760,12 +596,7 @@
 }

 impl Parse for ast::ExprIfExp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -783,12 +614,7 @@
 }

 impl Parse for ast::ExprDict {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -806,12 +632,7 @@
 }

 impl Parse for ast::ExprSet {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -829,12 +650,7 @@
 }

 impl Parse for ast::ExprListComp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -852,12 +668,7 @@
 }

 impl Parse for ast::ExprSetComp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -875,12 +686,7 @@
 }

 impl Parse for ast::ExprDictComp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -898,12 +704,7 @@
 }

 impl Parse for ast::ExprGeneratorExp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -921,12 +722,7 @@
 }

 impl Parse for ast::ExprAwait {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -944,12 +740,7 @@
 }

 impl Parse for ast::ExprYield {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -967,12 +758,7 @@
 }

 impl Parse for ast::ExprYieldFrom {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -990,12 +776,7 @@
 }

 impl Parse for ast::ExprCompare {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1013,12 +794,7 @@
 }

 impl Parse for ast::ExprCall {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1036,12 +812,7 @@
 }

 impl Parse for ast::ExprFormattedValue {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1059,12 +830,7 @@
 }

 impl Parse for ast::ExprJoinedStr {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1082,12 +848,7 @@
 }

 impl Parse for ast::ExprConstant {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1105,12 +866,7 @@
 }

 impl Parse for ast::ExprAttribute {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1128,12 +884,7 @@
 }

 impl Parse for ast::ExprSubscript {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1151,12 +902,7 @@
 }

 impl Parse for ast::ExprStarred {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1174,12 +920,7 @@
 }

 impl Parse for ast::ExprName {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1197,12 +938,7 @@
 }

 impl Parse for ast::ExprList {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1220,12 +956,7 @@
 }

 impl Parse for ast::ExprTuple {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -1243,12 +974,7 @@
 }

 impl Parse for ast::ExprSlice {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
-    }
+    const MODE: Mode = Mode::Expression;
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
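Every impl above trades a hand-written `lex_starts_at` forwarding method for a single `const MODE` declaration. The `Parse` trait definition itself is not part of these hunks, but the shape it must take is implied: a shared default method can build the lexer from the associated constant. A sketch under that assumption:

```rust
// Assumed trait shape (not shown in this diff); the method names follow the
// impls above, everything else is reconstruction.
pub trait Parse: Sized {
    /// Lexing mode for this node type: Mode::Module for statements,
    /// Mode::Expression for expressions.
    const MODE: Mode;

    fn parse_starts_at(
        source: &str,
        source_path: &str,
        offset: TextSize,
    ) -> Result<Self, ParseError> {
        // One shared implementation replaces 55 per-node lex_starts_at methods.
        let lxr = lex_starts_at(source, Self::MODE, offset);
        Self::parse_tokens(lxr, source_path)
    }

    fn parse_tokens(
        lxr: impl IntoIterator<Item = LexResult>,
        source_path: &str,
    ) -> Result<Self, ParseError>;
}
```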
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
index 98b653c2..2df4d498 100644
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@@ -27,166 +27,47 @@
 //! ```
 //!
 //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
+
+use std::borrow::Cow;
+use std::iter::FusedIterator;
+use std::{char, cmp::Ordering, str::FromStr};
+
+use num_bigint::BigInt;
+use num_traits::{Num, Zero};
+use rustpython_ast::MagicKind;
+use unic_emoji_char::is_emoji_presentation;
+use unic_ucd_ident::{is_xid_continue, is_xid_start};
+
+use crate::lexer::cursor::{Cursor, EOF_CHAR};
+use crate::lexer::indentation::{Indentation, Indentations};
+use crate::text_size::TextLen;
 use crate::{
-    ast::bigint::BigInt,
-    ast::MagicKind,
     soft_keywords::SoftKeywordTransformer,
     string::FStringErrorType,
-    text_size::{TextLen, TextRange, TextSize},
+    text_size::{TextRange, TextSize},
     token::{StringKind, Tok},
     Mode,
 };
-use log::trace;
-use num_traits::{Num, Zero};
-use std::{char, cmp::Ordering, ops::Index, slice::SliceIndex, str::FromStr};
-use unic_emoji_char::is_emoji_presentation;
-use unic_ucd_ident::{is_xid_continue, is_xid_start};
-
-// Indentations are tracked by a stack of indentation levels. IndentationLevel keeps
-// track of the number of tabs and spaces at the current level.
-#[derive(Clone, Copy, PartialEq, Debug, Default)]
-struct IndentationLevel {
-    tabs: u32,
-    spaces: u32,
-}
-
-impl IndentationLevel {
-    fn compare_strict(
-        &self,
-        other: &IndentationLevel,
-        location: TextSize,
-    ) -> Result<Ordering, LexicalError> {
-        // We only know for sure that we're smaller or bigger if tabs
-        // and spaces both differ in the same direction. Otherwise we're
-        // dependent on the size of tabs.
-        match self.tabs.cmp(&other.tabs) {
-            Ordering::Less => {
-                if self.spaces <= other.spaces {
-                    Ok(Ordering::Less)
-                } else {
-                    Err(LexicalError {
-                        location,
-                        error: LexicalErrorType::TabError,
-                    })
-                }
-            }
-            Ordering::Greater => {
-                if self.spaces >= other.spaces {
-                    Ok(Ordering::Greater)
-                } else {
-                    Err(LexicalError {
-                        location,
-                        error: LexicalErrorType::TabError,
-                    })
-                }
-            }
-            Ordering::Equal => Ok(self.spaces.cmp(&other.spaces)),
-        }
-    }
-}
-
-// The indentations stack is used to keep track of the current indentation level.
-// Similar to the CPython implementation, the Indentations stack always has at
-// least one level which is never popped. See Reference 2.1.8.
-#[derive(Debug)]
-struct Indentations {
-    indent_stack: Vec<IndentationLevel>,
-}
-
-impl Indentations {
-    fn is_empty(&self) -> bool {
-        self.indent_stack.len() == 1
-    }
-
-    fn push(&mut self, indent: IndentationLevel) {
-        self.indent_stack.push(indent);
-    }
-
-    fn pop(&mut self) -> Option<IndentationLevel> {
-        if self.is_empty() {
-            return None;
-        }
-        self.indent_stack.pop()
-    }
-
-    fn current(&self) -> &IndentationLevel {
-        self.indent_stack
-            .last()
-            .expect("Indentations must have at least one level")
-    }
-}
-
-impl Default for Indentations {
-    fn default() -> Self {
-        Self {
-            indent_stack: vec![IndentationLevel::default()],
-        }
-    }
-}
-
-// A CharWindow is a sliding window over an iterator of chars. It is used to
-// allow for look-ahead when scanning tokens from the source code.
-struct CharWindow<T: Iterator<Item = char>, const N: usize> {
-    source: T,
-    window: [Option<char>; N],
-}
-
-impl<T, const N: usize> CharWindow<T, N>
-where
-    T: Iterator<Item = char>,
-{
-    fn new(source: T) -> Self {
-        Self {
-            source,
-            window: [None; N],
-        }
-    }
-
-    fn slide(&mut self) -> Option<char> {
-        self.window.rotate_left(1);
-        let next = self.source.next();
-        *self.window.last_mut().expect("never empty") = next;
-        next
-    }
-}
-
-impl<T, Idx, const N: usize> Index<Idx> for CharWindow<T, N>
-where
-    T: Iterator<Item = char>,
-    Idx: SliceIndex<[Option<char>]>,
-{
-    type Output = Idx::Output;
-
-    fn index(&self, index: Idx) -> &Self::Output {
-        &self.window[index]
-    }
-}
+mod cursor;
+mod indentation;

 /// A lexer for Python source code.
-pub struct Lexer<T: Iterator<Item = char>> {
+pub struct Lexer<'source> {
     // Contains the source code to be lexed.
-    window: CharWindow<T, 3>,
-    // Are we at the beginning of a line?
-    at_begin_of_line: bool,
+    cursor: Cursor<'source>,
+    source: &'source str,
+
+    state: State,
     // Amount of parenthesis.
-    nesting: usize,
+    nesting: u32,
     // Indentation levels.
     indentations: Indentations,
-    // Pending list of tokens to be returned.
-    pending: Vec<Spanned>,
-    // The current location.
-    location: TextSize,
-    // Is the last token an equal sign?
-    last_token_is_equal: bool,
+    pending_indentation: Option<Indentation>,
    // Lexer mode.
     mode: Mode,
 }

-// generated in build.rs, in gen_phf()
-/// A map of keywords to their tokens.
-pub static KEYWORDS: phf::Map<&'static str, Tok> =
-    include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
-
 /// Contains a Token along with its `range`.
 pub type Spanned = (Tok, TextRange);
 /// The result of lexing a token.
@@ -207,8 +88,43 @@ pub type LexResult = Result<Spanned, LexicalError>;
 /// }
 /// ```
 #[inline]
-pub fn lex(source: &str, mode: Mode) -> impl Iterator<Item = LexResult> + '_ {
-    lex_starts_at(source, mode, TextSize::default())
+pub fn lex(source: &str, mode: Mode) -> SoftKeywordTransformer<Lexer<'_>> {
+    SoftKeywordTransformer::new(Lexer::new(source, mode), mode)
+}
+
+pub struct LexStartsAtIterator<I> {
+    start_offset: TextSize,
+    inner: I,
+}
+
+impl<I> Iterator for LexStartsAtIterator<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    type Item = LexResult;
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        let result = match self.inner.next()? {
+            Ok((tok, range)) => Ok((tok, range + self.start_offset)),
+            Err(error) => Err(LexicalError {
+                location: error.location + self.start_offset,
+                ..error
+            }),
+        };
+
+        Some(result)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
+impl<I> FusedIterator for LexStartsAtIterator<I> where I: Iterator<Item = LexResult> + FusedIterator {}
+impl<I> ExactSizeIterator for LexStartsAtIterator<I> where
+    I: Iterator<Item = LexResult> + ExactSizeIterator
+{
+}

 /// Create a new lexer from a source string, starting at a given location.
@@ -217,194 +133,219 @@ pub fn lex_starts_at(
     source: &str,
     mode: Mode,
     start_offset: TextSize,
-) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-    SoftKeywordTransformer::new(Lexer::new(source.chars(), mode, start_offset), mode)
+) -> LexStartsAtIterator<SoftKeywordTransformer<Lexer<'_>>> {
+    LexStartsAtIterator {
+        start_offset,
+        inner: lex(source, mode),
+    }
 }

-impl<T> Lexer<T>
-where
-    T: Iterator<Item = char>,
-{
+impl<'source> Lexer<'source> {
     /// Create a new lexer from T and a starting location. You probably want to use
     /// [`lex`] instead.
-    pub fn new(input: T, mode: Mode, start: TextSize) -> Self {
+    pub fn new(input: &'source str, mode: Mode) -> Self {
         let mut lxr = Lexer {
-            at_begin_of_line: true,
+            state: State::AfterNewline,
             nesting: 0,
             indentations: Indentations::default(),
-            // Usually we have less than 5 tokens pending.
-            pending: Vec::with_capacity(5),
-            location: start,
-            window: CharWindow::new(input),
-            last_token_is_equal: false,
+            pending_indentation: None,
+
+            source: input,
+            cursor: Cursor::new(input),
             mode,
         };
-        // Fill the window.
-        lxr.window.slide();
-        lxr.window.slide();
-        lxr.window.slide();
         // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
         // spell-checker:ignore feff
-        if let Some('\u{feff}') = lxr.window[0] {
-            lxr.window.slide();
-            lxr.location += '\u{feff}'.text_len();
-        }
+        lxr.cursor.eat_char('\u{feff}');
+
         lxr
     }

     /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
-    fn lex_identifier(&mut self) -> LexResult {
+    fn lex_identifier(&mut self, first: char) -> Result<Tok, LexicalError> {
         // Detect potential string like rb'' b'' f'' u'' r''
-        match self.window[..3] {
-            [Some(c), Some('"' | '\''), ..] => {
-                if let Ok(kind) = StringKind::try_from(c) {
-                    return self.lex_string(kind);
+        match self.cursor.first() {
+            quote @ ('\'' | '"') => {
+                if let Ok(string_kind) = StringKind::try_from(first) {
+                    self.cursor.bump();
+                    return self.lex_string(string_kind, quote);
                 }
             }
-            [Some(c1), Some(c2), Some('"' | '\'')] => {
-                if let Ok(kind) = StringKind::try_from([c1, c2]) {
-                    return self.lex_string(kind);
+            second @ ('f' | 'F' | 'r' | 'R' | 'b' | 'B') if is_quote(self.cursor.second()) => {
+                self.cursor.bump();
+
+                if let Ok(string_kind) = StringKind::try_from([first, second]) {
+                    let quote = self.cursor.bump().unwrap();
+                    return self.lex_string(string_kind, quote);
                 }
             }
             _ => {}
-        };
-
-        let start_pos = self.get_pos();
-        let mut name = String::with_capacity(8);
-        while self.is_identifier_continuation() {
-            name.push(self.next_char().unwrap());
         }
-        let end_pos = self.get_pos();
-        if let Some(tok) = KEYWORDS.get(&name) {
-            Ok((tok.clone(), TextRange::new(start_pos, end_pos)))
-        } else {
-            Ok((Tok::Name { name }, TextRange::new(start_pos, end_pos)))
-        }
+
+        self.cursor.eat_while(is_identifier_continuation);
+
+        let text = self.token_text();
+
+        let keyword = match text {
+            "False" => Tok::False,
+            "None" => Tok::None,
+            "True" => Tok::True,
+            "and" => Tok::And,
+            "as" => Tok::As,
+            "assert" => Tok::Assert,
+            "async" => Tok::Async,
+            "await" => Tok::Await,
+            "break" => Tok::Break,
+            "case" => Tok::Case,
+            "class" => Tok::Class,
+            "continue" => Tok::Continue,
+            "def" => Tok::Def,
+            "del" => Tok::Del,
+            "elif" => Tok::Elif,
+            "else" => Tok::Else,
+            "except" => Tok::Except,
+            "finally" => Tok::Finally,
+            "for" => Tok::For,
+            "from" => Tok::From,
+            "global" => Tok::Global,
+            "if" => Tok::If,
+            "import" => Tok::Import,
+            "in" => Tok::In,
+            "is" => Tok::Is,
+            "lambda" => Tok::Lambda,
+            "match" => Tok::Match,
+            "nonlocal" => Tok::Nonlocal,
+            "not" => Tok::Not,
+            "or" => Tok::Or,
+            "pass" => Tok::Pass,
+            "raise" => Tok::Raise,
+            "return" => Tok::Return,
+            "try" => Tok::Try,
+            "type" => Tok::Type,
+            "while" => Tok::While,
+            "with" => Tok::With,
+            "yield" => Tok::Yield,
+            _ => {
+                return Ok(Tok::Name {
+                    name: text.to_string(),
+                })
+            }
+        };
+
+        Ok(keyword)
     }

     /// Numeric lexing. The feast can start!
-    fn lex_number(&mut self) -> LexResult {
-        let start_pos = self.get_pos();
-        match self.window[..2] {
-            [Some('0'), Some('x' | 'X')] => {
-                // Hex! (0xdeadbeef)
-                self.next_char();
-                self.next_char();
-                self.lex_number_radix(start_pos, 16)
-            }
-            [Some('0'), Some('o' | 'O')] => {
-                // Octal style! (0o377)
-                self.next_char();
-                self.next_char();
-                self.lex_number_radix(start_pos, 8)
-            }
-            [Some('0'), Some('b' | 'B')] => {
-                // Binary! (0b_1110_0101)
-                self.next_char();
-                self.next_char();
-                self.lex_number_radix(start_pos, 2)
+    fn lex_number(&mut self, first: char) -> Result<Tok, LexicalError> {
+        if first == '0' {
+            if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
+                self.lex_number_radix(Radix::Hex)
+            } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
+                self.lex_number_radix(Radix::Octal)
+            } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
+                self.lex_number_radix(Radix::Binary)
+            } else {
+                self.lex_decimal_number(first)
             }
-            _ => self.lex_normal_number(),
+        } else {
+            self.lex_decimal_number(first)
         }
     }

     /// Lex a hex/octal/decimal/binary number without a decimal point.
-    fn lex_number_radix(&mut self, start_pos: TextSize, radix: u32) -> LexResult {
-        let value_text = self.radix_run(radix);
-        let end_pos = self.get_pos();
-        let value = BigInt::from_str_radix(&value_text, radix).map_err(|e| LexicalError {
-            error: LexicalErrorType::OtherError(format!("{e:?}")),
-            location: start_pos,
-        })?;
-        Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos)))
+    fn lex_number_radix(&mut self, radix: Radix) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert!(matches!(
+            self.cursor.previous().to_ascii_lowercase(),
+            'x' | 'o' | 'b'
+        ));
+
+        let value_text = self.radix_run(None, radix);
+        let value =
+            BigInt::from_str_radix(&value_text, radix.as_u32()).map_err(|e| LexicalError {
+                error: LexicalErrorType::OtherError(format!("{e:?}")),
+                location: self.token_range().start(),
+            })?;
+        Ok(Tok::Int { value })
     }

     /// Lex a normal number, that is, no octal, hex or binary number.
-    fn lex_normal_number(&mut self) -> LexResult {
-        let start_pos = self.get_pos();
-        let start_is_zero = self.window[0] == Some('0');
-        // Normal number:
-        let mut value_text = self.radix_run(10);
-
-        // If float:
-        if self.window[0] == Some('.') || self.at_exponent() {
-            // Take '.':
-            if self.window[0] == Some('.') {
-                if self.window[1] == Some('_') {
-                    return Err(LexicalError {
-                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
-                        location: self.get_pos(),
-                    });
-                }
-                value_text.push(self.next_char().unwrap());
-                value_text.push_str(&self.radix_run(10));
+    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
+        let start_is_zero = first_digit_or_dot == '0';
+
+        let mut value_text = if first_digit_or_dot == '.' {
+            String::new()
+        } else {
+            self.radix_run(Some(first_digit_or_dot), Radix::Decimal)
+                .into_owned()
+        };
+
+        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
+            value_text.push('.');
+
+            if self.cursor.eat_char('_') {
+                return Err(LexicalError {
+                    error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
+                    location: self.offset() - TextSize::new(1),
+                });
             }
-            // 1e6 for example:
-            if let Some('e' | 'E') = self.window[0] {
-                if self.window[1] == Some('_') {
-                    return Err(LexicalError {
-                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
-                        location: self.get_pos(),
-                    });
-                }
-                value_text.push(self.next_char().unwrap().to_ascii_lowercase());
-                // Optional +/-
-                if matches!(self.window[0], Some('-' | '+')) {
-                    if self.window[1] == Some('_') {
-                        return Err(LexicalError {
-                            error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
-                            location: self.get_pos(),
-                        });
-                    }
-                    value_text.push(self.next_char().unwrap());
+            value_text.push_str(&self.radix_run(None, Radix::Decimal));
+            true
+        } else {
+            // Normal number:
+            false
+        };
+
+        let is_float = match self.cursor.rest().as_bytes() {
+            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
+                value_text.push('e');
+                self.cursor.bump(); // e | E
+
+                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
+                    value_text.push(sign);
                 }
-                value_text.push_str(&self.radix_run(10));
+                value_text.push_str(&self.radix_run(None, Radix::Decimal));
+
+                true
             }
+            _ => is_float,
+        };

+        if is_float {
+            // Improvement: Use `Cow` instead of pushing to value text
             let value = f64::from_str(&value_text).map_err(|_| LexicalError {
                 error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()),
-                location: self.get_pos(),
+                location: self.token_start(),
             })?;

             // Parse trailing 'j':
-            if matches!(self.window[0], Some('j' | 'J')) {
-                self.next_char();
-                let end_pos = self.get_pos();
-                Ok((
-                    Tok::Complex {
-                        real: 0.0,
-                        imag: value,
-                    },
-                    TextRange::new(start_pos, end_pos),
-                ))
+            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
+                Ok(Tok::Complex {
+                    real: 0.0,
+                    imag: value,
+                })
             } else {
-                let end_pos = self.get_pos();
-                Ok((Tok::Float { value }, TextRange::new(start_pos, end_pos)))
+                Ok(Tok::Float { value })
             }
         } else {
             // Parse trailing 'j':
-            if matches!(self.window[0], Some('j' | 'J')) {
-                self.next_char();
-                let end_pos = self.get_pos();
+            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                 let imag = f64::from_str(&value_text).unwrap();
-                Ok((
-                    Tok::Complex { real: 0.0, imag },
-                    TextRange::new(start_pos, end_pos),
-                ))
+                Ok(Tok::Complex { real: 0.0, imag })
             } else {
-                let end_pos = self.get_pos();
                 let value = value_text.parse::<BigInt>().unwrap();
                 if start_is_zero && !value.is_zero() {
                     // leading zeros in decimal integer literals are not permitted
                     return Err(LexicalError {
                         error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
-                        location: self.get_pos(),
+                        location: self.token_range().start(),
                     });
                 }
-                Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos)))
+                Ok(Tok::Int { value })
             }
         }
     }
@@ -412,105 +353,54 @@
     /// Consume a sequence of numbers with the given radix,
     /// the digits can be decorated with underscores
     /// like this: '1_2_3_4' == '1234'
-    fn radix_run(&mut self, radix: u32) -> String {
-        let mut value_text = String::new();
-
-        loop {
-            if let Some(c) = self.take_number(radix) {
-                value_text.push(c);
-            } else if self.window[0] == Some('_')
-                && Lexer::<T>::is_digit_of_radix(self.window[1], radix)
-            {
-                self.next_char();
-            } else {
-                break;
-            }
-        }
-        value_text
-    }
-
-    /// Consume a single character with the given radix.
-    fn take_number(&mut self, radix: u32) -> Option<char> {
-        let take_char = Lexer::<T>::is_digit_of_radix(self.window[0], radix);
-
-        take_char.then(|| self.next_char().unwrap())
-    }
+    fn radix_run(&mut self, first: Option<char>, radix: Radix) -> Cow<'source, str> {
+        let start = if let Some(first) = first {
+            self.offset() - first.text_len()
+        } else {
+            self.offset()
+        };
+        self.cursor.eat_while(|c| radix.is_digit(c));

-    /// Test if a digit is of a certain radix.
-    fn is_digit_of_radix(c: Option<char>, radix: u32) -> bool {
-        match radix {
-            2 => matches!(c, Some('0'..='1')),
-            8 => matches!(c, Some('0'..='7')),
-            10 => matches!(c, Some('0'..='9')),
-            16 => matches!(c, Some('0'..='9') | Some('a'..='f') | Some('A'..='F')),
-            other => unimplemented!("Radix not implemented: {}", other),
-        }
-    }
+        let number = &self.source[TextRange::new(start, self.offset())];

-    /// Test if we face '[eE][-+]?[0-9]+'
-    fn at_exponent(&self) -> bool {
-        match self.window[..2] {
-            [Some('e' | 'E'), Some('+' | '-')] => matches!(self.window[2], Some('0'..='9')),
-            [Some('e' | 'E'), Some('0'..='9')] => true,
-            _ => false,
-        }
-    }
+        // Number that contains `_` separators. Remove them from the parsed text.
+        if radix.is_digit(self.cursor.second()) && self.cursor.eat_char('_') {
+            let mut value_text = number.to_string();

-    /// Lex a single comment.
-    #[cfg(feature = "full-lexer")]
-    fn lex_comment(&mut self) -> LexResult {
-        let start_pos = self.get_pos();
-        let mut value = String::new();
-        loop {
-            match self.window[0] {
-                Some('\n' | '\r') | None => {
-                    let end_pos = self.get_pos();
-                    return Ok((Tok::Comment(value), TextRange::new(start_pos, end_pos)));
+            loop {
+                if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
+                    value_text.push(c);
+                } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
+                    // Skip over `_`
+                    self.cursor.bump();
+                } else {
+                    break;
                 }
-                Some(_) => {}
             }
-            value.push(self.next_char().unwrap());
+
+            Cow::Owned(value_text)
+        } else {
+            Cow::Borrowed(number)
         }
     }

-    #[cfg(feature = "full-lexer")]
-    fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> {
-        let comment = self.lex_comment()?;
-        self.emit(comment);
-        Ok(())
-    }
+    /// Lex a single comment.
+    fn lex_comment(&mut self) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert_eq!(self.cursor.previous(), '#');

-    /// Discard comment if full-lexer is not enabled.
-    #[cfg(not(feature = "full-lexer"))]
-    fn lex_comment(&mut self) {
-        loop {
-            match self.window[0] {
-                Some('\n' | '\r') | None => {
-                    return;
-                }
-                Some(_) => {}
-            }
-            self.next_char().unwrap();
-        }
-    }
+        self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));

-    #[cfg(not(feature = "full-lexer"))]
-    #[inline]
-    fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> {
-        self.lex_comment();
-        Ok(())
+        return Ok(Tok::Comment(self.token_text().to_string()));
     }
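The number-lexing hunks above hand `lex_number_radix` and `radix_run` a `Radix` value instead of a bare `u32`, replacing the removed `is_digit_of_radix` helper. `Radix` itself is defined outside these hunks; a plausible minimal shape, assumed from the calls used here (`as_u32`, `is_digit`):

```rust
// Assumed definition (not shown in this diff); only the two methods the
// hunks above call are sketched.
#[derive(Copy, Clone, Debug)]
enum Radix {
    Binary,
    Octal,
    Decimal,
    Hex,
}

impl Radix {
    const fn as_u32(self) -> u32 {
        match self {
            Radix::Binary => 2,
            Radix::Octal => 8,
            Radix::Decimal => 10,
            Radix::Hex => 16,
        }
    }

    // Is `c` a digit in this radix? char::is_digit accepts any base up to 36.
    fn is_digit(self, c: char) -> bool {
        c.is_digit(self.as_u32())
    }
}
```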

     /// Lex a single magic command.
-    fn lex_magic_command(&mut self, kind: MagicKind) -> (Tok, TextRange) {
-        let start_pos = self.get_pos();
-        for _ in 0..u32::from(kind.prefix_len()) {
-            self.next_char();
-        }
+    fn lex_magic_command(&mut self, kind: MagicKind) -> Tok {
         let mut value = String::new();
+
         loop {
-            match self.window[0] {
-                Some('\\') => {
+            match self.cursor.first() {
+                '\\' => {
                     // Only skip the line continuation if it is followed by a newline
                     // otherwise it is a normal backslash which is part of the magic command:
                     //
@@ -520,94 +410,78 @@ where
                     //        && ls -a | sed 's/^/\\ /'
                     //                          ^^
                     // Don't skip these backslashes
-                    if matches!(self.window[1], Some('\n' | '\r')) {
-                        self.next_char();
-                        self.next_char();
+                    if self.cursor.second() == '\r' {
+                        self.cursor.bump();
+                        self.cursor.bump();
+                        self.cursor.eat_char('\n');
                         continue;
+                    } else if self.cursor.second() == '\n' {
+                        self.cursor.bump();
+                        self.cursor.bump();
+                        continue;
+                    } else {
+                        self.cursor.bump();
+                        value.push('\\');
                     }
                 }
-                Some('\n' | '\r') | None => {
-                    let end_pos = self.get_pos();
-                    return (
-                        Tok::MagicCommand { kind, value },
-                        TextRange::new(start_pos, end_pos),
-                    );
+                '\n' | '\r' | EOF_CHAR => {
+                    return Tok::MagicCommand { kind, value };
+                }
+                c => {
+                    self.cursor.bump();
+                    value.push(c);
                 }
-                Some(_) => {}
-            }
-            value.push(self.next_char().unwrap());
-        }
-    }
-
-    fn lex_and_emit_magic_command(&mut self) {
-        let kind = match self.window[..2] {
-            [Some(c1), Some(c2)] => {
-                MagicKind::try_from([c1, c2]).map_or_else(|_| MagicKind::try_from(c1), Ok)
-            }
-            // When the escape character is the last character of the file.
-            [Some(c), None] => MagicKind::try_from(c),
-            _ => return,
-        };
-        if let Ok(kind) = kind {
-            let magic_command = self.lex_magic_command(kind);
-            self.emit(magic_command);
+            }
         }
     }

     /// Lex a string literal.
-    fn lex_string(&mut self, kind: StringKind) -> LexResult {
-        let start_pos = self.get_pos();
-        for _ in 0..u32::from(kind.prefix_len()) {
-            self.next_char();
-        }
-        let quote_char = self.next_char().unwrap();
-        let mut string_content = String::with_capacity(5);
+    fn lex_string(&mut self, kind: StringKind, quote: char) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert_eq!(self.cursor.previous(), quote);

         // If the next two characters are also the quote character, then we have a triple-quoted
         // string; consume those two characters and ensure that we require a triple-quote to close
-        let triple_quoted = if self.window[..2] == [Some(quote_char); 2] {
-            self.next_char();
-            self.next_char();
+        let triple_quoted = if self.cursor.first() == quote && self.cursor.second() == quote {
+            self.cursor.bump();
+            self.cursor.bump();
             true
         } else {
             false
         };

-        loop {
-            match self.next_char() {
-                Some(c) => {
-                    if c == '\\' {
-                        if let Some(next_c) = self.next_char() {
-                            string_content.push('\\');
-                            string_content.push(next_c);
-                            continue;
-                        }
-                    }
-                    if c == '\n' && !triple_quoted {
-                        return Err(LexicalError {
-                            error: LexicalErrorType::OtherError(
-                                "EOL while scanning string literal".to_owned(),
-                            ),
-                            location: self.get_pos(),
-                        });
-                    }
+        let value_start = self.offset();

-                    if c == quote_char {
-                        if triple_quoted {
-                            // Look ahead at the next two characters; if we have two more
-                            // quote_chars, it's the end of the string; consume the remaining
-                            // closing quotes and break the loop
-                            if self.window[..2] == [Some(quote_char); 2] {
-                                self.next_char();
-                                self.next_char();
-                                break;
-                            }
-                        } else {
-                            break;
+        let value_end = loop {
+            match self.cursor.bump() {
+                Some('\\') => {
+                    if self.cursor.eat_char('\r') {
+                        self.cursor.eat_char('\n');
+                    } else {
+                        self.cursor.bump();
+                    }
+                }
+                Some('\r' | '\n') if !triple_quoted => {
+                    return Err(LexicalError {
+                        error: LexicalErrorType::OtherError(
+                            "EOL while scanning string literal".to_owned(),
+                        ),
+                        location: self.offset() - TextSize::new(1),
+                    });
+                }
+                Some(c) if c == quote => {
+                    if triple_quoted {
+                        if self.cursor.first() == quote && self.cursor.second() == quote {
+                            self.cursor.bump();
+                            self.cursor.bump();
+                            break self.offset() - TextSize::new(3);
                         }
+                    } else {
+                        break self.offset() - TextSize::new(1);
                     }
-                    string_content.push(c);
                 }
+
+                Some(_) => {}
                 None => {
                     return Err(LexicalError {
                         error: if triple_quoted {
@@ -615,719 +489,437 @@ where
                             LexicalErrorType::Eof
                         } else {
                             LexicalErrorType::StringError
                         },
-                        location: self.get_pos(),
+                        location: self.offset(),
                     });
                 }
             }
-        }
-        let end_pos = self.get_pos();
+        };
+
         let tok = Tok::String {
-            value: string_content,
+            value: self.source[TextRange::new(value_start, value_end)].to_string(),
             kind,
             triple_quoted,
         };
-        Ok((tok, TextRange::new(start_pos, end_pos)))
-    }
-
-    // Checks if the character c is a valid starting character as described
-    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-    fn is_identifier_start(&self, c: char) -> bool {
-        match c {
-            'a'..='z' | 'A'..='Z' | '_' => true,
-            _ => is_xid_start(c),
-        }
-    }
-
-    // Checks if the character c is a valid continuation character as described
-    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-    fn is_identifier_continuation(&self) -> bool {
-        match self.window[0] {
-            Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true,
-            Some(c) => is_xid_continue(c),
-            _ => false,
-        }
+
+        Ok(tok)
     }

     // This is the main entry point. Call this function to retrieve the next token.
     // This function is used by the iterator implementation.
-    fn inner_next(&mut self) -> LexResult {
-        // top loop, keep on processing, until we have something pending.
-        while self.pending.is_empty() {
-            // Detect indentation levels
-            if self.at_begin_of_line {
-                self.handle_indentations()?;
-                if self.mode == Mode::Jupyter
-                    // https://github.com/ipython/ipython/blob/635815e8f1ded5b764d66cacc80bbe25e9e2587f/IPython/core/inputtransformer2.py#L345
-                    && matches!(self.window[0], Some('%' | '!' | '?' | '/' | ';' | ','))
-                {
-                    self.lex_and_emit_magic_command();
-                }
+    pub fn next_token(&mut self) -> LexResult {
+        // Return dedent tokens until the current indentation level matches the indentation of the next token.
+        if let Some(indentation) = self.pending_indentation.take() {
+            if let Ok(Ordering::Greater) = self.indentations.current().try_compare(&indentation) {
+                self.pending_indentation = Some(indentation);
+                self.indentations.pop();
+                return Ok((Tok::Dedent, TextRange::empty(self.offset())));
             }
-
-            self.consume_normal()?;
         }

-        Ok(self.pending.remove(0))
-    }
+        let mut indentation = Indentation::root();
+        self.cursor.start_token();

-    // Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
-    fn eat_indentation(&mut self) -> Result<IndentationLevel, LexicalError> {
-        // Determine indentation:
-        let mut spaces: u32 = 0;
-        let mut tabs: u32 = 0;
         loop {
-            match self.window[0] {
-                Some(' ') => {
-                    /*
-                    if tabs != 0 {
-                        // Don't allow spaces after tabs as part of indentation.
-                        // This is technically stricter than python3 but spaces after
-                        // tabs is even more insane than mixing spaces and tabs.
-                        return Some(Err(LexicalError {
-                            error: LexicalErrorType::OtherError("Spaces not allowed as part of indentation after tabs".to_owned()),
-                            location: self.get_pos(),
-                        }));
-                    }
-                    */
-                    self.next_char();
-                    spaces += 1;
+            match self.cursor.first() {
+                ' ' => {
+                    self.cursor.bump();
+                    indentation = indentation.add_space();
                 }
-                Some('\t') => {
-                    if spaces != 0 {
-                        // Don't allow tabs after spaces as part of indentation.
-                        // This is technically stricter than python3 but spaces before
-                        // tabs is even more insane than mixing spaces and tabs.
+                '\t' => {
+                    self.cursor.bump();
+                    indentation = indentation.add_tab();
+                }
+                '\\' => {
+                    self.cursor.bump();
+                    if self.cursor.eat_char('\r') {
+                        self.cursor.eat_char('\n');
+                    } else if self.cursor.is_eof() {
                         return Err(LexicalError {
-                            error: LexicalErrorType::TabsAfterSpaces,
-                            location: self.get_pos(),
+                            error: LexicalErrorType::Eof,
+                            location: self.token_start(),
+                        });
+                    } else if !self.cursor.eat_char('\n') {
+                        return Err(LexicalError {
+                            error: LexicalErrorType::LineContinuationError,
+                            location: self.token_start(),
                         });
                     }
-                    self.next_char();
-                    tabs += 1;
-                }
-                Some('#') => {
-                    self.lex_and_emit_comment()?;
-                    spaces = 0;
-                    tabs = 0;
+                    indentation = Indentation::root();
                 }
-                Some('\x0C') => {
-                    // Form feed character!
-                    // Reset indentation for the Emacs user.
-                    self.next_char();
-                    spaces = 0;
-                    tabs = 0;
-                }
-                Some('\n' | '\r') => {
-                    // Empty line!
-                    #[cfg(feature = "full-lexer")]
-                    let tok_start = self.get_pos();
-                    self.next_char();
-                    #[cfg(feature = "full-lexer")]
-                    let tok_end = self.get_pos();
-                    #[cfg(feature = "full-lexer")]
-                    self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end)));
-                    spaces = 0;
-                    tabs = 0;
-                }
-                None => {
-                    spaces = 0;
-                    tabs = 0;
-                    break;
-                }
-                _ => {
-                    self.at_begin_of_line = false;
-                    break;
+                // Form feed
+                '\x0C' => {
+                    self.cursor.bump();
+                    indentation = Indentation::root();
                 }
+                _ => break,
             }
         }

-        Ok(IndentationLevel { tabs, spaces })
-    }
-
-    // Push/pop indents/dedents based on the current indentation level.
-    fn handle_indentations(&mut self) -> Result<(), LexicalError> {
-        let indentation_level = self.eat_indentation()?;
+        if self.state.is_after_newline() {
+            // Handle indentation if this is a new, not all empty, logical line
+            if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
+                self.state = State::NonEmptyLogicalLine;

-        if self.nesting != 0 {
-            return Ok(());
-        }
+                if let Some(spanned) = self.handle_indentation(indentation)? {
+                    // Set to false so that we don't handle indentation on the next call.
- // Determine indent or dedent: - let current_indentation = self.indentations.current(); - let ordering = indentation_level.compare_strict(current_indentation, self.get_pos())?; - match ordering { - Ordering::Equal => { - // Same same - } - Ordering::Greater => { - // New indentation level: - self.indentations.push(indentation_level); - let tok_pos = self.get_pos(); - self.emit(( - Tok::Indent, - TextRange::new( - tok_pos - - TextSize::new(indentation_level.spaces) - - TextSize::new(indentation_level.tabs), - tok_pos, - ), - )); - } - Ordering::Less => { - // One or more dedentations - // Pop off other levels until col is found: - - loop { - let current_indentation = self.indentations.current(); - let ordering = - indentation_level.compare_strict(current_indentation, self.get_pos())?; - match ordering { - Ordering::Less => { - self.indentations.pop(); - let tok_pos = self.get_pos(); - self.emit((Tok::Dedent, TextRange::empty(tok_pos))); - } - Ordering::Equal => { - // We arrived at proper level of indentation. - break; - } - Ordering::Greater => { - return Err(LexicalError { - error: LexicalErrorType::IndentationError, - location: self.get_pos(), - }); - } - } + return Ok(spanned); } } } - Ok(()) - } + self.cursor.start_token(); + if let Some(c) = self.cursor.bump() { + if c.is_ascii() { + self.consume_ascii_character(c) + } else if is_unicode_identifier_start(c) { + let identifier = self.lex_identifier(c)?; + self.state = State::Other; + + Ok((identifier, self.token_range())) + } else if is_emoji_presentation(c) { + self.state = State::Other; - // Take a look at the next character, if any, and decide upon the next steps. - fn consume_normal(&mut self) -> Result<(), LexicalError> { - if let Some(c) = self.window[0] { - // Identifiers are the most common case. - if self.is_identifier_start(c) { - let identifier = self.lex_identifier()?; - self.emit(identifier); + Ok(( + Tok::Name { + name: c.to_string(), + }, + self.token_range(), + )) } else { - self.consume_character(c)?; + Err(LexicalError { + error: LexicalErrorType::UnrecognizedToken { tok: c }, + location: self.token_start(), + }) } } else { - // We reached end of file. - let tok_pos = self.get_pos(); + // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line, + // empty the dedent stack, and finally, return the EndOfFile token. + self.consume_end() + } + } - // First of all, we need all nestings to be finished. - if self.nesting > 0 { - return Err(LexicalError { - error: LexicalErrorType::Eof, - location: tok_pos, - }); - } + fn handle_indentation( + &mut self, + indentation: Indentation, + ) -> Result<Option<Spanned>, LexicalError> { + let token = match self.indentations.current().try_compare(&indentation) { + // Dedent + Ok(Ordering::Greater) => { + self.indentations.pop(); + self.pending_indentation = Some(indentation); - // Next, insert a trailing newline, if required. - if !self.at_begin_of_line { - self.at_begin_of_line = true; - self.emit((Tok::Newline, TextRange::empty(tok_pos))); + Some((Tok::Dedent, TextRange::empty(self.offset()))) } - // Next, flush the indentation stack to zero.
- while !self.indentations.is_empty() { - self.indentations.pop(); - self.emit((Tok::Dedent, TextRange::empty(tok_pos))); + Ok(Ordering::Equal) => None, + + // Indent + Ok(Ordering::Less) => { + self.indentations.push(indentation); + Some((Tok::Indent, self.token_range())) } + Err(_) => { + return Err(LexicalError { + error: LexicalErrorType::IndentationError, + location: self.offset(), + }); + } + }; - self.emit((Tok::EndOfFile, TextRange::empty(tok_pos))); + Ok(token) + } + + fn consume_end(&mut self) -> Result<Spanned, LexicalError> { + // We reached the end of the file. + // First of all, we need all nestings to be finished. + if self.nesting > 0 { + return Err(LexicalError { + error: LexicalErrorType::Eof, + location: self.offset(), + }); } - Ok(()) + // Next, insert a trailing newline, if required. + if !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + Ok((Tok::Newline, TextRange::empty(self.offset()))) + } + // Next, flush the indentation stack to zero. + else if self.indentations.pop().is_some() { + Ok((Tok::Dedent, TextRange::empty(self.offset()))) + } else { + Ok((Tok::EndOfFile, TextRange::empty(self.offset()))) + } } // Dispatch based on the given character. - fn consume_character(&mut self, c: char) -> Result<(), LexicalError> { - match c { - '0'..='9' => { - let number = self.lex_number()?; - self.emit(number); - } - '#' => { - self.lex_and_emit_comment()?; - } - '"' | '\'' => { - let string = self.lex_string(StringKind::String)?; - self.emit(string); - } + fn consume_ascii_character(&mut self, c: char) -> Result<Spanned, LexicalError> { + let token = match c { + c if is_ascii_identifier_start(c) => self.lex_identifier(c)?, + '0'..='9' => self.lex_number(c)?, + '#' => return self.lex_comment().map(|token| (token, self.token_range())), + '"' | '\'' => self.lex_string(StringKind::String, c)?, '=' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::EqEqual, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Equal, TextRange::new(tok_start, tok_end))); - } + if self.cursor.eat_char('=') { + Tok::EqEqual + } else { + self.state = State::AfterEqual; + return Ok((Tok::Equal, self.token_range())); } } '+' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::PlusEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::PlusEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Plus, TextRange::new(tok_start, tok_end))); + Tok::Plus } } '*' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::StarEqual, TextRange::new(tok_start, tok_end))); - } - Some('*') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::DoubleStarEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::DoubleStar, TextRange::new(tok_start, tok_end))); - } - } - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Star, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::StarEqual + } else if self.cursor.eat_char('*') { + if self.cursor.eat_char('=') { + Tok::DoubleStarEqual + } else { + Tok::DoubleStar } + } else { + Tok::Star } } + + c @
('%' | '!') + if self.mode == Mode::Jupyter + && self.state.is_after_equal() + && self.nesting == 0 => + { + // SAFETY: Safe because `c` has been matched against one of the possible magic command prefixes + self.lex_magic_command(MagicKind::try_from(c).unwrap()) + } + + c @ ('%' | '!' | '?' | '/' | ';' | ',') + if self.mode == Mode::Jupyter && self.state.is_new_logical_line() => + { + let kind = if let Ok(kind) = MagicKind::try_from([c, self.cursor.first()]) { + self.cursor.bump(); + kind + } else { + // SAFETY: Safe because `c` has been matched against one of the possible magic command prefixes + MagicKind::try_from(c).unwrap() + }; + + self.lex_magic_command(kind) + } '/' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::SlashEqual, TextRange::new(tok_start, tok_end))); - } - Some('/') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::DoubleSlashEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::DoubleSlash, TextRange::new(tok_start, tok_end))); - } - } - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Slash, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::SlashEqual + } else if self.cursor.eat_char('/') { + if self.cursor.eat_char('=') { + Tok::DoubleSlashEqual + } else { + Tok::DoubleSlash } + } else { + Tok::Slash } } '%' => { - if self.mode == Mode::Jupyter && self.nesting == 0 && self.last_token_is_equal { - self.lex_and_emit_magic_command(); + if self.cursor.eat_char('=') { + Tok::PercentEqual } else { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::PercentEqual, TextRange::new(tok_start, tok_end))); - } else { - let tok_end = self.get_pos(); - self.emit((Tok::Percent, TextRange::new(tok_start, tok_end))); - } + Tok::Percent } } '|' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::VbarEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::VbarEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Vbar, TextRange::new(tok_start, tok_end))); + Tok::Vbar } } '^' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::CircumflexEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::CircumflexEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::CircumFlex, TextRange::new(tok_start, tok_end))); + Tok::CircumFlex } } '&' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::AmperEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::AmperEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Amper, TextRange::new(tok_start, tok_end))); + Tok::Amper } } '-' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::MinusEqual, TextRange::new(tok_start, tok_end))); - } - Some('>') => { - self.next_char(); -
let tok_end = self.get_pos(); - self.emit((Tok::Rarrow, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Minus, TextRange::new(tok_start, tok_end))); - } + if self.cursor.eat_char('=') { + Tok::MinusEqual + } else if self.cursor.eat_char('>') { + Tok::Rarrow + } else { + Tok::Minus } } '@' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::AtEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::AtEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::At, TextRange::new(tok_start, tok_end))); + Tok::At } } '!' => { - if self.mode == Mode::Jupyter && self.nesting == 0 && self.last_token_is_equal { - self.lex_and_emit_magic_command(); + if self.cursor.eat_char('=') { + Tok::NotEqual } else { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::NotEqual, TextRange::new(tok_start, tok_end))); - } else { - return Err(LexicalError { - error: LexicalErrorType::UnrecognizedToken { tok: '!' }, - location: tok_start, - }); - } + return Err(LexicalError { + error: LexicalErrorType::UnrecognizedToken { tok: '!' }, + location: self.token_start(), + }); } } - '~' => { - self.eat_single_char(Tok::Tilde); - } + '~' => Tok::Tilde, '(' => { - self.eat_single_char(Tok::Lpar); self.nesting += 1; + Tok::Lpar } ')' => { - self.eat_single_char(Tok::Rpar); - if self.nesting == 0 { - return Err(LexicalError { - error: LexicalErrorType::NestingError, - location: self.get_pos(), - }); - } - self.nesting -= 1; + self.nesting = self.nesting.saturating_sub(1); + Tok::Rpar } '[' => { - self.eat_single_char(Tok::Lsqb); self.nesting += 1; + Tok::Lsqb } ']' => { - self.eat_single_char(Tok::Rsqb); - if self.nesting == 0 { - return Err(LexicalError { - error: LexicalErrorType::NestingError, - location: self.get_pos(), - }); - } - self.nesting -= 1; + self.nesting = self.nesting.saturating_sub(1); + Tok::Rsqb } '{' => { - self.eat_single_char(Tok::Lbrace); self.nesting += 1; + Tok::Lbrace } '}' => { - self.eat_single_char(Tok::Rbrace); - if self.nesting == 0 { - return Err(LexicalError { - error: LexicalErrorType::NestingError, - location: self.get_pos(), - }); - } - self.nesting -= 1; + self.nesting = self.nesting.saturating_sub(1); + Tok::Rbrace } ':' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::ColonEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::ColonEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Colon, TextRange::new(tok_start, tok_end))); + Tok::Colon } } - ';' => { - self.eat_single_char(Tok::Semi); - } + ';' => Tok::Semi, '<' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('<') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::LeftShiftEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::LeftShift, TextRange::new(tok_start, tok_end))); - } - } - } - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::LessEqual, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - 
self.emit((Tok::Less, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('<') { + if self.cursor.eat_char('=') { + Tok::LeftShiftEqual + } else { + Tok::LeftShift } + } else if self.cursor.eat_char('=') { + Tok::LessEqual + } else { + Tok::Less } } '>' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('>') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::RightShiftEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::RightShift, TextRange::new(tok_start, tok_end))); - } - } - } - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::GreaterEqual, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Greater, TextRange::new(tok_start, tok_end))); - } - } - } - ',' => { - self.eat_single_char(Tok::Comma); - } - '.' => { - if let Some('0'..='9') = self.window[1] { - let number = self.lex_number()?; - self.emit(number); - } else { - let tok_start = self.get_pos(); - self.next_char(); - if self.window[..2] == [Some('.'); 2] { - self.next_char(); - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::Ellipsis, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('>') { + if self.cursor.eat_char('=') { + Tok::RightShiftEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Dot, TextRange::new(tok_start, tok_end))); + Tok::RightShift } + } else if self.cursor.eat_char('=') { + Tok::GreaterEqual + } else { + Tok::Greater } } - '\n' | '\r' => { - let tok_start = self.get_pos(); - self.next_char(); - let tok_end = self.get_pos(); - - // Depending on the nesting level, we emit a logical or - // non-logical newline: - if self.nesting == 0 { - self.at_begin_of_line = true; - self.emit((Tok::Newline, TextRange::new(tok_start, tok_end))); + ',' => Tok::Comma, + '.' => { + if self.cursor.first().is_ascii_digit() { + self.lex_decimal_number('.')? + } else if self.cursor.first() == '.' && self.cursor.second() == '.' 
{ + self.cursor.bump(); + self.cursor.bump(); + Tok::Ellipsis } else { - #[cfg(feature = "full-lexer")] - self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end))); + Tok::Dot } } - ' ' | '\t' | '\x0C' => { - // Skip white-spaces - self.next_char(); - while let Some(' ' | '\t' | '\x0C') = self.window[0] { - self.next_char(); - } + '\n' => { + return Ok(( + if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + Tok::Newline + } else { + Tok::NonLogicalNewline + }, + self.token_range(), + )) } - '\\' => { - self.next_char(); - match self.window[0] { - Some('\n' | '\r') => { - self.next_char(); - } - _ => { - return Err(LexicalError { - error: LexicalErrorType::LineContinuationError, - location: self.get_pos(), - }); - } - } + '\r' => { + self.cursor.eat_char('\n'); - if self.window[0].is_none() { - return Err(LexicalError { - error: LexicalErrorType::Eof, - location: self.get_pos(), - }); - } + return Ok(( + if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + Tok::Newline + } else { + Tok::NonLogicalNewline + }, + self.token_range(), + )); } + _ => { - if is_emoji_presentation(c) { - let tok_start = self.get_pos(); - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::Name { - name: c.to_string(), - }, - TextRange::new(tok_start, tok_end), - )); - } else { - let c = self.next_char(); - return Err(LexicalError { - error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() }, - location: self.get_pos(), - }); - } + self.state = State::Other; + + return Err(LexicalError { + error: LexicalErrorType::UnrecognizedToken { tok: c }, + location: self.token_start(), + }); } - } + }; - Ok(()) - } - - // Used by single character tokens to advance the window and emit the correct token. - fn eat_single_char(&mut self, ty: Tok) { - let tok_start = self.get_pos(); - self.next_char().unwrap_or_else(|| unsafe { - // SAFETY: eat_single_char has been called only after a character has been read - // from the window, so the window is guaranteed to be non-empty. - std::hint::unreachable_unchecked() - }); - let tok_end = self.get_pos(); - self.emit((ty, TextRange::new(tok_start, tok_end))); - } - - // Helper function to go to the next character coming up. - fn next_char(&mut self) -> Option<char> { - let mut c = self.window[0]; - self.window.slide(); - match c { - Some('\r') => { - if self.window[0] == Some('\n') { - self.location += TextSize::from(1); - self.window.slide(); - } + self.state = State::Other; - self.location += TextSize::from(1); - c = Some('\n'); - } - #[allow(unused_variables)] - Some(c) => { - self.location += c.text_len(); - } - _ => {} - } - c + Ok((token, self.token_range())) + } + + #[inline] + fn token_range(&self) -> TextRange { + let end = self.offset(); + let len = self.cursor.token_len(); + + TextRange::at(end - len, len) } - // Helper function to retrieve the current position. - fn get_pos(&self) -> TextSize { - self.location + #[inline] + fn token_text(&self) -> &'source str { + &self.source[self.token_range()] } - // Helper function to emit a lexed token to the queue of tokens. - fn emit(&mut self, spanned: Spanned) { - self.last_token_is_equal = matches!(spanned.0, Tok::Equal); - self.pending.push(spanned); + #[inline] + fn offset(&self) -> TextSize { + TextSize::new(self.source.len() as u32) - self.cursor.text_len() + } + + #[inline] + fn token_start(&self) -> TextSize { + self.token_range().start() } } // Implement iterator pattern for Lexer.
// Calling the next element in the iterator will yield the next lexical // token. -impl<T> Iterator for Lexer<T> -where - T: Iterator<Item = char>, -{ +impl Iterator for Lexer<'_> { type Item = LexResult; fn next(&mut self) -> Option<Self::Item> { - let token = self.inner_next(); - trace!( - "Lex token {:?}, nesting={:?}, indent stack: {:?}", - token, - self.nesting, - self.indentations, - ); + let token = self.next_token(); match token { Ok((Tok::EndOfFile, _)) => None, @@ -1336,6 +928,8 @@ where } } +impl FusedIterator for Lexer<'_> {} + /// Represents an error that occurs during lexing and is /// returned by the `parse_*` functions and the iterator in the /// [lexer] implementation. @@ -1442,21 +1036,103 @@ impl std::fmt::Display for LexicalErrorType { } } +#[derive(Copy, Clone, Debug)] +enum State { + /// Lexer is right at the beginning of the file or after a `Newline` token. + AfterNewline, + + /// The lexer is at the start of a new logical line but **after** the indentation + NonEmptyLogicalLine, + + /// Lexer is right after an equal token + AfterEqual, + + /// Inside of a logical line + Other, +} + +impl State { + const fn is_after_newline(self) -> bool { + matches!(self, State::AfterNewline) + } + + const fn is_new_logical_line(self) -> bool { + matches!(self, State::AfterNewline | State::NonEmptyLogicalLine) + } + + const fn is_after_equal(self) -> bool { + matches!(self, State::AfterEqual) + } +} + +#[derive(Copy, Clone, Debug)] +enum Radix { + Binary, + Octal, + Decimal, + Hex, +} + +impl Radix { + const fn as_u32(self) -> u32 { + match self { + Radix::Binary => 2, + Radix::Octal => 8, + Radix::Decimal => 10, + Radix::Hex => 16, + } + } + + const fn is_digit(self, c: char) -> bool { + match self { + Radix::Binary => matches!(c, '0'..='1'), + Radix::Octal => matches!(c, '0'..='7'), + Radix::Decimal => c.is_ascii_digit(), + Radix::Hex => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), + } + } +} + +const fn is_quote(c: char) -> bool { + matches!(c, '\'' | '"') +} + +const fn is_ascii_identifier_start(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_') +} + +// Checks if the character c is a valid starting character as described +// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers +fn is_unicode_identifier_start(c: char) -> bool { + is_xid_start(c) +} + +// Checks if the character c is a valid continuation character as described +// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers +fn is_identifier_continuation(c: char) -> bool { + match c { + 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => true, + c => is_xid_continue(c), + } +} + #[cfg(test)] mod tests { + use num_bigint::BigInt; + use rustpython_ast::MagicKind; + use super::*; - use crate::ast::bigint::BigInt; const WINDOWS_EOL: &str = "\r\n"; const MAC_EOL: &str = "\r"; const UNIX_EOL: &str = "\n"; - pub fn lex_source(source: &str) -> Vec<Tok> { + pub(crate) fn lex_source(source: &str) -> Vec<Tok> { let lexer = lex(source, Mode::Module); lexer.map(|x| x.unwrap().0).collect() } - pub fn lex_jupyter_source(source: &str) -> Vec<Tok> { + pub(crate) fn lex_jupyter_source(source: &str) -> Vec<Tok> { let lexer = lex(source, Mode::Jupyter); lexer.map(|x| x.unwrap().0).collect() } @@ -1607,8 +1283,10 @@ mod tests { !!cd /Users/foo/Library/Application\ Support/ /foo 1 2 ,foo 1 2 -;foo 1 2" - .trim(); +;foo 1 2 +!ls +" + .trim(); let tokens = lex_jupyter_source(source); assert_eq!( tokens, @@ -1663,10 +1341,14 @@ mod tests { kind: MagicKind::Quote2, }, Tok::Newline, + Tok::MagicCommand { + value: "ls".to_string(), + kind:
MagicKind::Shell, + }, + Tok::Newline, ] ) } - #[test] fn test_jupyter_magic_indentation() { let source = r" @@ -1816,7 +1498,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!(r"99232 # {}", $eol); let tokens = lex_source(&source); @@ -1837,7 +1519,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("123 # Foo{}456", $eol); let tokens = lex_source(&source); @@ -1893,7 +1575,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{} return 99{}{}", $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1931,7 +1613,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{} if x:{}{} return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1972,7 +1654,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); @@ -2025,7 +1707,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = r"x = [ @@ -2088,7 +1770,6 @@ def f(arg=%timeit a = b): } #[test] - #[cfg(feature = "full-lexer")] fn test_non_logical_newline_in_string_continuation() { let source = r"( 'a' @@ -2118,7 +1799,6 @@ def f(arg=%timeit a = b): } #[test] - #[cfg(feature = "full-lexer")] fn test_logical_newline_line_comment() { let source = "#Hello\n#World\n"; let tokens = lex_source(source); @@ -2171,29 +1851,29 @@ def f(arg=%timeit a = b): ); } - macro_rules! test_string_continuation { - ($($name:ident: $eol:expr,)*) => { - $( - #[test] - fn $name() { - let source = format!("\"abc\\{}def\"", $eol); - let tokens = lex_source(&source); - assert_eq!( - tokens, - vec![ - str_tok("abc\\\ndef"), - Tok::Newline, - ] - ) - } - )* - } + fn assert_string_continuation_with_eol(eol: &str) { + let source = format!("\"abc\\{}def\"", eol); + let tokens = lex_source(&source); + + assert_eq!( + tokens, + vec![str_tok(&format!("abc\\{}def", eol)), Tok::Newline] + ) + } + + #[test] + fn test_string_continuation_windows_eol() { + assert_string_continuation_with_eol(WINDOWS_EOL); } - test_string_continuation! { - test_string_continuation_windows_eol: WINDOWS_EOL, - test_string_continuation_mac_eol: MAC_EOL, - test_string_continuation_unix_eol: UNIX_EOL, + #[test] + fn test_string_continuation_mac_eol() { + assert_string_continuation_with_eol(MAC_EOL); + } + + #[test] + fn test_string_continuation_unix_eol() { + assert_string_continuation_with_eol(UNIX_EOL); } #[test] @@ -2203,32 +1883,34 @@ def f(arg=%timeit a = b): assert_eq!(tokens, vec![str_tok(r"\N{EN SPACE}"), Tok::Newline]) } - macro_rules! 
test_triple_quoted { - ($($name:ident: $eol:expr,)*) => { - $( - #[test] - fn $name() { - let source = format!("\"\"\"{0} test string{0} \"\"\"", $eol); - let tokens = lex_source(&source); - assert_eq!( - tokens, - vec![ - Tok::String { - value: "\n test string\n ".to_owned(), - kind: StringKind::String, - triple_quoted: true, - }, - Tok::Newline, - ] - ) - } - )* - } + fn assert_triple_quoted(eol: &str) { + let source = format!("\"\"\"{0} test string{0} \"\"\"", eol); + let tokens = lex_source(&source); + assert_eq!( + tokens, + vec![ + Tok::String { + value: format!("{0} test string{0} ", eol), + kind: StringKind::String, + triple_quoted: true, + }, + Tok::Newline, + ] + ) } - test_triple_quoted! { - test_triple_quoted_windows_eol: WINDOWS_EOL, - test_triple_quoted_mac_eol: MAC_EOL, - test_triple_quoted_unix_eol: UNIX_EOL, + #[test] + fn triple_quoted_windows_eol() { + assert_triple_quoted(WINDOWS_EOL); + } + + #[test] + fn triple_quoted_unix_eol() { + assert_triple_quoted(UNIX_EOL); + } + + #[test] + fn triple_quoted_macos_eol() { + assert_triple_quoted(MAC_EOL); } } diff --git a/parser/src/lexer/cursor.rs b/parser/src/lexer/cursor.rs new file mode 100644 index 00000000..ff1f3b74 --- /dev/null +++ b/parser/src/lexer/cursor.rs @@ -0,0 +1,107 @@ +use crate::text_size::{TextLen, TextSize}; +use std::str::Chars; + +pub(crate) const EOF_CHAR: char = '\0'; + +#[derive(Clone, Debug)] +pub(super) struct Cursor<'a> { + chars: Chars<'a>, + source_length: TextSize, + #[cfg(debug_assertions)] + prev_char: char, +} + +impl<'a> Cursor<'a> { + pub(crate) fn new(source: &'a str) -> Self { + Self { + source_length: source.text_len(), + chars: source.chars(), + #[cfg(debug_assertions)] + prev_char: EOF_CHAR, + } + } + + /// Returns the previous character. Useful for debug assertions. + #[cfg(debug_assertions)] + pub(super) const fn previous(&self) -> char { + self.prev_char + } + + /// Peeks the next character from the input stream without consuming it. + /// Returns [EOF_CHAR] if the cursor is at the end of the file. + pub(super) fn first(&self) -> char { + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Peeks the second character from the input stream without consuming it. + /// Returns [EOF_CHAR] if the position is past the end of the file. + pub(super) fn second(&self) -> char { + let mut chars = self.chars.clone(); + chars.next(); + chars.next().unwrap_or(EOF_CHAR) + } + + /// Returns the remaining text to lex. + pub(super) fn rest(&self) -> &'a str { + self.chars.as_str() + } + + // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
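+ // Hence the remaining `chars` length always fits in a `u32`, and the `as u32` cast below cannot truncate.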
+ #[allow(clippy::cast_possible_truncation)] + pub(super) fn text_len(&self) -> TextSize { + TextSize::new(self.chars.as_str().len() as u32) + } + + pub(super) fn token_len(&self) -> TextSize { + self.source_length - self.text_len() + } + + pub(super) fn start_token(&mut self) { + self.source_length = self.text_len() + } + + pub(super) fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + + /// Consumes the next character. + pub(super) fn bump(&mut self) -> Option<char> { + let prev = self.chars.next()?; + + #[cfg(debug_assertions)] + { + self.prev_char = prev; + } + + Some(prev) + } + + pub(super) fn eat_char(&mut self, c: char) -> bool { + if self.first() == c { + self.bump(); + true + } else { + false + } + } + + pub(super) fn eat_if<F>(&mut self, mut predicate: F) -> Option<char> + where + F: FnMut(char) -> bool, + { + if predicate(self.first()) && !self.is_eof() { + self.bump() + } else { + None + } + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // We tried writing an optimized version of this for e.g. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. + while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } +} diff --git a/parser/src/lexer/indentation.rs b/parser/src/lexer/indentation.rs new file mode 100644 index 00000000..31732e21 --- /dev/null +++ b/parser/src/lexer/indentation.rs @@ -0,0 +1,129 @@ +use static_assertions::assert_eq_size; +use std::cmp::Ordering; +use std::fmt::Debug; + +/// The column index of an indentation. +/// +/// A space increments the column by one. A tab advances the column to the next multiple of `TAB_SIZE`: +/// by two if the column is even, by one if it is odd (for a tab size of 2). +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)] +pub(super) struct Column(u32); + +impl Column { + pub(super) const fn new(column: u32) -> Self { + Self(column) + } +} + +/// The number of characters in an indentation. Each character accounts for 1. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)] +pub(super) struct Character(u32); + +impl Character { + pub(super) const fn new(characters: u32) -> Self { + Self(characters) + } +} + +/// The [Indentation](https://docs.python.org/3/reference/lexical_analysis.html#indentation) of a logical line. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Default)] +pub(super) struct Indentation { + column: Column, + character: Character, +} + +impl Indentation { + const TAB_SIZE: u32 = 2; + + pub(super) const fn root() -> Self { + Self { + column: Column::new(0), + character: Character::new(0), + } + } + + #[cfg(test)] + pub(super) const fn new(column: Column, character: Character) -> Self { + Self { character, column } + } + + #[must_use] + pub(super) fn add_space(self) -> Self { + Self { + character: Character(self.character.0 + 1), + column: Column(self.column.0 + 1), + } + } + + #[must_use] + pub(super) fn add_tab(self) -> Self { + Self { + character: Character(self.character.0 + 1), + // Compute the column index: + // * Adds `TAB_SIZE` if `column` is a multiple of `TAB_SIZE` + // * Rounds `column` up to the next multiple of `TAB_SIZE` otherwise.
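+ // For example, with `TAB_SIZE == 2`: column 0 -> 2, column 1 -> 2, column 2 -> 4, column 3 -> 4.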
+ // https://github.com/python/cpython/blob/2cf99026d6320f38937257da1ab014fc873a11a6/Parser/tokenizer.c#L1818 + column: Column((self.column.0 / Self::TAB_SIZE + 1) * Self::TAB_SIZE), + } + } + + pub(super) fn try_compare( + &self, + other: &Indentation, + ) -> Result<Ordering, UnexpectedIndentation> { + let column_ordering = self.column.cmp(&other.column); + let character_ordering = self.character.cmp(&other.character); + + if column_ordering == character_ordering { + Ok(column_ordering) + } else { + Err(UnexpectedIndentation) + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq)] +pub(super) struct UnexpectedIndentation; + +// The indentations stack is used to keep track of the current indentation level +// [See Indentation](https://docs.python.org/3/reference/lexical_analysis.html#indentation). +#[derive(Debug, Clone, Default)] +pub(super) struct Indentations { + stack: Vec<Indentation>, +} + +impl Indentations { + pub(super) fn push(&mut self, indent: Indentation) { + debug_assert_eq!(self.current().try_compare(&indent), Ok(Ordering::Less)); + + self.stack.push(indent); + } + + pub(super) fn pop(&mut self) -> Option<Indentation> { + self.stack.pop() + } + + pub(super) fn current(&self) -> &Indentation { + static ROOT: Indentation = Indentation::root(); + self.stack.last().unwrap_or(&ROOT) + } +} + +assert_eq_size!(Indentation, u64); + +#[cfg(test)] +mod tests { + use super::{Character, Column, Indentation}; + use std::cmp::Ordering; + + #[test] + fn indentation_try_compare() { + let tab = Indentation::new(Column::new(8), Character::new(1)); + + assert_eq!(tab.try_compare(&tab), Ok(Ordering::Equal)); + + let two_tabs = Indentation::new(Column::new(16), Character::new(2)); + assert_eq!(two_tabs.try_compare(&tab), Ok(Ordering::Greater)); + assert_eq!(tab.try_compare(&two_tabs), Ok(Ordering::Less)); + } +} diff --git a/parser/src/parser.rs b/parser/src/parser.rs index c2dac39e..b2675f96 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -12,6 +12,12 @@ //! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree //! [`Mode`]: crate::mode +use std::iter; + +use itertools::Itertools; +pub(super) use lalrpop_util::ParseError as LalrpopError; + +use crate::lexer::{lex, lex_starts_at}; use crate::{ ast::{self, Ranged}, lexer::{self, LexResult, LexicalError, LexicalErrorType}, @@ -20,11 +26,6 @@ use crate::{ token::Tok, Mode, }; -use itertools::Itertools; -use std::iter; - -use crate::{lexer::Lexer, soft_keywords::SoftKeywordTransformer}; -pub(super) use lalrpop_util::ParseError as LalrpopError; /// Parse Python code string to implementor's type. /// @@ -56,27 +57,28 @@ pub trait Parse where Self: Sized, { + const MODE: Mode; + fn parse(source: &str, source_path: &str) -> Result<Self, ParseError> { - Self::parse_starts_at(source, source_path, TextSize::default()) + let tokens = lex(source, Self::MODE); + + Self::parse_tokens(tokens, source_path) } + fn parse_without_path(source: &str) -> Result<Self, ParseError> { Self::parse(source, "") } + fn parse_starts_at( source: &str, source_path: &str, offset: TextSize, ) -> Result<Self, ParseError> { - let lxr = Self::lex_starts_at(source, offset); - #[cfg(feature = "full-lexer")] - let lxr = - lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { ..
} | Tok::NonLogicalNewline)); - Self::parse_tokens(lxr, source_path) + let tokens = lex_starts_at(source, Self::MODE, offset); + + Self::parse_tokens(tokens, source_path) } - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>>; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, @@ -84,17 +86,13 @@ where } impl Parse for ast::ModModule { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - lexer::lex_starts_at(source, Mode::Module, offset) - } + const MODE: Mode = Mode::Module; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, ) -> Result<Self, ParseError> { - match parse_filtered_tokens(lxr, Mode::Module, source_path)? { + match parse_tokens(lxr, Mode::Module, source_path)? { ast::Mod::Module(m) => Ok(m), _ => unreachable!("Mode::Module doesn't return other variant"), } @@ -102,17 +100,13 @@ impl Parse for ast::ModModule { } impl Parse for ast::ModExpression { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - lexer::lex_starts_at(source, Mode::Expression, offset) - } + const MODE: Mode = Mode::Expression; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, ) -> Result<Self, ParseError> { - match parse_filtered_tokens(lxr, Mode::Expression, source_path)? { + match parse_tokens(lxr, Mode::Expression, source_path)? { ast::Mod::Expression(m) => Ok(m), _ => unreachable!("Mode::Expression doesn't return other variant"), } @@ -120,17 +114,12 @@ impl Parse for ast::ModExpression { } impl Parse for ast::ModInteractive { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - lexer::lex_starts_at(source, Mode::Interactive, offset) - } + const MODE: Mode = Mode::Interactive; fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, ) -> Result<Self, ParseError> { - match parse_filtered_tokens(lxr, Mode::Interactive, source_path)? + match parse_tokens(lxr, Mode::Interactive, source_path)?
{ ast::Mod::Interactive(m) => Ok(m), _ => unreachable!("Mode::Interactive doesn't return other variant"), } @@ -138,12 +127,8 @@ impl Parse for ast::ModInteractive { } impl Parse for ast::Suite { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - ast::ModModule::lex_starts_at(source, offset) - } + const MODE: Mode = Mode::Module; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, @@ -153,12 +138,8 @@ impl Parse for ast::Suite { } impl Parse for ast::Stmt { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - ast::ModModule::lex_starts_at(source, offset) - } + const MODE: Mode = Mode::Module; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, @@ -186,12 +167,8 @@ impl Parse for ast::Stmt { } impl Parse for ast::Expr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - ast::ModExpression::lex_starts_at(source, offset) - } + const MODE: Mode = Mode::Expression; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, @@ -201,12 +178,8 @@ impl Parse for ast::Expr { } impl Parse for ast::Identifier { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - ast::Expr::lex_starts_at(source, offset) - } + const MODE: Mode = Mode::Expression; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, @@ -227,12 +200,8 @@ impl Parse for ast::Identifier { } impl Parse for ast::Constant { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> { - ast::Expr::lex_starts_at(source, offset) - } + const MODE: Mode = Mode::Expression; + fn parse_tokens( lxr: impl IntoIterator<Item = LexResult>, source_path: &str, @@ -426,10 +395,12 @@ pub fn parse_tokens( source_path: &str, ) -> Result<ast::Mod, ParseError> { let lxr = lxr.into_iter(); - #[cfg(feature = "full-lexer")] - let lxr = - lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)); - parse_filtered_tokens(lxr, mode, source_path) + + parse_filtered_tokens( + lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)), + mode, + source_path, + ) } fn parse_filtered_tokens( @@ -571,8 +542,10 @@ include!("gen/parse.rs"); #[cfg(test)] mod tests { - use super::*; use crate::{ast, Parse}; + use insta::assert_debug_snapshot; + + use super::*; #[test] fn test_parse_empty() { @@ -656,7 +629,6 @@ class Foo(A, B): } #[test] - #[cfg(feature = "all-nodes-with-ranges")] fn test_parse_class_generic_types() { let source = "\ # TypeVar @@ -687,7 +659,6 @@ class Foo[X, Y: str, *U, **P](): insta::assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()); } #[test] - #[cfg(feature = "all-nodes-with-ranges")] fn test_parse_function_definition() { let source = "\ def func(a): @@ -985,6 +956,57 @@ x = type = 1 insta::assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()); } + #[test] + fn numeric_literals() { + let source = r#"x = 123456789 +x = 123456 +x = .1 +x = 1. +x = 1E+1 +x = 1E-1 +x = 1.000_000_01 +x = 123456789.123456789 +x = 123456789.123456789E123456789 +x = 123456789E123456789 +x = 123456789J +x = 123456789.123456789J +x = 0XB1ACC +x = 0B1011 +x = 0O777 +x = 0.000000006 +x = 10000 +x = 133333 +"#; + + insta::assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()); + } + + #[test] + fn numeric_literals_attribute_access() { + let source = r#"x = .1.is_integer() +x = 1. 
.imag +x = 1E+1.imag +x = 1E-1.real +x = 123456789.123456789.hex() +x = 123456789.123456789E123456789 .real +x = 123456789E123456789 .conjugate() +x = 123456789J.real +x = 123456789.123456789J.__add__(0b1011.bit_length()) +x = 0XB1ACC.conjugate() +x = 0B1011 .conjugate() +x = 0O777 .real +x = 0.000000006 .hex() +x = -100.0000J + +if 10 .real: + ... + +y = 100[no] +y = 100(no) +"#; + assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()) + } + #[test] fn test_match_as_identifier() { let source = r#"\ diff --git a/parser/src/python.lalrpop b/parser/src/python.lalrpop index 7b605d37..68796dcc 100644 --- a/parser/src/python.lalrpop +++ b/parser/src/python.lalrpop @@ -3,8 +3,9 @@ // See also: file:///usr/share/doc/python/html/reference/compound_stmts.html#function-definitions // See also: https://greentreesnakes.readthedocs.io/en/latest/nodes.html#keyword +use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt, MagicKind}, + ast::{self as ast, Ranged, MagicKind}, Mode, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, @@ -19,7 +20,7 @@ grammar(mode: Mode); // This is a hack to reduce the amount of lalrpop tables generated: // For each public entry point, a full parse table is generated. // By having only a single pub function, we reduce this to one. -pub Top: ast::Mod = { +pub(crate) Top: ast::Mod = { StartModule => ast::ModModule { body, type_ignores: vec![], range: (start..end).into() }.into(), StartInteractive => ast::ModInteractive { body, range: (start..end).into() }.into(), StartExpression ("\n")* => ast::ModExpression { body: Box::new(body), range: (start..end).into() }.into() diff --git a/parser/src/python.rs b/parser/src/python.rs index da673857..b0906d1a 100644 --- a/parser/src/python.rs +++ b/parser/src/python.rs @@ -1,7 +1,8 @@ // auto-generated: "lalrpop 0.20.0" -// sha3: fa57e02e9e5bfceb811748310e8d17940d15b6c6e2d6191d9ae71b2e4dc435d8 +// sha3: 263bb187f0a83dfe2a024fa0eed0ad8cb855da5991584b5040fa7d870fdb84af +use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt, MagicKind}, + ast::{self as ast, Ranged, MagicKind}, Mode, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, @@ -21,8 +22,9 @@ extern crate alloc; #[allow(non_snake_case, non_camel_case_types, unused_mut, unused_variables, unused_imports, unused_parens, clippy::all)] mod __parse__Top { + use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt, MagicKind}, + ast::{self as ast, Ranged, MagicKind}, Mode, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, @@ -11488,19 +11490,19 @@ mod __parse__Top { _ => panic!("invalid reduction index {}", __reduce_index) } } - pub struct TopParser { + pub(crate) struct TopParser { _priv: (), } impl TopParser { - pub fn new() -> TopParser { + pub(crate) fn new() -> TopParser { TopParser { _priv: (), } } #[allow(dead_code)] - pub fn parse< + pub(crate) fn parse< __TOKEN: __ToTriple<>, __TOKENS: IntoIterator<Item = __TOKEN>, >( @@ -30744,7 +30746,7 @@ mod __parse__Top { (3, 276) } } -pub use self::__parse__Top::TopParser; +pub(crate) use self::__parse__Top::TopParser; #[allow(unused_variables)] #[allow(clippy::too_many_arguments)] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap new file mode 100644 
index 00000000..3ad53568 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap @@ -0,0 +1,440 @@ +--- +source: parser/src/parser.rs +expression: "ast::Suite::parse(source, \"\").unwrap()" +--- +[ + Assign( + StmtAssign { + range: 0..13, + targets: [ + Name( + ExprName { + range: 0..1, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 4..13, + value: Int( + 123456789, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 14..24, + targets: [ + Name( + ExprName { + range: 14..15, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 18..24, + value: Int( + 123456, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 25..31, + targets: [ + Name( + ExprName { + range: 25..26, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 29..31, + value: Float( + 0.1, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 32..38, + targets: [ + Name( + ExprName { + range: 32..33, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 36..38, + value: Float( + 1.0, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 39..47, + targets: [ + Name( + ExprName { + range: 39..40, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 43..47, + value: Float( + 10.0, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 48..56, + targets: [ + Name( + ExprName { + range: 48..49, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 52..56, + value: Float( + 0.1, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 57..73, + targets: [ + Name( + ExprName { + range: 57..58, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 61..73, + value: Float( + 1.00000001, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 74..97, + targets: [ + Name( + ExprName { + range: 74..75, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 78..97, + value: Float( + 123456789.12345679, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 98..131, + targets: [ + Name( + ExprName { + range: 98..99, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 102..131, + value: Float( + inf, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 132..155, + targets: [ + Name( + ExprName { + range: 132..133, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 136..155, + value: Float( + inf, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 156..170, + targets: [ + Name( + ExprName { + range: 156..157, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 160..170, + value: Complex { + real: 0.0, + imag: 123456789.0, + }, + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 171..195, + targets: [ + Name( + ExprName { + range: 171..172, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 175..195, + value: Complex { + real: 0.0, + imag: 123456789.12345679, + }, + kind: None, + }, 
+ ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 196..207, + targets: [ + Name( + ExprName { + range: 196..197, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 200..207, + value: Int( + 727756, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 208..218, + targets: [ + Name( + ExprName { + range: 208..209, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 212..218, + value: Int( + 11, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 219..228, + targets: [ + Name( + ExprName { + range: 219..220, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 223..228, + value: Int( + 511, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 229..244, + targets: [ + Name( + ExprName { + range: 229..230, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 233..244, + value: Float( + 6e-9, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 245..254, + targets: [ + Name( + ExprName { + range: 245..246, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 249..254, + value: Int( + 10000, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 255..265, + targets: [ + Name( + ExprName { + range: 255..256, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 259..265, + value: Int( + 133333, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), +] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap new file mode 100644 index 00000000..a5f419b4 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap @@ -0,0 +1,672 @@ +--- +source: parser/src/parser.rs +expression: "ast::Suite::parse(source, \"\").unwrap()" +--- +[ + Assign( + StmtAssign { + range: 0..19, + targets: [ + Name( + ExprName { + range: 0..1, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 4..19, + func: Attribute( + ExprAttribute { + range: 4..17, + value: Constant( + ExprConstant { + range: 4..6, + value: Float( + 0.1, + ), + kind: None, + }, + ), + attr: Identifier { + id: "is_integer", + range: 7..17, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 20..32, + targets: [ + Name( + ExprName { + range: 20..21, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 24..32, + value: Constant( + ExprConstant { + range: 24..26, + value: Float( + 1.0, + ), + kind: None, + }, + ), + attr: Identifier { + id: "imag", + range: 28..32, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 33..46, + targets: [ + Name( + ExprName { + range: 33..34, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 37..46, + value: Constant( + ExprConstant { + range: 37..41, + value: Float( + 10.0, + ), + kind: None, + }, + ), + attr: Identifier { + id: "imag", + range: 42..46, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 47..60, + targets: [ + Name( + ExprName { + range: 
47..48, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 51..60, + value: Constant( + ExprConstant { + range: 51..55, + value: Float( + 0.1, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 56..60, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 61..90, + targets: [ + Name( + ExprName { + range: 61..62, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 65..90, + func: Attribute( + ExprAttribute { + range: 65..88, + value: Constant( + ExprConstant { + range: 65..84, + value: Float( + 123456789.12345679, + ), + kind: None, + }, + ), + attr: Identifier { + id: "hex", + range: 85..88, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 91..130, + targets: [ + Name( + ExprName { + range: 91..92, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 95..130, + value: Constant( + ExprConstant { + range: 95..124, + value: Float( + inf, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 126..130, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 131..167, + targets: [ + Name( + ExprName { + range: 131..132, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 135..167, + func: Attribute( + ExprAttribute { + range: 135..165, + value: Constant( + ExprConstant { + range: 135..154, + value: Float( + inf, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 156..165, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 168..187, + targets: [ + Name( + ExprName { + range: 168..169, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 172..187, + value: Constant( + ExprConstant { + range: 172..182, + value: Complex { + real: 0.0, + imag: 123456789.0, + }, + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 183..187, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 188..241, + targets: [ + Name( + ExprName { + range: 188..189, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 192..241, + func: Attribute( + ExprAttribute { + range: 192..220, + value: Constant( + ExprConstant { + range: 192..212, + value: Complex { + real: 0.0, + imag: 123456789.12345679, + }, + kind: None, + }, + ), + attr: Identifier { + id: "__add__", + range: 213..220, + }, + ctx: Load, + }, + ), + args: [ + Call( + ExprCall { + range: 221..240, + func: Attribute( + ExprAttribute { + range: 221..238, + value: Constant( + ExprConstant { + range: 221..227, + value: Int( + 11, + ), + kind: None, + }, + ), + attr: Identifier { + id: "bit_length", + range: 228..238, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + ], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 242..265, + targets: [ + Name( + ExprName { + range: 242..243, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 246..265, + func: Attribute( + ExprAttribute { + range: 246..263, + value: Constant( + ExprConstant { + range: 246..253, + value: Int( + 727756, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 254..263, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + 
Assign( + StmtAssign { + range: 266..289, + targets: [ + Name( + ExprName { + range: 266..267, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 270..289, + func: Attribute( + ExprAttribute { + range: 270..287, + value: Constant( + ExprConstant { + range: 270..276, + value: Int( + 11, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 278..287, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 290..305, + targets: [ + Name( + ExprName { + range: 290..291, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 294..305, + value: Constant( + ExprConstant { + range: 294..299, + value: Int( + 511, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 301..305, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 306..329, + targets: [ + Name( + ExprName { + range: 306..307, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 310..329, + func: Attribute( + ExprAttribute { + range: 310..327, + value: Constant( + ExprConstant { + range: 310..321, + value: Float( + 6e-9, + ), + kind: None, + }, + ), + attr: Identifier { + id: "hex", + range: 324..327, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 330..344, + targets: [ + Name( + ExprName { + range: 330..331, + id: "x", + ctx: Store, + }, + ), + ], + value: UnaryOp( + ExprUnaryOp { + range: 334..344, + op: USub, + operand: Constant( + ExprConstant { + range: 335..344, + value: Complex { + real: 0.0, + imag: 100.0, + }, + kind: None, + }, + ), + }, + ), + type_comment: None, + }, + ), + If( + StmtIf { + range: 346..366, + test: Attribute( + ExprAttribute { + range: 349..357, + value: Constant( + ExprConstant { + range: 349..351, + value: Int( + 10, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 353..357, + }, + ctx: Load, + }, + ), + body: [ + Expr( + StmtExpr { + range: 363..366, + value: Constant( + ExprConstant { + range: 363..366, + value: Ellipsis, + kind: None, + }, + ), + }, + ), + ], + elif_else_clauses: [], + }, + ), + Assign( + StmtAssign { + range: 368..379, + targets: [ + Name( + ExprName { + range: 368..369, + id: "y", + ctx: Store, + }, + ), + ], + value: Subscript( + ExprSubscript { + range: 372..379, + value: Constant( + ExprConstant { + range: 372..375, + value: Int( + 100, + ), + kind: None, + }, + ), + slice: Name( + ExprName { + range: 376..378, + id: "no", + ctx: Load, + }, + ), + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 380..391, + targets: [ + Name( + ExprName { + range: 380..381, + id: "y", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 384..391, + func: Constant( + ExprConstant { + range: 384..387, + value: Int( + 100, + ), + kind: None, + }, + ), + args: [ + Name( + ExprName { + range: 388..390, + id: "no", + ctx: Load, + }, + ), + ], + keywords: [], + }, + ), + type_comment: None, + }, + ), +] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap index c48429b1..672b6230 100644 --- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap +++ b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap @@ -6,9 
diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap
index c48429b1..672b6230 100644
--- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap
+++ b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap
@@ -6,9 +6,10 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
 ClassDef(
     StmtClassDef {
         range: 10..29,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 16..19,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -25,26 +26,28 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 20..21,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 20..21,
+                    },
                     bound: None,
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ClassDef(
     StmtClassDef {
         range: 52..76,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 58..61,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -61,21 +64,19 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 62..68,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 62..63,
+                    },
                     bound: Some(
                         Name(
                             ExprName {
                                 range: 65..68,
-                                id: Identifier(
-                                    "str",
-                                ),
+                                id: "str",
                                 ctx: Load,
                             },
                         ),
@@ -83,14 +84,16 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ClassDef(
     StmtClassDef {
         range: 105..138,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 111..114,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -107,14 +110,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 115..130,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 115..116,
+                    },
                     bound: Some(
                         Tuple(
                             ExprTuple {
@@ -123,18 +126,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                                     Name(
                                         ExprName {
                                             range: 119..122,
-                                            id: Identifier(
-                                                "str",
-                                            ),
+                                            id: "str",
                                             ctx: Load,
                                         },
                                     ),
                                     Name(
                                         ExprName {
                                             range: 124..129,
-                                            id: Identifier(
-                                                "bytes",
-                                            ),
+                                            id: "bytes",
                                             ctx: Load,
                                         },
                                     ),
@@ -146,14 +145,16 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ClassDef(
     StmtClassDef {
         range: 159..181,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 165..168,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -170,35 +171,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 169..170,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 169..170,
+                    },
                     bound: None,
                 },
            ),
             TypeVar(
                 TypeParamTypeVar {
                     range: 172..173,
-                    name: Identifier(
-                        "U",
-                    ),
+                    name: Identifier {
+                        id: "U",
+                        range: 172..173,
+                    },
                     bound: None,
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ClassDef(
     StmtClassDef {
         range: 200..223,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 206..209,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -215,35 +219,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 210..211,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 210..211,
+                    },
                     bound: None,
                 },
             ),
             TypeVar(
                 TypeParamTypeVar {
                     range: 213..214,
-                    name: Identifier(
-                        "U",
-                    ),
+                    name: Identifier {
+                        id: "U",
+                        range: 213..214,
+                    },
                     bound: None,
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ClassDef(
     StmtClassDef {
         range: 240..261,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 246..249,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -260,25 +267,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             TypeVarTuple(
                 TypeParamTypeVarTuple {
                     range: 250..253,
-                    name: Identifier(
-                        "Ts",
-                    ),
+                    name: Identifier {
+                        id: "Ts",
+                        range: 251..253,
+                    },
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ClassDef(
     StmtClassDef {
         range: 275..296,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 281..284,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -295,25 +304,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             ParamSpec(
                 TypeParamParamSpec {
                     range: 285..288,
-                    name: Identifier(
-                        "P",
-                    ),
+                    name: Identifier {
+                        id: "P",
+                        range: 287..288,
+                    },
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ClassDef(
     StmtClassDef {
         range: 312..351,
-        name: Identifier(
-            "Foo",
-        ),
+        name: Identifier {
+            id: "Foo",
+            range: 318..321,
+        },
         bases: [],
         keywords: [],
         body: [
@@ -323,30 +334,29 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                     },
                 ),
             ],
-        decorator_list: [],
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 322..323,
-                    name: Identifier(
-                        "X",
-                    ),
+                    name: Identifier {
+                        id: "X",
+                        range: 322..323,
+                    },
                     bound: None,
                 },
             ),
             TypeVar(
                 TypeParamTypeVar {
                     range: 325..331,
-                    name: Identifier(
-                        "Y",
-                    ),
+                    name: Identifier {
+                        id: "Y",
+                        range: 325..326,
+                    },
                     bound: Some(
                         Name(
                             ExprName {
                                 range: 328..331,
-                                id: Identifier(
-                                    "str",
-                                ),
+                                id: "str",
                                 ctx: Load,
                             },
                         ),
@@ -356,20 +366,23 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
             TypeVarTuple(
                 TypeParamTypeVarTuple {
                     range: 333..335,
-                    name: Identifier(
-                        "U",
-                    ),
+                    name: Identifier {
+                        id: "U",
+                        range: 334..335,
+                    },
                 },
             ),
             ParamSpec(
                 TypeParamParamSpec {
                     range: 337..340,
-                    name: Identifier(
-                        "P",
-                    ),
+                    name: Identifier {
+                        id: "P",
+                        range: 339..340,
+                    },
                 },
             ),
         ],
+        decorator_list: [],
     },
 ),
 ]
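Both snapshot files above record the same two schema changes: `Identifier` now prints as a struct carrying its own `id` and `range` (previously a plain tuple wrapper around the string), and `decorator_list` moved after `type_params` in the Debug field order. A hedged sketch of what the per-identifier ranges enable, using field names exactly as they appear in the snapshots (accessor spelling in the crate may differ):

    // Sketch: report the span of each class name in a parsed suite.
    use rustpython_parser::{ast, Parse};

    fn print_class_names(source: &str) {
        let suite = ast::Suite::parse(source, "<embedded>").unwrap();
        for stmt in &suite {
            if let ast::Stmt::ClassDef(class) = stmt {
                // Per the snapshots this now prints as
                // `Identifier { id: "Foo", range: 16..19 }`, so the name's
                // own span no longer has to be recovered from the statement.
                println!("{:?}", class.name);
            }
        }
    }
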
diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap
index 2d65a64e..f84851f8 100644
--- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap
+++ b/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap
@@ -6,20 +6,22 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
 FunctionDef(
     StmtFunctionDef {
         range: 0..20,
-        name: Identifier(
-            "func",
-        ),
+        name: Identifier {
+            id: "func",
+            range: 4..8,
+        },
         args: Arguments {
-            range: 9..10,
+            range: 8..11,
             posonlyargs: [],
             args: [
                 ArgWithDefault {
                     range: 9..10,
                     def: Arg {
                         range: 9..10,
-                        arg: Identifier(
-                            "a",
-                        ),
+                        arg: Identifier {
+                            id: "a",
+                            range: 9..10,
+                        },
                         annotation: None,
                         type_comment: None,
                     },
@@ -46,34 +48,34 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
         ],
         decorator_list: [],
         returns: None,
-        type_comment: None,
         type_params: [],
+        type_comment: None,
     },
 ),
 FunctionDef(
     StmtFunctionDef {
         range: 22..53,
-        name: Identifier(
-            "func",
-        ),
+        name: Identifier {
+            id: "func",
+            range: 26..30,
+        },
         args: Arguments {
-            range: 34..38,
+            range: 33..39,
             posonlyargs: [],
             args: [
                 ArgWithDefault {
                     range: 34..38,
                     def: Arg {
                         range: 34..38,
-                        arg: Identifier(
-                            "a",
-                        ),
+                        arg: Identifier {
+                            id: "a",
+                            range: 34..35,
+                        },
                         annotation: Some(
                             Name(
                                 ExprName {
                                     range: 37..38,
-                                    id: Identifier(
-                                        "T",
-                                    ),
+                                    id: "T",
                                     ctx: Load,
                                 },
                             ),
@@ -106,51 +108,50 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
             Name(
                 ExprName {
                     range: 43..44,
-                    id: Identifier(
-                        "T",
-                    ),
+                    id: "T",
                     ctx: Load,
                 },
             ),
         ),
-        type_comment: None,
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 31..32,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 31..32,
+                    },
                     bound: None,
                 },
             ),
         ],
+        type_comment: None,
     },
 ),
 FunctionDef(
     StmtFunctionDef {
         range: 55..91,
-        name: Identifier(
-            "func",
-        ),
+        name: Identifier {
+            id: "func",
+            range: 59..63,
+        },
         args: Arguments {
-            range: 72..76,
+            range: 71..77,
             posonlyargs: [],
             args: [
                 ArgWithDefault {
                     range: 72..76,
                     def: Arg {
                         range: 72..76,
-                        arg: Identifier(
-                            "a",
-                        ),
+                        arg: Identifier {
+                            id: "a",
+                            range: 72..73,
+                        },
                         annotation: Some(
                             Name(
                                 ExprName {
                                     range: 75..76,
-                                    id: Identifier(
-                                        "T",
-                                    ),
+                                    id: "T",
                                     ctx: Load,
                                 },
                             ),
@@ -183,28 +184,24 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
             Name(
                 ExprName {
                     range: 81..82,
-                    id: Identifier(
-                        "T",
-                    ),
+                    id: "T",
                     ctx: Load,
                 },
             ),
         ),
-        type_comment: None,
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 64..70,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 64..65,
+                    },
                     bound: Some(
                         Name(
                             ExprName {
                                 range: 67..70,
-                                id: Identifier(
-                                    "str",
-                                ),
+                                id: "str",
                                 ctx: Load,
                             },
                         ),
@@ -212,32 +209,33 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                 },
             ),
         ],
+        type_comment: None,
     },
 ),
 FunctionDef(
     StmtFunctionDef {
         range: 93..138,
-        name: Identifier(
-            "func",
-        ),
+        name: Identifier {
+            id: "func",
+            range: 97..101,
+        },
         args: Arguments {
-            range: 119..123,
+            range: 118..124,
             posonlyargs: [],
             args: [
                 ArgWithDefault {
                     range: 119..123,
                     def: Arg {
                         range: 119..123,
-                        arg: Identifier(
-                            "a",
-                        ),
+                        arg: Identifier {
+                            id: "a",
+                            range: 119..120,
+                        },
                         annotation: Some(
                             Name(
                                 ExprName {
                                     range: 122..123,
-                                    id: Identifier(
-                                        "T",
-                                    ),
+                                    id: "T",
                                     ctx: Load,
                                 },
                             ),
@@ -270,21 +268,19 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
             Name(
                 ExprName {
                     range: 128..129,
-                    id: Identifier(
-                        "T",
-                    ),
+                    id: "T",
                     ctx: Load,
                 },
             ),
         ),
-        type_comment: None,
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 102..117,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 102..103,
+                    },
                     bound: Some(
                         Tuple(
                             ExprTuple {
@@ -293,18 +289,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                                     Name(
                                         ExprName {
                                             range: 106..109,
-                                            id: Identifier(
-                                                "str",
-                                            ),
+                                            id: "str",
                                             ctx: Load,
                                         },
                                     ),
                                     Name(
                                         ExprName {
                                             range: 111..116,
-                                            id: Identifier(
-                                                "bytes",
-                                            ),
+                                            id: "bytes",
                                             ctx: Load,
                                         },
                                     ),
@@ -316,24 +308,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                 },
             ),
         ],
+        type_comment: None,
     },
 ),
 FunctionDef(
     StmtFunctionDef {
         range: 140..171,
-        name: Identifier(
-            "func",
-        ),
+        name: Identifier {
+            id: "func",
+            range: 144..148,
+        },
         args: Arguments {
-            range: 154..161,
+            range: 153..162,
             posonlyargs: [],
             args: [],
             vararg: Some(
                 Arg {
                     range: 155..161,
-                    arg: Identifier(
-                        "a",
-                    ),
+                    arg: Identifier {
+                        id: "a",
+                        range: 155..156,
+                    },
                     annotation: Some(
                         Starred(
                             ExprStarred {
@@ -341,9 +336,7 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                                 value: Name(
                                     ExprName {
                                         range: 159..161,
-                                        id: Identifier(
-                                            "Ts",
-                                        ),
+                                        id: "Ts",
                                         ctx: Load,
                                     },
                                 ),
@@ -373,35 +366,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
         ],
         decorator_list: [],
         returns: None,
-        type_comment: None,
         type_params: [
             TypeVarTuple(
                 TypeParamTypeVarTuple {
                     range: 149..152,
-                    name: Identifier(
-                        "Ts",
-                    ),
+                    name: Identifier {
+                        id: "Ts",
+                        range: 150..152,
+                    },
                 },
             ),
         ],
+        type_comment: None,
     },
 ),
 FunctionDef(
     StmtFunctionDef {
         range: 173..230,
-        name: Identifier(
-            "func",
-        ),
+        name: Identifier {
+            id: "func",
+            range: 177..181,
+        },
         args: Arguments {
-            range: 187..220,
+            range: 186..221,
             posonlyargs: [],
             args: [],
             vararg: Some(
                 Arg {
                     range: 188..200,
-                    arg: Identifier(
-                        "args",
-                    ),
+                    arg: Identifier {
+                        id: "args",
+                        range: 188..192,
+                    },
                     annotation: Some(
                         Attribute(
                             ExprAttribute {
@@ -409,15 +405,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                                 value: Name(
                                     ExprName {
                                         range: 194..195,
-                                        id: Identifier(
-                                            "P",
-                                        ),
+                                        id: "P",
                                         ctx: Load,
                                     },
                                 ),
-                                attr: Identifier(
-                                    "args",
-                                ),
+                                attr: Identifier {
+                                    id: "args",
+                                    range: 196..200,
+                                },
                                 ctx: Load,
                             },
                         ),
@@ -429,9 +424,10 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
             kwarg: Some(
                 Arg {
                     range: 204..220,
-                    arg: Identifier(
-                        "kwargs",
-                    ),
+                    arg: Identifier {
+                        id: "kwargs",
+                        range: 204..210,
+                    },
                     annotation: Some(
                         Attribute(
                             ExprAttribute {
@@ -439,15 +435,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
                                 value: Name(
                                     ExprName {
                                         range: 212..213,
-                                        id: Identifier(
-                                            "P",
-                                        ),
+                                        id: "P",
                                         ctx: Load,
                                     },
                                 ),
-                                attr: Identifier(
-                                    "kwargs",
-                                ),
+                                attr: Identifier {
+                                    id: "kwargs",
+                                    range: 214..220,
+                                },
                                 ctx: Load,
                             },
                         ),
@@ -472,25 +467,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
         ],
         decorator_list: [],
         returns: None,
-        type_comment: None,
         type_params: [
             ParamSpec(
                 TypeParamParamSpec {
                     range: 182..185,
-                    name: Identifier(
-                        "P",
-                    ),
+                    name: Identifier {
+                        id: "P",
+                        range: 184..185,
+                    },
                 },
             ),
         ],
+        type_comment: None,
     },
 ),
 FunctionDef(
     StmtFunctionDef {
         range: 232..273,
-        name: Identifier(
-            "func",
-        ),
+        name: Identifier {
+            id: "func",
+            range: 236..240,
+        },
         args: Arguments {
             range: 261..263,
             posonlyargs: [],
@@ -508,30 +505,29 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
         ],
         decorator_list: [],
         returns: None,
-        type_comment: None,
         type_params: [
             TypeVar(
                 TypeParamTypeVar {
                     range: 241..242,
-                    name: Identifier(
-                        "T",
-                    ),
+                    name: Identifier {
+                        id: "T",
+                        range: 241..242,
+                    },
                     bound: None,
                 },
             ),
             TypeVar(
                 TypeParamTypeVar {
                     range: 244..250,
-                    name: Identifier(
-                        "U",
-                    ),
+                    name: Identifier {
+                        id: "U",
+                        range: 244..245,
+                    },
                     bound: Some(
                         Name(
                             ExprName {
                                 range: 247..250,
-                                id: Identifier(
-                                    "str",
-                                ),
+                                id: "str",
                                 ctx: Load,
                             },
                         ),
@@ -541,20 +537,23 @@ expression: "ast::Suite::parse(source, \"\").unwrap()"
             TypeVarTuple(
                 TypeParamTypeVarTuple {
                     range: 252..255,
-                    name: Identifier(
-                        "Ts",
-                    ),
+                    name: Identifier {
+                        id: "Ts",
+                        range: 253..255,
+                    },
                 },
             ),
             ParamSpec(
                 TypeParamParamSpec {
                     range: 257..260,
-                    name: Identifier(
-                        "P",
-                    ),
+                    name: Identifier {
+                        id: "P",
+                        range: 259..260,
+                    },
                 },
             ),
         ],
+        type_comment: None,
     },
 ),
 ]
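Note also the `Arguments` ranges in this file: `8..11` instead of `9..10`, `153..162` instead of `154..161`, and so on; the span of the argument list now covers the enclosing parentheses rather than just the arguments. A hedged sketch of walking the new structure (the `default` field of `ArgWithDefault` is not visible in these hunks and is an assumption):

    // Sketch: list positional parameters and whether they are annotated.
    use rustpython_parser::{ast, Parse};

    fn describe_params(source: &str) {
        let suite = ast::Suite::parse(source, "<embedded>").unwrap();
        for stmt in &suite {
            if let ast::Stmt::FunctionDef(func) = stmt {
                for arg in &func.args.args {
                    // `arg.def` is the Arg node shown in the snapshots.
                    println!("{:?} annotated: {}", arg.def.arg, arg.def.annotation.is_some());
                }
            }
        }
    }
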
diff --git a/parser/src/soft_keywords.rs b/parser/src/soft_keywords.rs
index 9abcd395..51278a46 100644
--- a/parser/src/soft_keywords.rs
+++ b/parser/src/soft_keywords.rs
@@ -134,7 +134,6 @@ where
         self.start_of_line = next.as_ref().map_or(false, |lex_result| {
             lex_result.as_ref().map_or(false, |(tok, _)| {
-                #[cfg(feature = "full-lexer")]
                 if matches!(tok, Tok::NonLogicalNewline | Tok::Comment { .. }) {
                     return self.start_of_line;
                 }
 
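With the `full-lexer` gate removed, `Tok::Comment` and `Tok::NonLogicalNewline` are always part of the token stream, so the soft-keyword transformer above now skips them unconditionally when tracking logical line starts. Consumers that only want logical tokens filter them out the same way; a hedged sketch (assumes `lexer::lex` yields `Result<(Tok, TextRange), _>` items, as its use elsewhere in the crate suggests):

    // Sketch: drop trivia tokens that never reach the parser.
    use rustpython_parser::{lexer, Mode, Tok};

    fn logical_tokens(source: &str) -> Vec<Tok> {
        lexer::lex(source, Mode::Module)
            .filter_map(Result::ok)
            .map(|(tok, _range)| tok)
            .filter(|tok| !matches!(tok, Tok::Comment(_) | Tok::NonLogicalNewline))
            .collect()
    }
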
diff --git a/parser/src/string.rs b/parser/src/string.rs
index 987e0df2..ea711e96 100644
--- a/parser/src/string.rs
+++ b/parser/src/string.rs
@@ -736,14 +736,14 @@ pub(crate) fn parse_strings(
 #[derive(Debug, PartialEq)]
 struct FStringError {
     /// The type of error that occurred.
-    pub error: FStringErrorType,
+    pub(crate) error: FStringErrorType,
     /// The location of the error.
-    pub location: TextSize,
+    pub(crate) location: TextSize,
 }
 
 impl FStringError {
     /// Creates a new `FStringError` with the given error type and location.
-    pub fn new(error: FStringErrorType, location: TextSize) -> Self {
+    pub(crate) fn new(error: FStringErrorType, location: TextSize) -> Self {
         Self { error, location }
     }
 }
diff --git a/parser/src/token.rs b/parser/src/token.rs
index ac33be75..86a86686 100644
--- a/parser/src/token.rs
+++ b/parser/src/token.rs
@@ -3,10 +3,10 @@
 //! This module defines the tokens that the lexer recognizes. The tokens are
 //! loosely based on the token definitions found in the [CPython source].
 //!
-//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h
-use crate::ast::bigint::BigInt;
+//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h
 use crate::ast::MagicKind;
 use crate::{text_size::TextSize, Mode};
+use num_bigint::BigInt;
 use std::fmt;
 
 /// The set of tokens the Python source code can be tokenized in.
@@ -52,13 +52,11 @@ pub enum Tok {
         kind: MagicKind,
     },
     /// Token value for a comment. These are filtered out of the token stream prior to parsing.
-    #[cfg(feature = "full-lexer")]
     Comment(String),
     /// Token value for a newline.
     Newline,
     /// Token value for a newline that is not a logical line break. These are filtered out of
     /// the token stream prior to parsing.
-    #[cfg(feature = "full-lexer")]
     NonLogicalNewline,
     /// Token value for an indent.
     Indent,
@@ -236,7 +234,6 @@ impl fmt::Display for Tok {
             }
             MagicCommand { kind, value } => write!(f, "{kind}{value}"),
             Newline => f.write_str("Newline"),
-            #[cfg(feature = "full-lexer")]
             NonLogicalNewline => f.write_str("NonLogicalNewline"),
             Indent => f.write_str("Indent"),
             Dedent => f.write_str("Dedent"),
@@ -250,7 +247,6 @@ impl fmt::Display for Tok {
             Rsqb => f.write_str("']'"),
             Colon => f.write_str("':'"),
             Comma => f.write_str("','"),
-            #[cfg(feature = "full-lexer")]
             Comment(value) => f.write_str(value),
             Semi => f.write_str("';'"),
             Plus => f.write_str("'+'"),