From 0893de188f05b154321779a40b8b8c193b98b6ab Mon Sep 17 00:00:00 2001
From: Micha Reiser
Date: Sun, 23 Jul 2023 11:00:56 +0200
Subject: [PATCH 1/6] Cursor based lexer

---
 .github/workflows/ci.yaml | 10 +-
 Cargo.toml | 6 -
 ast/Cargo.toml | 6 +-
 ast/src/builtin.rs | 2 +-
 ast/src/lib.rs | 5 -
 core/Cargo.toml | 1 -
 format/Cargo.toml | 4 +-
 format/src/cformat.rs | 2 +-
 format/src/format.rs | 2 +-
 format/src/lib.rs | 5 -
 literal/src/escape.rs | 2 +-
 parser/Cargo.toml | 11 +-
 parser/build.rs | 58 +-
 parser/src/gen/parse.rs | 386 +---
 parser/src/lexer.rs | 1786 +++++++----------
 parser/src/lexer/cursor.rs | 108 +
 parser/src/lexer/indentation.rs | 133 ++
 parser/src/parser.rs | 160 +-
 parser/src/python.lalrpop | 3 +-
 parser/src/python.rs | 8 +-
 ...rser__parser__tests__numeric_literals.snap | 440 ++++
 ...ts__numeric_literals_attribute_access.snap | 672 +++++++
 ...ser__tests__parse_class_generic_types.snap | 179 +-
 ...ser__tests__parse_function_definition.snap | 253 ++-
 parser/src/soft_keywords.rs | 1 -
 parser/src/token.rs | 10 +-
 26 files changed, 2485 insertions(+), 1768 deletions(-)
 create mode 100644 parser/src/lexer/cursor.rs
 create mode 100644 parser/src/lexer/indentation.rs
 create mode 100644 parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap
 create mode 100644 parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 989f80eb..fa5f8a8f 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -37,10 +37,8 @@ jobs:
 
       - uses: Swatinem/rust-cache@v2
 
-      - name: run tests with num-bigint
-        run: cargo test --all --no-default-features --features num-bigint
-      - name: run tests with malachite-bigint and all features
-        run: cargo test --all --features malachite-bigint,full-lexer,serde
+      - name: run tests
+        run: cargo test --all --all-features
 
   lint:
     name: Check Rust code with rustfmt and clippy
@@ -53,9 +51,7 @@
       - name: run rustfmt
         run: cargo fmt --all -- --check
       - name: run clippy
-        run: cargo clippy --all --no-default-features --features num-bigint
-      - name: run clippy
-        run: cargo clippy --all --features malachite-bigint,full-lexer,serde -- -Dwarnings
+        run: cargo clippy --all --all-features -- -Dwarnings
 
       - uses: actions/setup-python@v4
         with:
diff --git a/Cargo.toml b/Cargo.toml
index 219221e6..7949185b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,23 +21,17 @@ rustpython-literal = { path = "literal" }
 rustpython-format = { path = "format" }
 rustpython-parser = { path = "parser", default-features = false }
 
-ahash = "0.7.6"
 anyhow = "1.0.45"
 cfg-if = "1.0"
 insta = "1.14.0"
 itertools = "0.10.3"
 is-macro = "0.2.2"
-log = "0.4.16"
 num-complex = "0.4.0"
 num-bigint = "0.4.3"
 num-traits = "0.2"
-pyo3 = { version = "0.19.0" }
-malachite-bigint = { version = "0.1.0" }
-memchr = "2.5.0"
 rand = "0.8.5"
 serde = "1.0"
 static_assertions = "1.1"
-once_cell = "1.17.1"
 unicode_names2 = { version = "0.6.0", git = "https://github.com/youknowone/unicode_names2.git", rev = "4ce16aa85cbcdd9cc830410f1a72ef9a235f2fde" }
 
 [profile.dev.package."*"]
diff --git a/ast/Cargo.toml b/ast/Cargo.toml
index fe869346..03a566e0 100644
--- a/ast/Cargo.toml
+++ b/ast/Cargo.toml
@@ -7,14 +7,10 @@ edition = "2021"
 repository = "https://github.com/RustPython/Parser/"
 license = "MIT"
 
-[features]
-default = ["malachite-bigint"]
-
 [dependencies]
 rustpython-parser-core = { workspace = true }
 rustpython-literal = { workspace = true, optional = true }
 
 is-macro = { workspace = true }
-num-bigint = { workspace = true, optional = true }
-malachite-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }
 static_assertions = "1.1.0"
diff --git a/ast/src/builtin.rs b/ast/src/builtin.rs
index b7fd3c8e..e10b8245 100644
--- a/ast/src/builtin.rs
+++ b/ast/src/builtin.rs
@@ -2,8 +2,8 @@
 
 use rustpython_parser_core::text_size::TextRange;
 
-use crate::bigint::BigInt;
 use crate::Ranged;
+use num_bigint::BigInt;
 
 pub type String = std::string::String;
 
diff --git a/ast/src/lib.rs b/ast/src/lib.rs
index 1b12a93e..cbb12ce2 100644
--- a/ast/src/lib.rs
+++ b/ast/src/lib.rs
@@ -20,11 +20,6 @@ mod generic;
 mod impls;
 mod ranged;
 
-#[cfg(feature = "malachite-bigint")]
-pub use malachite_bigint as bigint;
-#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))]
-pub use num_bigint as bigint;
-
 pub use builtin::*;
 pub use generic::*;
 pub use ranged::Ranged;
diff --git a/core/Cargo.toml b/core/Cargo.toml
index 82e1cae5..2c477757 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -13,7 +13,6 @@ ruff_text_size = { path = "../ruff_text_size" }
 serde = { version = "1.0.133", optional = true, default-features = false, features = ["derive"] }
 
 is-macro.workspace = true
-memchr.workspace = true
 
 [features]
 default = []
diff --git a/format/Cargo.toml b/format/Cargo.toml
index b11b25db..0fda5abc 100644
--- a/format/Cargo.toml
+++ b/format/Cargo.toml
@@ -13,8 +13,6 @@ rustpython-literal = { workspace = true }
 bitflags = "2.3.1"
 itertools = "0.10.5"
 num-traits = { workspace = true }
-num-bigint = { workspace = true, optional = true }
-malachite-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }
 
 [features]
-default = ["malachite-bigint"]
\ No newline at end of file
diff --git a/format/src/cformat.rs b/format/src/cformat.rs
index d835fda0..8519bbd6 100644
--- a/format/src/cformat.rs
+++ b/format/src/cformat.rs
@@ -9,7 +9,7 @@ use std::{
     str::FromStr,
 };
 
-use crate::bigint::{BigInt, Sign};
+use num_bigint::{BigInt, Sign};
 
 #[derive(Debug, PartialEq)]
 pub enum CFormatErrorType {
diff --git a/format/src/format.rs b/format/src/format.rs
index 6bc5796e..09e42b80 100644
--- a/format/src/format.rs
+++ b/format/src/format.rs
@@ -6,7 +6,7 @@ use rustpython_literal::format::Case;
 use std::ops::Deref;
 use std::{cmp, str::FromStr};
 
-use crate::bigint::{BigInt, Sign};
+use num_bigint::{BigInt, Sign};
 
 trait FormatParse {
     fn parse(text: &str) -> (Option<Self>, &str)
diff --git a/format/src/lib.rs b/format/src/lib.rs
index 61de9d55..e15074ba 100644
--- a/format/src/lib.rs
+++ b/format/src/lib.rs
@@ -1,8 +1,3 @@
-#[cfg(feature = "malachite-bigint")]
-pub use malachite_bigint as bigint;
-#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))]
-pub use num_bigint as bigint;
-
 pub use crate::format::*;
 
 pub mod cformat;
diff --git a/literal/src/escape.rs b/literal/src/escape.rs
index 082248a5..0cb07adb 100644
--- a/literal/src/escape.rs
+++ b/literal/src/escape.rs
@@ -385,7 +385,7 @@ impl<'a> Escape for AsciiEscape<'a> {
     fn layout(&self) -> &EscapeLayout {
         &self.layout
     }
-
+    #[allow(unsafe_code)]
     fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result {
         formatter.write_str(unsafe {
             // SAFETY: this function must be called only when source is printable ascii characters
diff --git a/parser/Cargo.toml b/parser/Cargo.toml
index b6c20ff8..129db9f8 100644
--- a/parser/Cargo.toml
+++ b/parser/Cargo.toml
@@ -9,16 +9,11 @@ license = "MIT"
 edition = "2021"
 
 [features]
-default = ["malachite-bigint"]
 serde = ["dep:serde", "rustpython-parser-core/serde"]
-full-lexer = []
-malachite-bigint = ["dep:malachite-bigint", "rustpython-ast/malachite-bigint"]
-num-bigint = ["dep:num-bigint", "rustpython-ast/num-bigint"]
 
 [build-dependencies]
 anyhow = { workspace = true }
 lalrpop = { version = "0.20.0", default-features = false, optional = true }
-phf_codegen = "0.11.1"
 tiny-keccak = { version = "2", features = ["sha3"] }
 
 [dependencies]
 rustpython-ast = { workspace = true }
 rustpython-parser-core = { workspace = true }
 
 itertools = { workspace = true }
 is-macro = { workspace = true }
-log = { workspace = true }
-malachite-bigint = { workspace = true, optional = true }
-num-bigint = { workspace = true, optional = true }
+num-bigint = { workspace = true }
 num-traits = { workspace = true }
 unicode_names2 = { workspace = true }
 
 unic-emoji-char = "0.9.0"
 unic-ucd-ident = "0.9.0"
 lalrpop-util = { version = "0.20.0", default-features = false }
-phf = "0.11.1"
 rustc-hash = "1.1.0"
 serde = { version = "1.0.133", optional = true, default-features = false, features = ["derive"] }
+static_assertions = "1.1.0"
 
 [dev-dependencies]
 insta = { workspace = true }
diff --git a/parser/build.rs b/parser/build.rs
index e205c65f..a9bc3832 100644
--- a/parser/build.rs
+++ b/parser/build.rs
@@ -1,13 +1,10 @@
 use std::fmt::Write as _;
 use std::fs::File;
-use std::io::{BufRead, BufReader, BufWriter, Write};
+use std::io::{BufRead, BufReader};
 use std::path::{Path, PathBuf};
 use tiny_keccak::{Hasher, Sha3};
 
 fn main() -> anyhow::Result<()> {
-    let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
-    gen_phf(&out_dir);
-
     const SOURCE: &str = "src/python.lalrpop";
     println!("cargo:rerun-if-changed={SOURCE}");
 
@@ -16,6 +13,7 @@ fn main() -> anyhow::Result<()> {
 
     #[cfg(feature = "lalrpop")]
     {
+        let out_dir = PathBuf::from(std::env::var_os("OUT_DIR").unwrap());
         target = out_dir.join("src/python.rs");
     }
     #[cfg(not(feature = "lalrpop"))]
@@ -113,55 +111,3 @@ fn sha_equal(expected_sha3_str: &str, actual_sha3: &[u8; 32]) -> bool {
     }
     *actual_sha3 == expected_sha3
 }
-
-fn gen_phf(out_dir: &Path) {
-    let mut kwds = phf_codegen::Map::new();
-    let kwds = kwds
-        // Alphabetical keywords:
-        .entry("...", "Tok::Ellipsis")
-        .entry("False", "Tok::False")
-        .entry("None", "Tok::None")
-        .entry("True", "Tok::True")
-        // more so "standard" keywords
-        .entry("and", "Tok::And")
-        .entry("as", "Tok::As")
-        .entry("assert", "Tok::Assert")
-        .entry("async", "Tok::Async")
-        .entry("await", "Tok::Await")
-        .entry("break", "Tok::Break")
-        .entry("case", "Tok::Case")
-        .entry("class", "Tok::Class")
-        .entry("continue", "Tok::Continue")
-        .entry("def", "Tok::Def")
-        .entry("del", "Tok::Del")
-        .entry("elif", "Tok::Elif")
-        .entry("else", "Tok::Else")
-        .entry("except", "Tok::Except")
-        .entry("finally", "Tok::Finally")
-        .entry("for", "Tok::For")
-        .entry("from", "Tok::From")
-        .entry("global", "Tok::Global")
-        .entry("if", "Tok::If")
-        .entry("import", "Tok::Import")
-        .entry("in", "Tok::In")
-        .entry("is", "Tok::Is")
-        .entry("lambda", "Tok::Lambda")
-        .entry("match", "Tok::Match")
-        .entry("nonlocal", "Tok::Nonlocal")
-        .entry("not", "Tok::Not")
-        .entry("or", "Tok::Or")
-        .entry("pass", "Tok::Pass")
-        .entry("raise", "Tok::Raise")
-        .entry("return", "Tok::Return")
-        .entry("try", "Tok::Try")
-        .entry("type", "Tok::Type")
-        .entry("while", "Tok::While")
-        .entry("with", "Tok::With")
-        .entry("yield", "Tok::Yield")
-        .build();
-    writeln!(
-        BufWriter::new(File::create(out_dir.join("keywords.rs")).unwrap()),
-        "{kwds}",
-    )
-    .unwrap();
-}
diff --git a/parser/src/gen/parse.rs b/parser/src/gen/parse.rs
index fafec6a1..e56491ae 100644
--- a/parser/src/gen/parse.rs
+++ b/parser/src/gen/parse.rs
@@ -1,12 +1,10 @@
 // This file was originally generated from asdl by a python script, but we now edit it manually
 
 impl Parse for ast::StmtFunctionDef {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
+
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
         source_path: &str,
@@ -24,11 +22,8 @@ impl Parse for ast::StmtFunctionDef {
 }
 
 impl Parse for ast::StmtAsyncFunctionDef {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -47,11 +42,8 @@ impl Parse for ast::StmtAsyncFunctionDef {
 }
 
 impl Parse for ast::StmtClassDef {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -70,11 +62,8 @@ impl Parse for ast::StmtClassDef {
 }
 
 impl Parse for ast::StmtReturn {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -93,11 +82,8 @@ impl Parse for ast::StmtReturn {
 }
 
 impl Parse for ast::StmtDelete {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -116,11 +102,8 @@ impl Parse for ast::StmtDelete {
 }
 
 impl Parse for ast::StmtAssign {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -139,11 +122,8 @@ impl Parse for ast::StmtAssign {
 }
 
 impl Parse for ast::StmtTypeAlias {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -162,11 +142,8 @@ impl Parse for ast::StmtTypeAlias {
 }
 
 impl Parse for ast::StmtAugAssign {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -185,11 +162,8 @@ impl Parse for ast::StmtAugAssign {
 }
 
 impl Parse for ast::StmtAnnAssign {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -208,11 +182,8 @@ impl Parse for ast::StmtAnnAssign {
 }
 
 impl Parse for ast::StmtFor {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -231,11 +202,8 @@ impl Parse for ast::StmtFor {
 }
 
 impl Parse for ast::StmtAsyncFor {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
    }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -254,11 +222,8 @@ impl Parse for ast::StmtAsyncFor {
 }
 
 impl Parse for ast::StmtWhile {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -277,11 +242,8 @@ impl Parse for ast::StmtWhile {
 }
 
 impl Parse for ast::StmtIf {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -300,11 +262,8 @@ impl Parse for ast::StmtIf {
 }
 
 impl Parse for ast::StmtWith {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -323,11 +282,8 @@ impl Parse for ast::StmtWith {
 }
 
 impl Parse for ast::StmtAsyncWith {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -346,11 +302,8 @@ impl Parse for ast::StmtAsyncWith {
 }
 
 impl Parse for ast::StmtMatch {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -369,11 +322,8 @@ impl Parse for ast::StmtMatch {
 }
 
 impl Parse for ast::StmtRaise {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -392,11 +342,8 @@ impl Parse for ast::StmtRaise {
 }
 
 impl Parse for ast::StmtTry {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -415,11 +362,8 @@ impl Parse for ast::StmtTry {
 }
 
 impl Parse for ast::StmtTryStar {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -438,11 +382,8 @@ impl Parse for ast::StmtTryStar {
 }
 
 impl Parse for ast::StmtAssert {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -461,11 +402,8 @@ impl Parse for ast::StmtAssert {
 }
 
 impl Parse for ast::StmtImport {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -484,11 +422,8 @@ impl Parse for ast::StmtImport {
 }
 
 impl Parse for ast::StmtImportFrom {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -507,11 +442,8 @@ impl Parse for ast::StmtImportFrom {
 }
 
 impl Parse for ast::StmtGlobal {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -530,11 +462,8 @@ impl Parse for ast::StmtGlobal {
 }
 
 impl Parse for ast::StmtNonlocal {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -553,11 +482,8 @@ impl Parse for ast::StmtNonlocal {
 }
 
 impl Parse for ast::StmtExpr {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -576,11 +502,8 @@ impl Parse for ast::StmtExpr {
 }
 
 impl Parse for ast::StmtPass {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -599,11 +522,8 @@ impl Parse for ast::StmtPass {
 }
 
 impl Parse for ast::StmtBreak {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -622,11 +542,8 @@ impl Parse for ast::StmtBreak {
 }
 
 impl Parse for ast::StmtContinue {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Stmt::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Module
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -645,11 +562,8 @@ impl Parse for ast::StmtContinue {
 }
 
 impl Parse for ast::ExprBoolOp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -668,11 +582,8 @@ impl Parse for ast::ExprBoolOp {
 }
 
 impl Parse for ast::ExprNamedExpr {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -691,11 +602,8 @@ impl Parse for ast::ExprNamedExpr {
 }
 
 impl Parse for ast::ExprBinOp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -714,11 +622,8 @@ impl Parse for ast::ExprBinOp {
 }
 
 impl Parse for ast::ExprUnaryOp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -737,11 +642,8 @@ impl Parse for ast::ExprUnaryOp {
 }
 
 impl Parse for ast::ExprLambda {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -760,11 +662,8 @@ impl Parse for ast::ExprLambda {
 }
 
 impl Parse for ast::ExprIfExp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -783,11 +682,8 @@ impl Parse for ast::ExprIfExp {
 }
 
 impl Parse for ast::ExprDict {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -806,11 +702,8 @@ impl Parse for ast::ExprDict {
 }
 
 impl Parse for ast::ExprSet {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -829,11 +722,8 @@ impl Parse for ast::ExprSet {
 }
 
 impl Parse for ast::ExprListComp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -852,11 +742,8 @@ impl Parse for ast::ExprListComp {
 }
 
 impl Parse for ast::ExprSetComp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -875,11 +762,8 @@ impl Parse for ast::ExprSetComp {
 }
 
 impl Parse for ast::ExprDictComp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -898,11 +782,8 @@ impl Parse for ast::ExprDictComp {
 }
 
 impl Parse for ast::ExprGeneratorExp {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -921,11 +802,8 @@ impl Parse for ast::ExprGeneratorExp {
 }
 
 impl Parse for ast::ExprAwait {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -944,11 +822,8 @@ impl Parse for ast::ExprAwait {
 }
 
 impl Parse for ast::ExprYield {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -967,11 +842,8 @@ impl Parse for ast::ExprYield {
 }
 
 impl Parse for ast::ExprYieldFrom {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -990,11 +862,8 @@ impl Parse for ast::ExprYieldFrom {
 }
 
 impl Parse for ast::ExprCompare {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1013,11 +882,8 @@ impl Parse for ast::ExprCompare {
 }
 
 impl Parse for ast::ExprCall {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1036,11 +902,8 @@ impl Parse for ast::ExprCall {
 }
 
 impl Parse for ast::ExprFormattedValue {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1059,11 +922,8 @@ impl Parse for ast::ExprFormattedValue {
 }
 
 impl Parse for ast::ExprJoinedStr {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1082,11 +942,8 @@ impl Parse for ast::ExprJoinedStr {
 }
 
 impl Parse for ast::ExprConstant {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1105,11 +962,8 @@ impl Parse for ast::ExprConstant {
 }
 
 impl Parse for ast::ExprAttribute {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1128,11 +982,8 @@ impl Parse for ast::ExprAttribute {
 }
 
 impl Parse for ast::ExprSubscript {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1151,11 +1002,8 @@ impl Parse for ast::ExprSubscript {
 }
 
 impl Parse for ast::ExprStarred {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1174,11 +1022,8 @@ impl Parse for ast::ExprStarred {
 }
 
 impl Parse for ast::ExprName {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1197,11 +1042,8 @@ impl Parse for ast::ExprName {
 }
 
 impl Parse for ast::ExprList {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1220,11 +1062,8 @@ impl Parse for ast::ExprList {
 }
 
 impl Parse for ast::ExprTuple {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
@@ -1243,11 +1082,8 @@ impl Parse for ast::ExprTuple {
 }
 
 impl Parse for ast::ExprSlice {
-    fn lex_starts_at(
-        source: &str,
-        offset: TextSize,
-    ) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-        ast::Expr::lex_starts_at(source, offset)
+    fn mode() -> Mode {
+        Mode::Expression
     }
     fn parse_tokens(
         lxr: impl IntoIterator<Item = LexResult>,
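Every Parse impl above swaps its copy of the lex_starts_at boilerplate for a one-line mode(). The trait definition itself changes in parser/src/parser.rs, whose hunk is not part of this excerpt; the following is an assumed sketch of the new trait shape, inferred only from the impls above (exact signatures and the default method are assumptions):

// --- sketch, not part of the patch: assumed shape of the revised `Parse` trait ---
pub trait Parse: Sized {
    // Module for statements, Expression for expressions (assumption based on
    // the `Mode::Module` / `Mode::Expression` bodies above).
    fn mode() -> Mode;

    fn parse_tokens(
        lxr: impl IntoIterator<Item = LexResult>,
        source_path: &str,
    ) -> Result<Self, ParseError>;

    // Parsing from source can now be derived from `mode()` instead of each impl
    // carrying its own lexer-construction code (hypothetical default method).
    fn parse(source: &str, source_path: &str) -> Result<Self, ParseError> {
        Self::parse_tokens(lex(source, Self::mode()), source_path)
    }
}
// --- end sketch ---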
diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs
index 98b653c2..a8d0400f 100644
--- a/parser/src/lexer.rs
+++ b/parser/src/lexer.rs
@@ -27,166 +27,46 @@
 //! ```
 //!
 //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
+
+use std::iter::FusedIterator;
+use std::{char, cmp::Ordering, str::FromStr};
+
+use num_bigint::BigInt;
+use num_traits::{Num, Zero};
+use rustpython_ast::MagicKind;
+use unic_emoji_char::is_emoji_presentation;
+use unic_ucd_ident::{is_xid_continue, is_xid_start};
+
+use crate::lexer::cursor::{Cursor, EOF_CHAR};
+use crate::lexer::indentation::{Indentation, Indentations};
 use crate::{
-    ast::bigint::BigInt,
-    ast::MagicKind,
     soft_keywords::SoftKeywordTransformer,
     string::FStringErrorType,
-    text_size::{TextLen, TextRange, TextSize},
+    text_size::{TextRange, TextSize},
     token::{StringKind, Tok},
     Mode,
 };
-use log::trace;
-use num_traits::{Num, Zero};
-use std::{char, cmp::Ordering, ops::Index, slice::SliceIndex, str::FromStr};
-use unic_emoji_char::is_emoji_presentation;
-use unic_ucd_ident::{is_xid_continue, is_xid_start};
-
-// Indentations are tracked by a stack of indentation levels. IndentationLevel keeps
-// track of the number of tabs and spaces at the current level.
-#[derive(Clone, Copy, PartialEq, Debug, Default)]
-struct IndentationLevel {
-    tabs: u32,
-    spaces: u32,
-}
-
-impl IndentationLevel {
-    fn compare_strict(
-        &self,
-        other: &IndentationLevel,
-        location: TextSize,
-    ) -> Result<Ordering, LexicalError> {
-        // We only know for sure that we're smaller or bigger if tabs
-        // and spaces both differ in the same direction. Otherwise we're
-        // dependent on the size of tabs.
-        match self.tabs.cmp(&other.tabs) {
-            Ordering::Less => {
-                if self.spaces <= other.spaces {
-                    Ok(Ordering::Less)
-                } else {
-                    Err(LexicalError {
-                        location,
-                        error: LexicalErrorType::TabError,
-                    })
-                }
-            }
-            Ordering::Greater => {
-                if self.spaces >= other.spaces {
-                    Ok(Ordering::Greater)
-                } else {
-                    Err(LexicalError {
-                        location,
-                        error: LexicalErrorType::TabError,
-                    })
-                }
-            }
-            Ordering::Equal => Ok(self.spaces.cmp(&other.spaces)),
-        }
-    }
-}
-
-// The indentations stack is used to keep track of the current indentation level.
-// Similar to the CPython implementation, the Indentations stack always has at
-// least one level which is never popped. See Reference 2.1.8.
-#[derive(Debug)]
-struct Indentations {
-    indent_stack: Vec<IndentationLevel>,
-}
-
-impl Indentations {
-    fn is_empty(&self) -> bool {
-        self.indent_stack.len() == 1
-    }
-
-    fn push(&mut self, indent: IndentationLevel) {
-        self.indent_stack.push(indent);
-    }
-
-    fn pop(&mut self) -> Option<IndentationLevel> {
-        if self.is_empty() {
-            return None;
-        }
-        self.indent_stack.pop()
-    }
-
-    fn current(&self) -> &IndentationLevel {
-        self.indent_stack
-            .last()
-            .expect("Indentations must have at least one level")
-    }
-}
-
-impl Default for Indentations {
-    fn default() -> Self {
-        Self {
-            indent_stack: vec![IndentationLevel::default()],
-        }
-    }
-}
-
-// A CharWindow is a sliding window over an iterator of chars. It is used to
-// allow for look-ahead when scanning tokens from the source code.
-struct CharWindow<T: Iterator<Item = char>, const N: usize> {
-    source: T,
-    window: [Option<char>; N],
-}
-
-impl<T, const N: usize> CharWindow<T, N>
-where
-    T: Iterator<Item = char>,
-{
-    fn new(source: T) -> Self {
-        Self {
-            source,
-            window: [None; N],
-        }
-    }
-
-    fn slide(&mut self) -> Option<char> {
-        self.window.rotate_left(1);
-        let next = self.source.next();
-        *self.window.last_mut().expect("never empty") = next;
-        next
-    }
-}
-
-impl<T, const N: usize, Idx> Index<Idx> for CharWindow<T, N>
-where
-    T: Iterator<Item = char>,
-    Idx: SliceIndex<[Option<char>]>,
-{
-    type Output = Idx::Output;
-
-    fn index(&self, index: Idx) -> &Self::Output {
-        &self.window[index]
-    }
-}
+mod cursor;
+mod indentation;
 
 /// A lexer for Python source code.
-pub struct Lexer<T: Iterator<Item = char>> {
+pub struct Lexer<'source> {
     // Contains the source code to be lexed.
-    window: CharWindow<T, 3>,
+    cursor: Cursor<'source>,
+    source: &'source str,
+
     // Are we at the beginning of a line?
     at_begin_of_line: bool,
     // Amount of parenthesis.
-    nesting: usize,
+    nesting: u32,
     // Indentation levels.
     indentations: Indentations,
-    // Pending list of tokens to be returned.
-    pending: Vec<Spanned>,
-    // The current location.
-    location: TextSize,
-    // Is the last token an equal sign?
-    last_token_is_equal: bool,
+    pending_indentation: Option<Indentation>,
     // Lexer mode.
     mode: Mode,
 }
 
-// generated in build.rs, in gen_phf()
-/// A map of keywords to their tokens.
-pub static KEYWORDS: phf::Map<&'static str, Tok> =
-    include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
-
 /// Contains a Token along with its `range`.
 pub type Spanned = (Tok, TextRange);
 /// The result of lexing a token.
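Neither of the two new files is shown in this excerpt; per the diffstat, parser/src/lexer/cursor.rs adds 108 lines. From the call sites in the hunks (bump, first, second, previous, rest, eat_char, eat_if, eat_while, start_token), it is evidently a rustc-style character cursor replacing the removed CharWindow. The following is a minimal self-contained sketch under those assumptions, not the actual file:

// --- sketch, not part of the patch: assumed shape of lexer/cursor.rs ---
use std::str::Chars;

pub const EOF_CHAR: char = '\0';

pub struct Cursor<'a> {
    chars: Chars<'a>,
    previous: char,
    token_start_rest: usize, // bytes remaining when the current token started
}

impl<'a> Cursor<'a> {
    pub fn new(source: &'a str) -> Self {
        Self {
            chars: source.chars(),
            previous: EOF_CHAR,
            token_start_rest: source.len(),
        }
    }

    // Peek at the next character without consuming it; EOF_CHAR at end of input.
    pub fn first(&self) -> char {
        self.chars.clone().next().unwrap_or(EOF_CHAR)
    }

    pub fn second(&self) -> char {
        let mut it = self.chars.clone();
        it.next();
        it.next().unwrap_or(EOF_CHAR)
    }

    // The most recently consumed character (used by the debug assertions above).
    pub fn previous(&self) -> char {
        self.previous
    }

    // The unconsumed remainder of the input (used by the exponent lookahead).
    pub fn rest(&self) -> &'a str {
        self.chars.as_str()
    }

    // Consume and return the next character.
    pub fn bump(&mut self) -> Option<char> {
        let c = self.chars.next()?;
        self.previous = c;
        Some(c)
    }

    pub fn eat_char(&mut self, c: char) -> bool {
        if self.first() == c {
            self.bump();
            true
        } else {
            false
        }
    }

    pub fn eat_if(&mut self, predicate: impl Fn(char) -> bool) -> Option<char> {
        if self.first() != EOF_CHAR && predicate(self.first()) {
            self.bump()
        } else {
            None
        }
    }

    pub fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        while self.first() != EOF_CHAR && predicate(self.first()) {
            self.bump();
        }
    }

    // Marks the start of the token being lexed; the lexer's token_text/token_range
    // helpers presumably derive spans from this mark and the remaining length.
    pub fn start_token(&mut self) {
        self.token_start_rest = self.rest().len();
    }
}
// --- end sketch ---

Cloning a Chars iterator is cheap (it is a byte-slice view), which is what makes first()/second() lookahead cheaper than maintaining the old three-slot sliding window.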
@@ -207,8 +87,43 @@ pub type LexResult = Result<Spanned, LexicalError>;
 /// }
 /// ```
 #[inline]
-pub fn lex(source: &str, mode: Mode) -> impl Iterator<Item = LexResult> + '_ {
-    lex_starts_at(source, mode, TextSize::default())
+pub fn lex(source: &str, mode: Mode) -> SoftKeywordTransformer<Lexer<'_>> {
+    SoftKeywordTransformer::new(Lexer::new(source, mode), mode)
+}
+
+pub struct LexStartsAtIterator<I> {
+    start_offset: TextSize,
+    inner: I,
+}
+
+impl<I> Iterator for LexStartsAtIterator<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    type Item = LexResult;
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        let result = match self.inner.next()? {
+            Ok((tok, range)) => Ok((tok, range + self.start_offset)),
+            Err(error) => Err(LexicalError {
+                location: error.location + self.start_offset,
+                ..error
+            }),
+        };
+
+        Some(result)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
+impl<I> FusedIterator for LexStartsAtIterator<I> where I: Iterator<Item = LexResult> + FusedIterator {}
+impl<I> ExactSizeIterator for LexStartsAtIterator<I> where
+    I: Iterator<Item = LexResult> + ExactSizeIterator
+{
 }
 
 /// Create a new lexer from a source string, starting at a given location.
@@ -217,194 +132,219 @@ pub fn lex_starts_at(
     source: &str,
     mode: Mode,
     start_offset: TextSize,
-) -> SoftKeywordTransformer<Lexer<std::str::Chars<'_>>> {
-    SoftKeywordTransformer::new(Lexer::new(source.chars(), mode, start_offset), mode)
+) -> LexStartsAtIterator<SoftKeywordTransformer<Lexer<'_>>> {
+    LexStartsAtIterator {
+        start_offset,
+        inner: lex(source, mode),
+    }
 }
 
-impl<T> Lexer<T>
-where
-    T: Iterator<Item = char>,
-{
+impl<'source> Lexer<'source> {
     /// Create a new lexer from T and a starting location. You probably want to use
     /// [`lex`] instead.
-    pub fn new(input: T, mode: Mode, start: TextSize) -> Self {
+    pub fn new(input: &'source str, mode: Mode) -> Self {
         let mut lxr = Lexer {
             at_begin_of_line: true,
             nesting: 0,
             indentations: Indentations::default(),
-            // Usually we have less than 5 tokens pending.
-            pending: Vec::with_capacity(5),
-            location: start,
-            window: CharWindow::new(input),
-            last_token_is_equal: false,
+            pending_indentation: None,
+
+            source: input,
+            cursor: Cursor::new(input),
             mode,
         };
-        // Fill the window.
-        lxr.window.slide();
-        lxr.window.slide();
-        lxr.window.slide();
         // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
         // spell-checker:ignore feff
-        if let Some('\u{feff}') = lxr.window[0] {
-            lxr.window.slide();
-            lxr.location += '\u{feff}'.text_len();
-        }
+        lxr.cursor.eat_char('\u{feff}');
+
         lxr
     }
 
     /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
-    fn lex_identifier(&mut self) -> LexResult {
+    fn lex_identifier(&mut self, first: char) -> Result<Tok, LexicalError> {
         // Detect potential string like rb'' b'' f'' u'' r''
-        match self.window[..3] {
-            [Some(c), Some('"' | '\''), ..] => {
-                if let Ok(kind) = StringKind::try_from(c) {
-                    return self.lex_string(kind);
+        match self.cursor.first() {
+            quote @ ('\'' | '"') => {
+                if let Ok(string_kind) = StringKind::try_from(first) {
+                    self.cursor.bump();
+                    return self.lex_string(string_kind, quote);
                 }
             }
-            [Some(c1), Some(c2), Some('"' | '\'')] => {
-                if let Ok(kind) = StringKind::try_from([c1, c2]) {
-                    return self.lex_string(kind);
+            second @ ('f' | 'F' | 'r' | 'R' | 'b' | 'B') if is_quote(self.cursor.second()) => {
+                self.cursor.bump();
+
+                if let Ok(string_kind) = StringKind::try_from([first, second]) {
+                    let quote = self.cursor.bump().unwrap();
+                    return self.lex_string(string_kind, quote);
                 }
             }
             _ => {}
-        };
-
-        let start_pos = self.get_pos();
-        let mut name = String::with_capacity(8);
-        while self.is_identifier_continuation() {
-            name.push(self.next_char().unwrap());
         }
-        let end_pos = self.get_pos();
 
-        if let Some(tok) = KEYWORDS.get(&name) {
-            Ok((tok.clone(), TextRange::new(start_pos, end_pos)))
-        } else {
-            Ok((Tok::Name { name }, TextRange::new(start_pos, end_pos)))
-        }
+        self.cursor.eat_while(is_identifier_continuation);
+
+        let text = self.token_text();
+
+        let keyword = match text {
+            "False" => Tok::False,
+            "None" => Tok::None,
+            "True" => Tok::True,
+            "and" => Tok::And,
+            "as" => Tok::As,
+            "assert" => Tok::Assert,
+            "async" => Tok::Async,
+            "await" => Tok::Await,
+            "break" => Tok::Break,
+            "case" => Tok::Case,
+            "class" => Tok::Class,
+            "continue" => Tok::Continue,
+            "def" => Tok::Def,
+            "del" => Tok::Del,
+            "elif" => Tok::Elif,
+            "else" => Tok::Else,
+            "except" => Tok::Except,
+            "finally" => Tok::Finally,
+            "for" => Tok::For,
+            "from" => Tok::From,
+            "global" => Tok::Global,
+            "if" => Tok::If,
+            "import" => Tok::Import,
+            "in" => Tok::In,
+            "is" => Tok::Is,
+            "lambda" => Tok::Lambda,
+            "match" => Tok::Match,
+            "nonlocal" => Tok::Nonlocal,
+            "not" => Tok::Not,
+            "or" => Tok::Or,
+            "pass" => Tok::Pass,
+            "raise" => Tok::Raise,
+            "return" => Tok::Return,
+            "try" => Tok::Try,
+            "type" => Tok::Type,
+            "while" => Tok::While,
+            "with" => Tok::With,
+            "yield" => Tok::Yield,
+            _ => {
+                return Ok(Tok::Name {
+                    name: text.to_string(),
+                })
+            }
        };
 
         Ok(keyword)
     }
 
     /// Numeric lexing. The feast can start!
-    fn lex_number(&mut self) -> LexResult {
-        let start_pos = self.get_pos();
-        match self.window[..2] {
-            [Some('0'), Some('x' | 'X')] => {
-                // Hex! (0xdeadbeef)
-                self.next_char();
-                self.next_char();
-                self.lex_number_radix(start_pos, 16)
-            }
-            [Some('0'), Some('o' | 'O')] => {
-                // Octal style! (0o377)
-                self.next_char();
-                self.next_char();
-                self.lex_number_radix(start_pos, 8)
-            }
-            [Some('0'), Some('b' | 'B')] => {
-                // Binary! (0b_1110_0101)
-                self.next_char();
-                self.next_char();
-                self.lex_number_radix(start_pos, 2)
-            }
-            _ => self.lex_normal_number(),
+    fn lex_number(&mut self, first: char) -> Result<Tok, LexicalError> {
+        if first == '0' {
+            if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
+                self.lex_number_radix(first, Radix::Hex)
+            } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
+                self.lex_number_radix(first, Radix::Octal)
+            } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
+                self.lex_number_radix(first, Radix::Binary)
+            } else {
+                self.lex_decimal_number(first)
+            }
+        } else {
+            self.lex_decimal_number(first)
         }
     }
 
     /// Lex a hex/octal/decimal/binary number without a decimal point.
-    fn lex_number_radix(&mut self, start_pos: TextSize, radix: u32) -> LexResult {
-        let value_text = self.radix_run(radix);
-        let end_pos = self.get_pos();
-        let value = BigInt::from_str_radix(&value_text, radix).map_err(|e| LexicalError {
-            error: LexicalErrorType::OtherError(format!("{e:?}")),
-            location: start_pos,
-        })?;
-        Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos)))
+    fn lex_number_radix(&mut self, first: char, radix: Radix) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert!(matches!(
+            self.cursor.previous().to_ascii_lowercase(),
+            'x' | 'o' | 'b'
+        ));
+
+        let value_text = self.radix_run(Some(first), radix);
+        let value =
+            BigInt::from_str_radix(&value_text, radix.as_u32()).map_err(|e| LexicalError {
+                error: LexicalErrorType::OtherError(format!("{e:?}")),
+                location: self.token_range().start(),
+            })?;
+        Ok(Tok::Int { value })
     }
 
     /// Lex a normal number, that is, no octal, hex or binary number.
-    fn lex_normal_number(&mut self) -> LexResult {
-        let start_pos = self.get_pos();
-        let start_is_zero = self.window[0] == Some('0');
-        // Normal number:
-        let mut value_text = self.radix_run(10);
+    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
+        let start_is_zero = first_digit_or_dot == '0';
 
-        // If float:
-        if self.window[0] == Some('.') || self.at_exponent() {
-            // Take '.':
-            if self.window[0] == Some('.') {
-                if self.window[1] == Some('_') {
-                    return Err(LexicalError {
-                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
-                        location: self.get_pos(),
-                    });
-                }
-                value_text.push(self.next_char().unwrap());
-                value_text.push_str(&self.radix_run(10));
+        let mut value_text = if first_digit_or_dot == '.' {
+            String::new()
+        } else {
+            self.radix_run(Some(first_digit_or_dot), Radix::Decimal)
+        };
+
+        let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') {
+            value_text.push('.');
+
+            if self.cursor.eat_char('_') {
+                return Err(LexicalError {
+                    error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
+                    location: self.offset() - TextSize::new(1),
+                });
             }
-            // 1e6 for example:
-            if let Some('e' | 'E') = self.window[0] {
-                if self.window[1] == Some('_') {
-                    return Err(LexicalError {
-                        error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
-                        location: self.get_pos(),
-                    });
-                }
-                value_text.push(self.next_char().unwrap().to_ascii_lowercase());
-                // Optional +/-
-                if matches!(self.window[0], Some('-' | '+')) {
-                    if self.window[1] == Some('_') {
-                        return Err(LexicalError {
-                            error: LexicalErrorType::OtherError("Invalid Syntax".to_owned()),
-                            location: self.get_pos(),
-                        });
-                    }
-                    value_text.push(self.next_char().unwrap());
+
+            value_text.push_str(&self.radix_run(None, Radix::Decimal));
+            true
+        } else {
+            // Normal number:
+            false
+        };
+
+        let is_float = match self.cursor.rest().as_bytes() {
+            [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => {
+                value_text.push('e');
+                self.cursor.bump(); // e | E
+
+                if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) {
+                    value_text.push(sign);
                 }
-                value_text.push_str(&self.radix_run(10));
+
+                value_text.push_str(&self.radix_run(None, Radix::Decimal));
+
+                true
             }
+            _ => is_float,
+        };
 
+        // If float:
+        if is_float {
+            // Improvement: Use `Cow` instead of pushing to value text
             let value = f64::from_str(&value_text).map_err(|_| LexicalError {
                 error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()),
-                location: self.get_pos(),
+                location: self.token_start(),
             })?;
 
             // Parse trailing 'j':
-            if matches!(self.window[0], Some('j' | 'J')) {
-                self.next_char();
-                let end_pos = self.get_pos();
-                Ok((
-                    Tok::Complex {
-                        real: 0.0,
-                        imag: value,
-                    },
-                    TextRange::new(start_pos, end_pos),
-                ))
+            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
+                Ok(Tok::Complex {
+                    real: 0.0,
+                    imag: value,
+                })
             } else {
-                let end_pos = self.get_pos();
-                Ok((Tok::Float { value }, TextRange::new(start_pos, end_pos)))
+                Ok(Tok::Float { value })
             }
         } else {
             // Parse trailing 'j':
-            if matches!(self.window[0], Some('j' | 'J')) {
-                self.next_char();
-                let end_pos = self.get_pos();
+            if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
                 let imag = f64::from_str(&value_text).unwrap();
-                Ok((
-                    Tok::Complex { real: 0.0, imag },
-                    TextRange::new(start_pos, end_pos),
-                ))
+                Ok(Tok::Complex { real: 0.0, imag })
             } else {
-                let end_pos = self.get_pos();
                 let value = value_text.parse::<BigInt>().unwrap();
                 if start_is_zero && !value.is_zero() {
                     // leading zeros in decimal integer literals are not permitted
                     return Err(LexicalError {
                         error: LexicalErrorType::OtherError("Invalid Token".to_owned()),
-                        location: self.get_pos(),
+                        location: self.token_range().start(),
                     });
                 }
-                Ok((Tok::Int { value }, TextRange::new(start_pos, end_pos)))
+                Ok(Tok::Int { value })
             }
         }
     }
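Radix is referenced above (Radix::Hex, as_u32, is_digit) but defined in a part of lexer.rs not visible in this excerpt. A sketch consistent with those two call sites, replacing the removed is_digit_of_radix helper:

// --- sketch, not part of the patch: assumed `Radix` companion type ---
#[derive(Copy, Clone, Debug)]
enum Radix {
    Binary,
    Octal,
    Decimal,
    Hex,
}

impl Radix {
    const fn as_u32(self) -> u32 {
        match self {
            Radix::Binary => 2,
            Radix::Octal => 8,
            Radix::Decimal => 10,
            Radix::Hex => 16,
        }
    }

    // `char::is_digit` accepts 'a'..='f' / 'A'..='F' for radix 16,
    // matching the removed match-based check.
    fn is_digit(self, c: char) -> bool {
        c.is_digit(self.as_u32())
    }
}
// --- end sketch ---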
@@ -412,105 +352,39 @@
     /// Consume a sequence of numbers with the given radix,
     /// the digits can be decorated with underscores
     /// like this: '1_2_3_4' == '1234'
-    fn radix_run(&mut self, radix: u32) -> String {
-        let mut value_text = String::new();
+    fn radix_run(&mut self, first: Option<char>, radix: Radix) -> String {
+        let mut value_text = first.map_or(String::new(), |c| c.to_string());
 
         loop {
-            if let Some(c) = self.take_number(radix) {
+            if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) {
                 value_text.push(c);
-            } else if self.window[0] == Some('_')
-                && Lexer::<T>::is_digit_of_radix(self.window[1], radix)
-            {
-                self.next_char();
+            } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) {
+                self.cursor.bump();
             } else {
                 break;
             }
         }
-        value_text
-    }
-
-    /// Consume a single character with the given radix.
-    fn take_number(&mut self, radix: u32) -> Option<char> {
-        let take_char = Lexer::<T>::is_digit_of_radix(self.window[0], radix);
-
-        take_char.then(|| self.next_char().unwrap())
-    }
-
-    /// Test if a digit is of a certain radix.
-    fn is_digit_of_radix(c: Option<char>, radix: u32) -> bool {
-        match radix {
-            2 => matches!(c, Some('0'..='1')),
-            8 => matches!(c, Some('0'..='7')),
-            10 => matches!(c, Some('0'..='9')),
-            16 => matches!(c, Some('0'..='9') | Some('a'..='f') | Some('A'..='F')),
-            other => unimplemented!("Radix not implemented: {}", other),
-        }
-    }
 
-    /// Test if we face '[eE][-+]?[0-9]+'
-    fn at_exponent(&self) -> bool {
-        match self.window[..2] {
-            [Some('e' | 'E'), Some('+' | '-')] => matches!(self.window[2], Some('0'..='9')),
-            [Some('e' | 'E'), Some('0'..='9')] => true,
-            _ => false,
-        }
+        value_text
     }
 
     /// Lex a single comment.
-    #[cfg(feature = "full-lexer")]
-    fn lex_comment(&mut self) -> LexResult {
-        let start_pos = self.get_pos();
-        let mut value = String::new();
-        loop {
-            match self.window[0] {
-                Some('\n' | '\r') | None => {
-                    let end_pos = self.get_pos();
-                    return Ok((Tok::Comment(value), TextRange::new(start_pos, end_pos)));
-                }
-                Some(_) => {}
-            }
-            value.push(self.next_char().unwrap());
-        }
-    }
+    fn lex_comment(&mut self) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert_eq!(self.cursor.previous(), '#');
 
-    #[cfg(feature = "full-lexer")]
-    fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> {
-        let comment = self.lex_comment()?;
-        self.emit(comment);
-        Ok(())
-    }
+        self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
 
-    /// Discard comment if full-lexer is not enabled.
-    #[cfg(not(feature = "full-lexer"))]
-    fn lex_comment(&mut self) {
-        loop {
-            match self.window[0] {
-                Some('\n' | '\r') | None => {
-                    return;
-                }
-                Some(_) => {}
-            }
-            self.next_char().unwrap();
-        }
-    }
-
-    #[cfg(not(feature = "full-lexer"))]
-    #[inline]
-    fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> {
-        self.lex_comment();
-        Ok(())
+        return Ok(Tok::Comment(self.token_text().to_string()));
     }
 
     /// Lex a single magic command.
-    fn lex_magic_command(&mut self, kind: MagicKind) -> (Tok, TextRange) {
-        let start_pos = self.get_pos();
-        for _ in 0..u32::from(kind.prefix_len()) {
-            self.next_char();
-        }
+    fn lex_magic_command(&mut self, kind: MagicKind) -> Tok {
         let mut value = String::new();
+
         loop {
-            match self.window[0] {
-                Some('\\') => {
+            match self.cursor.first() {
+                '\\' => {
                     // Only skip the line continuation if it is followed by a newline
                     // otherwise it is a normal backslash which is part of the magic command:
                     //
@@ -520,94 +394,78 @@ where
                     //  Skip this backslash
                     //  v
                     //  !wc \
                     //  && ls -a | sed 's/^/\\ /'
                     //           ^^
                     //           Don't skip these backslashes
-                    if matches!(self.window[1], Some('\n' | '\r')) {
-                        self.next_char();
-                        self.next_char();
+                    if self.cursor.second() == '\r' {
+                        self.cursor.bump();
+                        self.cursor.bump();
+                        self.cursor.eat_char('\n');
+                        continue;
+                    } else if self.cursor.second() == '\n' {
+                        self.cursor.bump();
+                        self.cursor.bump();
                         continue;
+                    } else {
+                        self.cursor.bump();
+                        value.push('\\');
                     }
                 }
-                Some('\n' | '\r') | None => {
-                    let end_pos = self.get_pos();
-                    return (
-                        Tok::MagicCommand { kind, value },
-                        TextRange::new(start_pos, end_pos),
-                    );
+                '\n' | '\r' | EOF_CHAR => {
+                    return Tok::MagicCommand { kind, value };
+                }
+                c => {
+                    self.cursor.bump();
+                    value.push(c);
                 }
-                Some(_) => {}
-            }
-            value.push(self.next_char().unwrap());
-        }
-    }
-
-    fn lex_and_emit_magic_command(&mut self) {
-        let kind = match self.window[..2] {
-            [Some(c1), Some(c2)] => {
-                MagicKind::try_from([c1, c2]).map_or_else(|_| MagicKind::try_from(c1), Ok)
-            }
-            // When the escape character is the last character of the file.
-            [Some(c), None] => MagicKind::try_from(c),
-            _ => return,
-        };
-        if let Ok(kind) = kind {
-            let magic_command = self.lex_magic_command(kind);
-            self.emit(magic_command);
+            }
         }
     }
 
     /// Lex a string literal.
-    fn lex_string(&mut self, kind: StringKind) -> LexResult {
-        let start_pos = self.get_pos();
-        for _ in 0..u32::from(kind.prefix_len()) {
-            self.next_char();
-        }
-        let quote_char = self.next_char().unwrap();
-        let mut string_content = String::with_capacity(5);
+    fn lex_string(&mut self, kind: StringKind, quote: char) -> Result<Tok, LexicalError> {
+        #[cfg(debug_assertions)]
+        debug_assert_eq!(self.cursor.previous(), quote);
 
         // If the next two characters are also the quote character, then we have a triple-quoted
         // string; consume those two characters and ensure that we require a triple-quote to close
-        let triple_quoted = if self.window[..2] == [Some(quote_char); 2] {
-            self.next_char();
-            self.next_char();
+        let triple_quoted = if self.cursor.first() == quote && self.cursor.second() == quote {
+            self.cursor.bump();
+            self.cursor.bump();
             true
         } else {
            false
         };
 
-        loop {
-            match self.next_char() {
-                Some(c) => {
-                    if c == '\\' {
-                        if let Some(next_c) = self.next_char() {
-                            string_content.push('\\');
-                            string_content.push(next_c);
-                            continue;
-                        }
-                    }
-                    if c == '\n' && !triple_quoted {
-                        return Err(LexicalError {
-                            error: LexicalErrorType::OtherError(
-                                "EOL while scanning string literal".to_owned(),
-                            ),
-                            location: self.get_pos(),
-                        });
-                    }
+        let value_start = self.offset();
 
-                    if c == quote_char {
-                        if triple_quoted {
-                            // Look ahead at the next two characters; if we have two more
-                            // quote_chars, it's the end of the string; consume the remaining
-                            // closing quotes and break the loop
-                            if self.window[..2] == [Some(quote_char); 2] {
-                                self.next_char();
-                                self.next_char();
-                                break;
-                            }
-                        } else {
-                            break;
+        let value_end = loop {
+            match self.cursor.bump() {
+                Some('\\') => {
+                    if self.cursor.eat_char('\r') {
+                        self.cursor.eat_char('\n');
+                    } else {
+                        self.cursor.bump();
+                    }
+                }
+                Some('\r' | '\n') if !triple_quoted => {
+                    return Err(LexicalError {
+                        error: LexicalErrorType::OtherError(
+                            "EOL while scanning string literal".to_owned(),
+                        ),
+                        location: self.offset() - TextSize::new(1),
+                    });
+                }
+                Some(c) if c == quote => {
+                    if triple_quoted {
+                        if self.cursor.first() == quote && self.cursor.second() == quote {
+                            self.cursor.bump();
+                            self.cursor.bump();
+                            break self.offset() - TextSize::new(3);
                         }
+                    } else {
+                        break self.offset() - TextSize::new(1);
                     }
-                    string_content.push(c);
                 }
+
+                Some(_) => {}
                 None => {
                     return Err(LexicalError {
                         error: if triple_quoted {
@@ -615,719 +473,454 @@ where
                         } else {
                             LexicalErrorType::StringError
                         },
-                        location: self.get_pos(),
+                        location: self.offset(),
                     });
                 }
             }
-        }
-        let end_pos = self.get_pos();
+        };
+
         let tok = Tok::String {
-            value: string_content,
+            value: self.source[TextRange::new(value_start, value_end)].to_string(),
             kind,
             triple_quoted,
         };
-        Ok((tok, TextRange::new(start_pos, end_pos)))
-    }
-
-    // Checks if the character c is a valid starting character as described
-    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-    fn is_identifier_start(&self, c: char) -> bool {
-        match c {
-            'a'..='z' | 'A'..='Z' | '_' => true,
-            _ => is_xid_start(c),
-        }
-    }
-
-    // Checks if the character c is a valid continuation character as described
-    // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-    fn is_identifier_continuation(&self) -> bool {
-        match self.window[0] {
-            Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true,
-            Some(c) => is_xid_continue(c),
-            _ => false,
-        }
+        Ok(tok)
     }
 
     // This is the main entry point. Call this function to retrieve the next token.
     // This function is used by the iterator implementation.
-    fn inner_next(&mut self) -> LexResult {
+    pub fn next_token(&mut self) -> LexResult {
         // top loop, keep on processing, until we have something pending.
-        while self.pending.is_empty() {
-            // Detect indentation levels
-            if self.at_begin_of_line {
-                self.handle_indentations()?;
-                if self.mode == Mode::Jupyter
-                    // https://github.com/ipython/ipython/blob/635815e8f1ded5b764d66cacc80bbe25e9e2587f/IPython/core/inputtransformer2.py#L345
-                    && matches!(self.window[0], Some('%' | '!' | '?' | '/' | ';' | ','))
-                {
-                    self.lex_and_emit_magic_command();
-                }
-            }
-
-            self.consume_normal()?;
-        }
-
-        Ok(self.pending.remove(0))
-    }
+        loop {
+            // Return dedent tokens until the current indentation level matches the indentation of the next token.
+            if let Some(indentation) = self.pending_indentation.take() {
+                if let Ok(Ordering::Greater) = self.indentations.current().try_compare(&indentation)
+                {
+                    self.pending_indentation = Some(indentation);
+                    self.indentations.pop();
+                    return Ok((Tok::Dedent, TextRange::empty(self.offset())));
+                }
+            }
+
+            if self.at_begin_of_line && self.nesting == 0 {
+                if let Some(trivia) = self.eat_logical_line_trivia()? {
+                    break Ok(trivia);
+                }
+            }
+
+            self.cursor.start_token();
+            if let Some(c) = self.cursor.bump() {
+                if let Some(normal) = self.consume_normal(c)? {
+                    break Ok(normal);
+                }
+            } else {
+                // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
+                // empty the dedent stack, and finally, return the EndOfFile token.
+                break self.consume_end();
+            }
+        }
+    }
 
-    // Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
-    fn eat_indentation(&mut self) -> Result<IndentationLevel, LexicalError> {
-        // Determine indentation:
-        let mut spaces: u32 = 0;
-        let mut tabs: u32 = 0;
-        loop {
-            match self.window[0] {
-                Some(' ') => {
-                    /*
-                    if tabs != 0 {
-                        // Don't allow spaces after tabs as part of indentation.
-                        // This is technically stricter than python3 but spaces after
-                        // tabs is even more insane than mixing spaces and tabs.
-                        return Some(Err(LexicalError {
-                            error: LexicalErrorType::OtherError("Spaces not allowed as part of indentation after tabs".to_owned()),
-                            location: self.get_pos(),
-                        }));
-                    }
-                    */
-                    self.next_char();
-                    spaces += 1;
-                }
-                Some('\t') => {
-                    if spaces != 0 {
-                        // Don't allow tabs after spaces as part of indentation.
-                        // This is technically stricter than python3 but spaces before
-                        // tabs is even more insane than mixing spaces and tabs.
-                        return Err(LexicalError {
-                            error: LexicalErrorType::TabsAfterSpaces,
-                            location: self.get_pos(),
-                        });
-                    }
-                    self.next_char();
-                    tabs += 1;
-                }
-                Some('#') => {
-                    self.lex_and_emit_comment()?;
-                    spaces = 0;
-                    tabs = 0;
-                }
-                Some('\x0C') => {
-                    // Form feed character!
-                    // Reset indentation for the Emacs user.
-                    self.next_char();
-                    spaces = 0;
-                    tabs = 0;
-                }
-                Some('\n' | '\r') => {
-                    // Empty line!
-                    #[cfg(feature = "full-lexer")]
-                    let tok_start = self.get_pos();
-                    self.next_char();
-                    #[cfg(feature = "full-lexer")]
-                    let tok_end = self.get_pos();
-                    #[cfg(feature = "full-lexer")]
-                    self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end)));
-                    spaces = 0;
-                    tabs = 0;
-                }
-                None => {
-                    spaces = 0;
-                    tabs = 0;
-                    break;
-                }
-                _ => {
-                    self.at_begin_of_line = false;
-                    break;
-                }
-            }
-        }
-
-        Ok(IndentationLevel { tabs, spaces })
-    }
-
-    // Push/pop indents/dedents based on the current indentation level.
-    fn handle_indentations(&mut self) -> Result<(), LexicalError> {
-        let indentation_level = self.eat_indentation()?;
-
-        if self.nesting != 0 {
-            return Ok(());
-        }
-
-        // Determine indent or dedent:
-        let current_indentation = self.indentations.current();
-        let ordering = indentation_level.compare_strict(current_indentation, self.get_pos())?;
-        match ordering {
-            Ordering::Equal => {
-                // Same same
-            }
-            Ordering::Greater => {
-                // New indentation level:
-                self.indentations.push(indentation_level);
-                let tok_pos = self.get_pos();
-                self.emit((
-                    Tok::Indent,
-                    TextRange::new(
-                        tok_pos
-                            - TextSize::new(indentation_level.spaces)
-                            - TextSize::new(indentation_level.tabs),
-                        tok_pos,
-                    ),
-                ));
-            }
-            Ordering::Less => {
-                // One or more dedentations
-                // Pop off other levels until col is found:
-
-                loop {
-                    let current_indentation = self.indentations.current();
-                    let ordering =
-                        indentation_level.compare_strict(current_indentation, self.get_pos())?;
-                    match ordering {
-                        Ordering::Less => {
-                            self.indentations.pop();
-                            let tok_pos = self.get_pos();
-                            self.emit((Tok::Dedent, TextRange::empty(tok_pos)));
-                        }
-                        Ordering::Equal => {
-                            // We arrived at proper level of indentation.
-                            break;
-                        }
-                        Ordering::Greater => {
-                            return Err(LexicalError {
-                                error: LexicalErrorType::IndentationError,
-                                location: self.get_pos(),
-                            });
-                        }
-                    }
-                }
-            }
-        }
-        Ok(())
-    }
+    fn eat_logical_line_trivia(&mut self) -> Result<Option<Spanned>, LexicalError> {
+        let mut indentation = Indentation::root();
+
+        // Eat over any leading whitespace
+        self.cursor.start_token();
+        self.cursor.eat_while(|c| {
+            if c == ' ' {
+                indentation = indentation.add_space();
+                true
+            } else if c == '\t' {
+                indentation = indentation.add_tab();
+                true
+            } else if c == '\x0C' {
+                indentation = Indentation::root();
+                true
+            } else {
+                false
+            }
+        });
+
+        let token = match self.cursor.first() {
+            c @ ('%' | '!' | '?' | '/' | ';' | ',') if self.mode == Mode::Jupyter => {
+                self.cursor.start_token();
+                self.cursor.bump();
+                let kind = if let Ok(kind) = MagicKind::try_from([c, self.cursor.first()]) {
+                    self.cursor.bump();
+                    kind
+                } else {
+                    MagicKind::try_from(c).unwrap()
+                };
+
+                self.lex_magic_command(kind)
+            }
+
+            '#' => {
+                self.cursor.start_token();
+                self.cursor.bump();
+
+                self.lex_comment()?
+            }
+            '\n' => {
+                self.cursor.start_token();
+                self.cursor.bump();
+                Tok::NonLogicalNewline
+            }
+            // `\r` or `\r\n`
+            '\r' => {
+                self.cursor.start_token();
+                self.cursor.bump();
+                self.cursor.eat_char('\n');
+                Tok::NonLogicalNewline
+            }
+
+            EOF_CHAR => {
+                // handled by consume end of line
+                return Ok(None);
+            }
+
+            _ => {
+                self.at_begin_of_line = false;
+
+                return self.handle_indentation(indentation);
+            }
+        };
+
+        Ok(Some((token, self.token_range())))
+    }
+
+    fn handle_indentation(
+        &mut self,
+        indentation: Indentation,
+    ) -> Result<Option<Spanned>, LexicalError> {
+        let token = match self.indentations.current().try_compare(&indentation) {
+            // Dedent
+            Ok(Ordering::Greater) => {
+                self.indentations.pop();
+                self.pending_indentation = Some(indentation);
+
+                Some((Tok::Dedent, TextRange::empty(self.offset())))
+            }
+
+            Ok(Ordering::Equal) => None,
+
+            // Indent
+            Ok(Ordering::Less) => {
+                self.indentations.push(indentation);
+                Some((Tok::Indent, self.token_range()))
+            }
+            Err(_) => {
+                return Err(LexicalError {
+                    error: LexicalErrorType::IndentationError,
+                    location: self.offset(),
+                });
+            }
+        };
+
+        Ok(token)
+    }
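The companion parser/src/lexer/indentation.rs (133 added lines per the diffstat) is likewise not included in this excerpt. Based on the calls above (Indentation::root, add_space, add_tab, the fallible try_compare) and the never-popped root level of the old code it replaces, a plausible self-contained sketch:

// --- sketch, not part of the patch: assumed shape of lexer/indentation.rs ---
use std::cmp::Ordering;

#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub struct Indentation {
    tabs: u32,
    spaces: u32,
}

impl Indentation {
    pub const fn root() -> Self {
        Self { tabs: 0, spaces: 0 }
    }

    pub fn add_space(self) -> Self {
        Self { spaces: self.spaces + 1, ..self }
    }

    pub fn add_tab(self) -> Self {
        Self { tabs: self.tabs + 1, ..self }
    }

    // The ordering is only well-defined when tabs and spaces agree in direction;
    // otherwise the result depends on the tab size and must be a TabError.
    pub fn try_compare(&self, other: &Indentation) -> Result<Ordering, UnexpectedIndentation> {
        let tab_cmp = self.tabs.cmp(&other.tabs);
        let space_cmp = self.spaces.cmp(&other.spaces);
        if tab_cmp == space_cmp || tab_cmp == Ordering::Equal {
            Ok(space_cmp)
        } else if space_cmp == Ordering::Equal {
            Ok(tab_cmp)
        } else {
            Err(UnexpectedIndentation)
        }
    }
}

#[derive(Debug)]
pub struct UnexpectedIndentation;

// The stack always keeps the implicit root level, mirroring CPython's tokenizer
// (and the removed `Indentations` type above).
#[derive(Debug)]
pub struct Indentations {
    stack: Vec<Indentation>,
}

impl Default for Indentations {
    fn default() -> Self {
        Self { stack: vec![Indentation::root()] }
    }
}

impl Indentations {
    pub fn push(&mut self, indent: Indentation) {
        self.stack.push(indent);
    }

    pub fn pop(&mut self) -> Option<Indentation> {
        // Never pop the root level.
        (self.stack.len() > 1).then(|| self.stack.pop()).flatten()
    }

    pub fn current(&self) -> &Indentation {
        self.stack.last().expect("there is always at least the root level")
    }
}
// --- end sketch ---

Note how pending_indentation plus this stack replaces the old pending token queue: instead of pushing several Dedent tokens at once, next_token re-checks the stored indentation on each call and emits one Dedent at a time.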
-            while !self.indentations.is_empty() {
+            _ => {
+                self.at_begin_of_line = false;
+
+                return self.handle_indentation(indentation);
+            }
+        };
+
+        Ok(Some((token, self.token_range())))
+    }
+
+    fn handle_indentation(
+        &mut self,
+        indentation: Indentation,
+    ) -> Result<Option<Spanned>, LexicalError> {
+        let token = match self.indentations.current().try_compare(&indentation) {
+            // Dedent
+            Ok(Ordering::Greater) => {
                 self.indentations.pop();
-                self.emit((Tok::Dedent, TextRange::empty(tok_pos)));
+                self.pending_indentation = Some(indentation);
+
+                Some((Tok::Dedent, TextRange::empty(self.offset())))
+            }
+
+            Ok(Ordering::Equal) => None,
+
+            // Indent
+            Ok(Ordering::Less) => {
+                self.indentations.push(indentation);
+                Some((Tok::Indent, self.token_range()))
             }
+            Err(_) => {
+                return Err(LexicalError {
+                    error: LexicalErrorType::IndentationError,
+                    location: self.offset(),
+                });
+            }
+        };
 
-        self.emit((Tok::EndOfFile, TextRange::empty(tok_pos)));
+        Ok(token)
+    }
+
+    // Take a look at the next character, if any, and decide upon the next steps.
+    fn consume_normal(&mut self, first: char) -> Result<Option<Spanned>, LexicalError> {
+        if first.is_ascii() {
+            self.consume_ascii_character(first)
+        } else if is_unicode_identifier_start(first) {
+            let identifier = self.lex_identifier(first)?;
+            Ok(Some((identifier, self.token_range())))
+        } else if is_emoji_presentation(first) {
+            Ok(Some((
+                Tok::Name {
+                    name: first.to_string(),
+                },
+                self.token_range(),
+            )))
+        } else {
+            Err(LexicalError {
+                error: LexicalErrorType::UnrecognizedToken { tok: first },
+                location: self.token_start(),
+            })
+        }
+    }
+
+    fn consume_end(&mut self) -> Result<Spanned, LexicalError> {
+        // We reached end of file.
+        // First of all, we need all nestings to be finished.
+        if self.nesting > 0 {
+            return Err(LexicalError {
+                error: LexicalErrorType::Eof,
+                location: self.offset(),
+            });
         }
 
-        Ok(())
+        // Next, insert a trailing newline, if required.
+        if !self.at_begin_of_line {
+            self.at_begin_of_line = true;
+            Ok((Tok::Newline, TextRange::empty(self.offset())))
+        }
+        // Next, flush the indentation stack to zero.
+        else if self.indentations.pop().is_some() {
+            Ok((Tok::Dedent, TextRange::empty(self.offset())))
+        } else {
+            Ok((Tok::EndOfFile, TextRange::empty(self.offset())))
+        }
     }
 
     // Dispatch based on the given character.
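Before the character dispatch that follows, it is worth spelling out what the end-of-file path above produces: `consume_end` is re-entered once per remaining pending token, so a fully drained lexer emits a synthesized trailing `Newline` (when the source does not already end in one), then one `Dedent` per still-open indentation level, and finally `EndOfFile`, which the `Iterator` implementation later in this patch maps to `None`. A minimal sketch using the crate's public `lex` entry point (the listed token sequence is illustrative, not part of the patch):

```rust
use rustpython_parser::{lexer::lex, Mode};

fn main() {
    // The source has no trailing newline and one open indentation level,
    // so the lexer synthesizes a `Newline` and flushes a single `Dedent`.
    let tokens: Vec<_> = lex("if x:\n    pass", Mode::Module)
        .map(|result| result.unwrap().0)
        .collect();

    // Roughly: If, Name("x"), Colon, Newline, Indent, Pass, Newline, Dedent
    println!("{tokens:?}");
}
```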
- fn consume_character(&mut self, c: char) -> Result<(), LexicalError> { - match c { - '0'..='9' => { - let number = self.lex_number()?; - self.emit(number); - } - '#' => { - self.lex_and_emit_comment()?; - } - '"' | '\'' => { - let string = self.lex_string(StringKind::String)?; - self.emit(string); - } + fn consume_ascii_character(&mut self, c: char) -> Result, LexicalError> { + let token = match c { + c if is_ascii_identifier_start(c) => self.lex_identifier(c)?, + '0'..='9' => self.lex_number(c)?, + '#' => self.lex_comment()?, + '"' | '\'' => self.lex_string(StringKind::String, c)?, '=' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::EqEqual, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Equal, TextRange::new(tok_start, tok_end))); - } + if self.cursor.eat_char('=') { + Tok::EqEqual + } else { + Tok::Equal } } '+' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::PlusEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::PlusEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Plus, TextRange::new(tok_start, tok_end))); + Tok::Plus } } '*' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::StarEqual, TextRange::new(tok_start, tok_end))); - } - Some('*') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::DoubleStarEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::DoubleStar, TextRange::new(tok_start, tok_end))); - } - } - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Star, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::StarEqual + } else if self.cursor.eat_char('*') { + if self.cursor.eat_char('=') { + Tok::DoubleStarEqual + } else { + Tok::DoubleStar } + } else { + Tok::Star } } '/' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::SlashEqual, TextRange::new(tok_start, tok_end))); - } - Some('/') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::DoubleSlashEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::DoubleSlash, TextRange::new(tok_start, tok_end))); - } - } - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Slash, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::SlashEqual + } else if self.cursor.eat_char('/') { + if self.cursor.eat_char('=') { + Tok::DoubleSlashEqual + } else { + Tok::DoubleSlash } + } else { + Tok::Slash } } '%' => { - if self.mode == Mode::Jupyter && self.nesting == 0 && self.last_token_is_equal { - self.lex_and_emit_magic_command(); + if self.cursor.eat_char('=') { + Tok::PercentEqual } else { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::PercentEqual, TextRange::new(tok_start, tok_end))); - } 
else { - let tok_end = self.get_pos(); - self.emit((Tok::Percent, TextRange::new(tok_start, tok_end))); - } + Tok::Percent } } '|' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::VbarEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::VbarEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Vbar, TextRange::new(tok_start, tok_end))); + Tok::Vbar } } '^' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::CircumflexEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::CircumflexEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::CircumFlex, TextRange::new(tok_start, tok_end))); + Tok::CircumFlex } } '&' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::AmperEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::AmperEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Amper, TextRange::new(tok_start, tok_end))); + Tok::Amper } } '-' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::MinusEqual, TextRange::new(tok_start, tok_end))); - } - Some('>') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::Rarrow, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Minus, TextRange::new(tok_start, tok_end))); - } + if self.cursor.eat_char('=') { + Tok::MinusEqual + } else if self.cursor.eat_char('>') { + Tok::Rarrow + } else { + Tok::Minus } } '@' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::AtEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::AtEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::At, TextRange::new(tok_start, tok_end))); + Tok::At } } '!' => { - if self.mode == Mode::Jupyter && self.nesting == 0 && self.last_token_is_equal { - self.lex_and_emit_magic_command(); + if self.cursor.eat_char('=') { + Tok::NotEqual } else { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::NotEqual, TextRange::new(tok_start, tok_end))); - } else { - return Err(LexicalError { - error: LexicalErrorType::UnrecognizedToken { tok: '!' }, - location: tok_start, - }); - } + return Err(LexicalError { + error: LexicalErrorType::UnrecognizedToken { tok: '!' 
}, + location: self.token_start(), + }); } } - '~' => { - self.eat_single_char(Tok::Tilde); - } + '~' => Tok::Tilde, '(' => { - self.eat_single_char(Tok::Lpar); self.nesting += 1; + Tok::Lpar } ')' => { - self.eat_single_char(Tok::Rpar); - if self.nesting == 0 { - return Err(LexicalError { - error: LexicalErrorType::NestingError, - location: self.get_pos(), - }); - } - self.nesting -= 1; + self.nesting = self.nesting.saturating_sub(1); + Tok::Rpar } '[' => { - self.eat_single_char(Tok::Lsqb); self.nesting += 1; + Tok::Lsqb } ']' => { - self.eat_single_char(Tok::Rsqb); - if self.nesting == 0 { - return Err(LexicalError { - error: LexicalErrorType::NestingError, - location: self.get_pos(), - }); - } - self.nesting -= 1; + self.nesting = self.nesting.saturating_sub(1); + Tok::Rsqb } '{' => { - self.eat_single_char(Tok::Lbrace); self.nesting += 1; + Tok::Lbrace } '}' => { - self.eat_single_char(Tok::Rbrace); - if self.nesting == 0 { - return Err(LexicalError { - error: LexicalErrorType::NestingError, - location: self.get_pos(), - }); - } - self.nesting -= 1; + self.nesting = self.nesting.saturating_sub(1); + Tok::Rbrace } ':' => { - let tok_start = self.get_pos(); - self.next_char(); - if let Some('=') = self.window[0] { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::ColonEqual, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('=') { + Tok::ColonEqual } else { - let tok_end = self.get_pos(); - self.emit((Tok::Colon, TextRange::new(tok_start, tok_end))); + Tok::Colon } } - ';' => { - self.eat_single_char(Tok::Semi); - } + ';' => Tok::Semi, '<' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('<') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::LeftShiftEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::LeftShift, TextRange::new(tok_start, tok_end))); - } - } - } - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::LessEqual, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Less, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('<') { + if self.cursor.eat_char('=') { + Tok::LeftShiftEqual + } else { + Tok::LeftShift } + } else if self.cursor.eat_char('=') { + Tok::LessEqual + } else { + Tok::Less } } '>' => { - let tok_start = self.get_pos(); - self.next_char(); - match self.window[0] { - Some('>') => { - self.next_char(); - match self.window[0] { - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::RightShiftEqual, - TextRange::new(tok_start, tok_end), - )); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::RightShift, TextRange::new(tok_start, tok_end))); - } - } - } - Some('=') => { - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::GreaterEqual, TextRange::new(tok_start, tok_end))); - } - _ => { - let tok_end = self.get_pos(); - self.emit((Tok::Greater, TextRange::new(tok_start, tok_end))); + if self.cursor.eat_char('>') { + if self.cursor.eat_char('=') { + Tok::RightShiftEqual + } else { + Tok::RightShift } + } else if self.cursor.eat_char('=') { + Tok::GreaterEqual + } else { + Tok::Greater } } - ',' => { - self.eat_single_char(Tok::Comma); - } + ',' => Tok::Comma, '.' 
=> { - if let Some('0'..='9') = self.window[1] { - let number = self.lex_number()?; - self.emit(number); + if self.cursor.first().is_ascii_digit() { + self.lex_decimal_number('.')? + } else if self.cursor.first() == '.' && self.cursor.second() == '.' { + self.cursor.bump(); + self.cursor.bump(); + Tok::Ellipsis } else { - let tok_start = self.get_pos(); - self.next_char(); - if self.window[..2] == [Some('.'); 2] { - self.next_char(); - self.next_char(); - let tok_end = self.get_pos(); - self.emit((Tok::Ellipsis, TextRange::new(tok_start, tok_end))); - } else { - let tok_end = self.get_pos(); - self.emit((Tok::Dot, TextRange::new(tok_start, tok_end))); - } + Tok::Dot } } - '\n' | '\r' => { - let tok_start = self.get_pos(); - self.next_char(); - let tok_end = self.get_pos(); + '\n' => { + if self.nesting == 0 { + self.at_begin_of_line = true; + Tok::Newline + } else { + Tok::NonLogicalNewline + } + } + '\r' => { + self.cursor.eat_char('\n'); - // Depending on the nesting level, we emit a logical or - // non-logical newline: if self.nesting == 0 { self.at_begin_of_line = true; - self.emit((Tok::Newline, TextRange::new(tok_start, tok_end))); + Tok::Newline } else { - #[cfg(feature = "full-lexer")] - self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end))); + Tok::NonLogicalNewline } } ' ' | '\t' | '\x0C' => { - // Skip white-spaces - self.next_char(); - while let Some(' ' | '\t' | '\x0C') = self.window[0] { - self.next_char(); - } + self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C')); + return Ok(None); } - '\\' => { - self.next_char(); - match self.window[0] { - Some('\n' | '\r') => { - self.next_char(); - } - _ => { - return Err(LexicalError { - error: LexicalErrorType::LineContinuationError, - location: self.get_pos(), - }); - } - } - if self.window[0].is_none() { + '\\' => { + if self.cursor.eat_char('\r') { + self.cursor.eat_char('\n'); + } else if self.cursor.is_eof() { return Err(LexicalError { error: LexicalErrorType::Eof, - location: self.get_pos(), + location: self.token_start(), }); - } - } - _ => { - if is_emoji_presentation(c) { - let tok_start = self.get_pos(); - self.next_char(); - let tok_end = self.get_pos(); - self.emit(( - Tok::Name { - name: c.to_string(), - }, - TextRange::new(tok_start, tok_end), - )); - } else { - let c = self.next_char(); + } else if !self.cursor.eat_char('\n') { return Err(LexicalError { - error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() }, - location: self.get_pos(), + error: LexicalErrorType::LineContinuationError, + location: self.token_start(), }); } + return Ok(None); } - } - Ok(()) + _ => { + return Err(LexicalError { + error: LexicalErrorType::UnrecognizedToken { tok: c }, + location: self.token_start(), + }); + } + }; + + Ok(Some((token, self.token_range()))) } - // Used by single character tokens to advance the window and emit the correct token. - fn eat_single_char(&mut self, ty: Tok) { - let tok_start = self.get_pos(); - self.next_char().unwrap_or_else(|| unsafe { - // SAFETY: eat_single_char has been called only after a character has been read - // from the window, so the window is guaranteed to be non-empty. - std::hint::unreachable_unchecked() - }); - let tok_end = self.get_pos(); - self.emit((ty, TextRange::new(tok_start, tok_end))); - } - - // Helper function to go to the next character coming up. 
-    fn next_char(&mut self) -> Option<char> {
-        let mut c = self.window[0];
-        self.window.slide();
-        match c {
-            Some('\r') => {
-                if self.window[0] == Some('\n') {
-                    self.location += TextSize::from(1);
-                    self.window.slide();
-                }
+    #[inline]
+    fn token_range(&self) -> TextRange {
+        let end = self.offset();
+        let len = self.cursor.token_len();
 
-                self.location += TextSize::from(1);
-                c = Some('\n');
-            }
-            #[allow(unused_variables)]
-            Some(c) => {
-                self.location += c.text_len();
-            }
-            _ => {}
-        }
-        c
+        TextRange::at(end - len, len)
     }
 
-    // Helper function to retrieve the current position.
-    fn get_pos(&self) -> TextSize {
-        self.location
+    #[inline]
+    fn token_text(&self) -> &'source str {
+        &self.source[self.token_range()]
     }
 
-    // Helper function to emit a lexed token to the queue of tokens.
-    fn emit(&mut self, spanned: Spanned) {
-        self.last_token_is_equal = matches!(spanned.0, Tok::Equal);
-        self.pending.push(spanned);
+    #[inline]
+    fn offset(&self) -> TextSize {
+        TextSize::new(self.source.len() as u32) - self.cursor.text_len()
+    }
+
+    #[inline]
+    fn token_start(&self) -> TextSize {
+        self.token_range().start()
     }
 }
 
 // Implement iterator pattern for Lexer.
 // Calling the next element in the iterator will yield the next lexical
 // token.
-impl<T> Iterator for Lexer<T>
-where
-    T: Iterator<Item = char>,
-{
+impl Iterator for Lexer<'_> {
     type Item = LexResult;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let token = self.inner_next();
-        trace!(
-            "Lex token {:?}, nesting={:?}, indent stack: {:?}",
-            token,
-            self.nesting,
-            self.indentations,
-        );
+        let token = self.next_token();
 
         match token {
             Ok((Tok::EndOfFile, _)) => None,
@@ -1336,6 +929,8 @@ where
     }
 }
 
+impl FusedIterator for Lexer<'_> {}
+
 /// Represents an error that occurs during lexing and is
 /// returned by the `parse_*` functions in the iterator in the
 /// [lexer] implementation.
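The `Iterator` implementation above ends the stream by turning `Tok::EndOfFile` into `None`, and the new `FusedIterator` marker guarantees the stream stays exhausted after that, so downstream consumers can safely `collect()` or chain adapters. A short usage sketch (only `lex`, `Mode`, and the public `LexicalError` fields are assumed from this crate; the printing is illustrative):

```rust
use rustpython_parser::{lexer::lex, Mode};

fn main() {
    // Each item is a LexResult: Ok((Tok, TextRange)) or Err(LexicalError).
    for result in lex("x = 1", Mode::Module) {
        match result {
            Ok((tok, range)) => println!("{range:?} {tok:?}"),
            Err(err) => eprintln!("lex error at {:?}: {:?}", err.location, err.error),
        }
    }
}
```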
@@ -1442,10 +1037,63 @@ impl std::fmt::Display for LexicalErrorType { } } +#[derive(Copy, Clone, Debug)] +enum Radix { + Binary, + Octal, + Decimal, + Hex, +} + +impl Radix { + const fn as_u32(self) -> u32 { + match self { + Radix::Binary => 2, + Radix::Octal => 8, + Radix::Decimal => 10, + Radix::Hex => 16, + } + } + + const fn is_digit(self, c: char) -> bool { + match self { + Radix::Binary => matches!(c, '0'..='1'), + Radix::Octal => matches!(c, '0'..='7'), + Radix::Decimal => c.is_ascii_digit(), + Radix::Hex => matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F'), + } + } +} + +const fn is_quote(c: char) -> bool { + matches!(c, '\'' | '"') +} + +const fn is_ascii_identifier_start(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_') +} + +// Checks if the character c is a valid starting character as described +// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers +fn is_unicode_identifier_start(c: char) -> bool { + is_xid_start(c) +} + +// Checks if the character c is a valid continuation character as described +// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers +fn is_identifier_continuation(c: char) -> bool { + match c { + 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => true, + c => is_xid_continue(c), + } +} + #[cfg(test)] mod tests { + use num_bigint::BigInt; + use rustpython_ast::MagicKind; + use super::*; - use crate::ast::bigint::BigInt; const WINDOWS_EOL: &str = "\r\n"; const MAC_EOL: &str = "\r"; @@ -1482,13 +1130,10 @@ mod tests { let tokens = lex_jupyter_source(&source); assert_eq!( tokens, - vec![ - Tok::MagicCommand { - value: "matplotlib --inline".to_string(), - kind: MagicKind::Magic - }, - Tok::Newline - ] + vec![Tok::MagicCommand { + value: "matplotlib --inline".to_string(), + kind: MagicKind::Magic + },] ) } @@ -1512,13 +1157,10 @@ mod tests { let tokens = lex_jupyter_source(&source); assert_eq!( tokens, - vec![ - Tok::MagicCommand { - value: "matplotlib ".to_string(), - kind: MagicKind::Magic - }, - Tok::Newline - ] + vec![Tok::MagicCommand { + value: "matplotlib ".to_string(), + kind: MagicKind::Magic + },] ) } @@ -1548,47 +1190,46 @@ mod tests { value: "".to_string(), kind: MagicKind::Magic, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Magic2, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Shell, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::ShCap, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help2, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Paren, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Quote, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Quote2, }, - Tok::Newline, ] ) } @@ -1607,8 +1248,10 @@ mod tests { !!cd /Users/foo/Library/Application\ Support/ /foo 1 2 ,foo 1 2 -;foo 1 2" - .trim(); +;foo 1 2 + !ls +" + .trim(); let tokens = lex_jupyter_source(source); assert_eq!( tokens, @@ -1617,56 +1260,59 @@ mod tests { value: "foo".to_string(), kind: MagicKind::Help, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo".to_string(), kind: MagicKind::Help2, }, - Tok::Newline, + 
Tok::NonLogicalNewline, Tok::MagicCommand { value: "timeit a = b".to_string(), kind: MagicKind::Magic, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "timeit a % 3".to_string(), kind: MagicKind::Magic, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "matplotlib --inline".to_string(), kind: MagicKind::Magic, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "pwd && ls -a | sed 's/^/\\\\ /'".to_string(), kind: MagicKind::Shell, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "cd /Users/foo/Library/Application\\ Support/".to_string(), kind: MagicKind::ShCap, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Paren, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote, }, - Tok::Newline, + Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote2, }, - Tok::Newline, + Tok::NonLogicalNewline, + Tok::MagicCommand { + value: "ls".to_string(), + kind: MagicKind::Shell, + }, ] ) } - #[test] fn test_jupyter_magic_indentation() { let source = r" @@ -1693,84 +1339,6 @@ if True: ) } - #[test] - fn test_jupyter_magic_assignment() { - let source = r" -pwd = !pwd -foo = %timeit a = b -bar = %timeit a % 3 -baz = %matplotlib \ - inline" - .trim(); - let tokens = lex_jupyter_source(source); - assert_eq!( - tokens, - vec![ - Tok::Name { - name: "pwd".to_string() - }, - Tok::Equal, - Tok::MagicCommand { - value: "pwd".to_string(), - kind: MagicKind::Shell, - }, - Tok::Newline, - Tok::Name { - name: "foo".to_string() - }, - Tok::Equal, - Tok::MagicCommand { - value: "timeit a = b".to_string(), - kind: MagicKind::Magic, - }, - Tok::Newline, - Tok::Name { - name: "bar".to_string() - }, - Tok::Equal, - Tok::MagicCommand { - value: "timeit a % 3".to_string(), - kind: MagicKind::Magic, - }, - Tok::Newline, - Tok::Name { - name: "baz".to_string() - }, - Tok::Equal, - Tok::MagicCommand { - value: "matplotlib inline".to_string(), - kind: MagicKind::Magic, - }, - Tok::Newline, - ] - ) - } - - fn assert_no_jupyter_magic(tokens: &[Tok]) { - for tok in tokens { - if let Tok::MagicCommand { .. } = tok { - panic!("Unexpected magic command token: {:?}", tok) - } - } - } - - #[test] - fn test_jupyter_magic_not_an_assignment() { - let source = r" -# Other magic kinds are not valid here (can't test `foo = ?str` because '?' 
is not a valid token) -foo = /func -foo = ;func -foo = ,func - -(foo == %timeit a = b) -(foo := %timeit a = b) -def f(arg=%timeit a = b): - pass" - .trim(); - let tokens = lex_jupyter_source(source); - assert_no_jupyter_magic(&tokens); - } - #[test] fn test_numbers() { let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j"; @@ -1816,7 +1384,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!(r"99232 # {}", $eol); let tokens = lex_source(&source); @@ -1837,7 +1405,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("123 # Foo{}456", $eol); let tokens = lex_source(&source); @@ -1893,7 +1461,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{} return 99{}{}", $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1931,7 +1499,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{} if x:{}{} return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1972,7 +1540,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); @@ -2025,7 +1593,7 @@ def f(arg=%timeit a = b): ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] + fn $name() { let source = r"x = [ @@ -2088,7 +1656,7 @@ def f(arg=%timeit a = b): } #[test] - #[cfg(feature = "full-lexer")] + fn test_non_logical_newline_in_string_continuation() { let source = r"( 'a' @@ -2118,7 +1686,7 @@ def f(arg=%timeit a = b): } #[test] - #[cfg(feature = "full-lexer")] + fn test_logical_newline_line_comment() { let source = "#Hello\n#World\n"; let tokens = lex_source(source); diff --git a/parser/src/lexer/cursor.rs b/parser/src/lexer/cursor.rs new file mode 100644 index 00000000..90f9f7b2 --- /dev/null +++ b/parser/src/lexer/cursor.rs @@ -0,0 +1,108 @@ +use crate::text_size::{TextLen, TextSize}; +use std::str::Chars; + +pub(crate) const EOF_CHAR: char = '\0'; + +#[derive(Clone, Debug)] +pub(super) struct Cursor<'a> { + chars: Chars<'a>, + source_length: TextSize, + #[cfg(debug_assertions)] + prev_char: char, +} + +impl<'a> Cursor<'a> { + pub fn new(source: &'a str) -> Self { + Self { + source_length: source.text_len(), + chars: source.chars(), + #[cfg(debug_assertions)] + prev_char: EOF_CHAR, + } + } + + /// Returns the previous token. Useful for debug assertions. + #[cfg(debug_assertions)] + pub(super) const fn previous(&self) -> char { + self.prev_char + } + + /// Peeks the next character from the input stream without consuming it. + /// Returns [EOF_CHAR] if the file is at the end of the file. + pub(super) fn first(&self) -> char { + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Peeks the second character from the input stream without consuming it. + /// Returns [EOF_CHAR] if the position is past the end of the file. + pub(super) fn second(&self) -> char { + let mut chars = self.chars.clone(); + chars.next(); + chars.next().unwrap_or(EOF_CHAR) + } + + /// Peeks the third character from the input stream without consuming it. 
+    /// Returns the remaining, not-yet-consumed text of the source.
+    pub(super) fn rest(&self) -> &'a str {
+        self.chars.as_str()
+    }
+
+    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
+    #[allow(clippy::cast_possible_truncation)]
+    pub(super) fn text_len(&self) -> TextSize {
+        TextSize::new(self.chars.as_str().len() as u32)
+    }
+
+    pub(super) fn token_len(&self) -> TextSize {
+        self.source_length - self.text_len()
+    }
+
+    pub(super) fn start_token(&mut self) {
+        self.source_length = self.text_len()
+    }
+
+    pub(super) fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Consumes the next character.
+    pub(super) fn bump(&mut self) -> Option<char> {
+        let prev = self.chars.next()?;
+
+        #[cfg(debug_assertions)]
+        {
+            self.prev_char = prev;
+        }
+
+        Some(prev)
+    }
+
+    pub(super) fn eat_char(&mut self, c: char) -> bool {
+        if self.first() == c {
+            self.bump();
+            true
+        } else {
+            false
+        }
+    }
+
+    pub(super) fn eat_if<F>(&mut self, mut predicate: F) -> Option<char>
+    where
+        F: FnMut(char) -> bool,
+    {
+        if predicate(self.first()) && !self.is_eof() {
+            self.bump()
+        } else {
+            None
+        }
+    }
+
+    /// Eats symbols while the predicate returns true or until the end of file is reached.
+    pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // An optimized version of this was tried for e.g. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
+    }
+}
diff --git a/parser/src/lexer/indentation.rs b/parser/src/lexer/indentation.rs
new file mode 100644
index 00000000..a268679b
--- /dev/null
+++ b/parser/src/lexer/indentation.rs
@@ -0,0 +1,133 @@
+use static_assertions::assert_eq_size;
+use std::cmp::Ordering;
+use std::fmt::Debug;
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Column(u32);
+
+impl Column {
+    pub(super) const fn new(column: u32) -> Self {
+        Self(column)
+    }
+}
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Character(u32);
+
+impl Character {
+    pub(super) const fn new(characters: u32) -> Self {
+        Self(characters)
+    }
+}
+
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Default)]
+pub(super) struct Indentation {
+    column: Column,
+    character: Character,
+}
+
+impl Indentation {
+    pub(super) const fn root() -> Self {
+        Self {
+            column: Column::new(0),
+            character: Character::new(0),
+        }
+    }
+
+    #[cfg(test)]
+    pub(super) const fn new(column: Column, character: Character) -> Self {
+        Self { character, column }
+    }
+
+    #[must_use]
+    pub(super) fn add_space(self) -> Self {
+        Self {
+            character: Character(self.character.0 + 1),
+            column: Column(self.column.0 + 1),
+        }
+    }
+
+    #[must_use]
+    pub(super) fn add_tab(self) -> Self {
+        Self {
+            character: Character(self.character.0 + 1),
+            column: Column((self.column.0 / 2 + 1) * 2),
+        }
+    }
+
+    pub(super) fn try_compare(
+        &self,
+        other: &Indentation,
+    ) -> Result<Ordering, UnexpectedIndentation> {
+        let column_ordering = self.column.cmp(&other.column);
+        let character_ordering = self.character.cmp(&other.character);
+
+        if column_ordering == character_ordering {
+            Ok(column_ordering)
+        } else {
+            Err(UnexpectedIndentation)
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub(super) struct UnexpectedIndentation;
+
+// The indentations stack is used to keep track of the current indentation level.
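+//
+// For example: starting from `Indentation::root()`, a single `add_tab()` yields
+// (column 2, character 1) because a tab advances the column to the next multiple
+// of two, while `add_space().add_space()` yields (column 2, character 2).
+// `try_compare` then sees equal columns but unequal character counts and returns
+// `Err(UnexpectedIndentation)`, which the lexer reports as an indentation error.
+//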
+// Similar to the CPython implementation, the Indentations stack always has at +// least one level which is never popped. See Reference 2.1.8. +#[derive(Debug, Clone)] +pub(super) struct Indentations { + stack: Vec, +} + +impl Indentations { + pub fn is_empty(&self) -> bool { + self.stack.len() == 1 + } + + pub fn push(&mut self, indent: Indentation) { + debug_assert_eq!(self.current().try_compare(&indent), Ok(Ordering::Less)); + + self.stack.push(indent); + } + + pub fn pop(&mut self) -> Option { + if self.is_empty() { + None + } else { + self.stack.pop() + } + } + + pub fn current(&self) -> &Indentation { + self.stack.last().expect("Expected indentation") + } +} + +impl Default for Indentations { + fn default() -> Self { + Self { + stack: vec![Indentation::root()], + } + } +} + +assert_eq_size!(Indentation, u64); + +#[cfg(test)] +mod tests { + use super::{Character, Column, Indentation}; + use std::cmp::Ordering; + + #[test] + fn indentation_try_compare() { + let tab = Indentation::new(Column::new(8), Character::new(1)); + + assert_eq!(tab.try_compare(&tab), Ok(Ordering::Equal)); + + let two_tabs = Indentation::new(Column::new(16), Character::new(2)); + assert_eq!(two_tabs.try_compare(&tab), Ok(Ordering::Greater)); + assert_eq!(tab.try_compare(&two_tabs), Ok(Ordering::Less)); + } +} diff --git a/parser/src/parser.rs b/parser/src/parser.rs index c2dac39e..2748dc53 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -12,6 +12,12 @@ //! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree //! [`Mode`]: crate::mode +use std::iter; + +use itertools::Itertools; +pub(super) use lalrpop_util::ParseError as LalrpopError; + +use crate::lexer::{lex, lex_starts_at}; use crate::{ ast::{self, Ranged}, lexer::{self, LexResult, LexicalError, LexicalErrorType}, @@ -20,11 +26,6 @@ use crate::{ token::Tok, Mode, }; -use itertools::Itertools; -use std::iter; - -use crate::{lexer::Lexer, soft_keywords::SoftKeywordTransformer}; -pub(super) use lalrpop_util::ParseError as LalrpopError; /// Parse Python code string to implementor's type. /// @@ -57,44 +58,43 @@ where Self: Sized, { fn parse(source: &str, source_path: &str) -> Result { - Self::parse_starts_at(source, source_path, TextSize::default()) + let tokens = lex(source, Self::mode()); + + Self::parse_tokens(tokens, source_path) } + fn parse_without_path(source: &str) -> Result { Self::parse(source, "") } + fn parse_starts_at( source: &str, source_path: &str, offset: TextSize, ) -> Result { - let lxr = Self::lex_starts_at(source, offset); - #[cfg(feature = "full-lexer")] - let lxr = - lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)); - Self::parse_tokens(lxr, source_path) + let tokens = lex_starts_at(source, Self::mode(), offset); + + Self::parse_tokens(tokens, source_path) } - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer>; + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, ) -> Result; + + fn mode() -> Mode; } impl Parse for ast::ModModule { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - lexer::lex_starts_at(source, Mode::Module, offset) + fn mode() -> Mode { + Mode::Module } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, ) -> Result { - match parse_filtered_tokens(lxr, Mode::Module, source_path)? { + match parse_tokens(lxr, Mode::Module, source_path)? 
{ ast::Mod::Module(m) => Ok(m), _ => unreachable!("Mode::Module doesn't return other variant"), } @@ -102,17 +102,15 @@ impl Parse for ast::ModModule { } impl Parse for ast::ModExpression { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - lexer::lex_starts_at(source, Mode::Expression, offset) + fn mode() -> Mode { + Mode::Expression } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, ) -> Result { - match parse_filtered_tokens(lxr, Mode::Expression, source_path)? { + match parse_tokens(lxr, Mode::Expression, source_path)? { ast::Mod::Expression(m) => Ok(m), _ => unreachable!("Mode::Module doesn't return other variant"), } @@ -120,17 +118,14 @@ impl Parse for ast::ModExpression { } impl Parse for ast::ModInteractive { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - lexer::lex_starts_at(source, Mode::Interactive, offset) + fn mode() -> Mode { + Mode::Interactive } fn parse_tokens( lxr: impl IntoIterator, source_path: &str, ) -> Result { - match parse_filtered_tokens(lxr, Mode::Interactive, source_path)? { + match parse_tokens(lxr, Mode::Interactive, source_path)? { ast::Mod::Interactive(m) => Ok(m), _ => unreachable!("Mode::Module doesn't return other variant"), } @@ -138,12 +133,10 @@ impl Parse for ast::ModInteractive { } impl Parse for ast::Suite { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::ModModule::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -153,12 +146,10 @@ impl Parse for ast::Suite { } impl Parse for ast::Stmt { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::ModModule::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Module } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -186,12 +177,10 @@ impl Parse for ast::Stmt { } impl Parse for ast::Expr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::ModExpression::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -201,12 +190,10 @@ impl Parse for ast::Expr { } impl Parse for ast::Identifier { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -227,12 +214,10 @@ impl Parse for ast::Identifier { } impl Parse for ast::Constant { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - ast::Expr::lex_starts_at(source, offset) + fn mode() -> Mode { + Mode::Expression } + fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -426,7 +411,7 @@ pub fn parse_tokens( source_path: &str, ) -> Result { let lxr = lxr.into_iter(); - #[cfg(feature = "full-lexer")] + let lxr = lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. 
} | Tok::NonLogicalNewline)); parse_filtered_tokens(lxr, mode, source_path) @@ -571,8 +556,10 @@ include!("gen/parse.rs"); #[cfg(test)] mod tests { - use super::*; use crate::{ast, Parse}; + use insta::assert_debug_snapshot; + + use super::*; #[test] fn test_parse_empty() { @@ -656,7 +643,6 @@ class Foo(A, B): } #[test] - #[cfg(feature = "all-nodes-with-ranges")] fn test_parse_class_generic_types() { let source = "\ # TypeVar @@ -687,7 +673,6 @@ class Foo[X, Y: str, *U, **P](): insta::assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()); } #[test] - #[cfg(feature = "all-nodes-with-ranges")] fn test_parse_function_definition() { let source = "\ def func(a): @@ -985,6 +970,57 @@ x = type = 1 insta::assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()); } + #[test] + fn numeric_literals() { + let source = r#"x = 123456789 +x = 123456 +x = .1 +x = 1. +x = 1E+1 +x = 1E-1 +x = 1.000_000_01 +x = 123456789.123456789 +x = 123456789.123456789E123456789 +x = 123456789E123456789 +x = 123456789J +x = 123456789.123456789J +x = 0XB1ACC +x = 0B1011 +x = 0O777 +x = 0.000000006 +x = 10000 +x = 133333 +"#; + + insta::assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()); + } + + #[test] + fn numeric_literals_attribute_access() { + let source = r#"x = .1.is_integer() +x = 1. .imag +x = 1E+1.imag +x = 1E-1.real +x = 123456789.123456789.hex() +x = 123456789.123456789E123456789 .real +x = 123456789E123456789 .conjugate() +x = 123456789J.real +x = 123456789.123456789J.__add__(0b1011.bit_length()) +x = 0XB1ACC.conjugate() +x = 0B1011 .conjugate() +x = 0O777 .real +x = 0.000000006 .hex() +x = -100.0000J + +if 10 .real: + ... + +y = 100[no] +y = 100(no) +"#; + assert_debug_snapshot!(ast::Suite::parse(source, "").unwrap()) + } + #[test] fn test_match_as_identifier() { let source = r#"\ diff --git a/parser/src/python.lalrpop b/parser/src/python.lalrpop index 7b605d37..e95b21bd 100644 --- a/parser/src/python.lalrpop +++ b/parser/src/python.lalrpop @@ -3,8 +3,9 @@ // See also: file:///usr/share/doc/python/html/reference/compound_stmts.html#function-definitions // See also: https://greentreesnakes.readthedocs.io/en/latest/nodes.html#keyword +use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt, MagicKind}, + ast::{self as ast, Ranged, MagicKind}, Mode, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, diff --git a/parser/src/python.rs b/parser/src/python.rs index da673857..5245dcba 100644 --- a/parser/src/python.rs +++ b/parser/src/python.rs @@ -1,7 +1,8 @@ // auto-generated: "lalrpop 0.20.0" -// sha3: fa57e02e9e5bfceb811748310e8d17940d15b6c6e2d6191d9ae71b2e4dc435d8 +// sha3: 6a6e10102ca4897f12ff5ed33ef5aad928e18e86753214e39d3f3495951fc631 +use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt, MagicKind}, + ast::{self as ast, Ranged, MagicKind}, Mode, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, @@ -21,8 +22,9 @@ extern crate alloc; #[allow(non_snake_case, non_camel_case_types, unused_mut, unused_variables, unused_imports, unused_parens, clippy::all)] mod __parse__Top { + use num_bigint::BigInt; use crate::{ - ast::{self as ast, Ranged, bigint::BigInt, MagicKind}, + ast::{self as ast, Ranged, MagicKind}, Mode, lexer::{LexicalError, LexicalErrorType}, function::{ArgumentList, parse_args, validate_pos_params, validate_arguments}, diff --git 
a/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap new file mode 100644 index 00000000..3ad53568 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals.snap @@ -0,0 +1,440 @@ +--- +source: parser/src/parser.rs +expression: "ast::Suite::parse(source, \"\").unwrap()" +--- +[ + Assign( + StmtAssign { + range: 0..13, + targets: [ + Name( + ExprName { + range: 0..1, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 4..13, + value: Int( + 123456789, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 14..24, + targets: [ + Name( + ExprName { + range: 14..15, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 18..24, + value: Int( + 123456, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 25..31, + targets: [ + Name( + ExprName { + range: 25..26, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 29..31, + value: Float( + 0.1, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 32..38, + targets: [ + Name( + ExprName { + range: 32..33, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 36..38, + value: Float( + 1.0, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 39..47, + targets: [ + Name( + ExprName { + range: 39..40, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 43..47, + value: Float( + 10.0, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 48..56, + targets: [ + Name( + ExprName { + range: 48..49, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 52..56, + value: Float( + 0.1, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 57..73, + targets: [ + Name( + ExprName { + range: 57..58, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 61..73, + value: Float( + 1.00000001, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 74..97, + targets: [ + Name( + ExprName { + range: 74..75, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 78..97, + value: Float( + 123456789.12345679, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 98..131, + targets: [ + Name( + ExprName { + range: 98..99, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 102..131, + value: Float( + inf, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 132..155, + targets: [ + Name( + ExprName { + range: 132..133, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 136..155, + value: Float( + inf, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 156..170, + targets: [ + Name( + ExprName { + range: 156..157, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 160..170, + value: Complex { + real: 0.0, + imag: 123456789.0, + }, + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 171..195, + targets: [ + Name( + ExprName { + range: 171..172, 
+ id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 175..195, + value: Complex { + real: 0.0, + imag: 123456789.12345679, + }, + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 196..207, + targets: [ + Name( + ExprName { + range: 196..197, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 200..207, + value: Int( + 727756, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 208..218, + targets: [ + Name( + ExprName { + range: 208..209, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 212..218, + value: Int( + 11, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 219..228, + targets: [ + Name( + ExprName { + range: 219..220, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 223..228, + value: Int( + 511, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 229..244, + targets: [ + Name( + ExprName { + range: 229..230, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 233..244, + value: Float( + 6e-9, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 245..254, + targets: [ + Name( + ExprName { + range: 245..246, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 249..254, + value: Int( + 10000, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 255..265, + targets: [ + Name( + ExprName { + range: 255..256, + id: "x", + ctx: Store, + }, + ), + ], + value: Constant( + ExprConstant { + range: 259..265, + value: Int( + 133333, + ), + kind: None, + }, + ), + type_comment: None, + }, + ), +] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap new file mode 100644 index 00000000..a5f419b4 --- /dev/null +++ b/parser/src/snapshots/rustpython_parser__parser__tests__numeric_literals_attribute_access.snap @@ -0,0 +1,672 @@ +--- +source: parser/src/parser.rs +expression: "ast::Suite::parse(source, \"\").unwrap()" +--- +[ + Assign( + StmtAssign { + range: 0..19, + targets: [ + Name( + ExprName { + range: 0..1, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 4..19, + func: Attribute( + ExprAttribute { + range: 4..17, + value: Constant( + ExprConstant { + range: 4..6, + value: Float( + 0.1, + ), + kind: None, + }, + ), + attr: Identifier { + id: "is_integer", + range: 7..17, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 20..32, + targets: [ + Name( + ExprName { + range: 20..21, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 24..32, + value: Constant( + ExprConstant { + range: 24..26, + value: Float( + 1.0, + ), + kind: None, + }, + ), + attr: Identifier { + id: "imag", + range: 28..32, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 33..46, + targets: [ + Name( + ExprName { + range: 33..34, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 37..46, + value: Constant( + ExprConstant { + range: 37..41, + value: Float( + 10.0, + ), + kind: None, + }, + ), + attr: 
Identifier { + id: "imag", + range: 42..46, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 47..60, + targets: [ + Name( + ExprName { + range: 47..48, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 51..60, + value: Constant( + ExprConstant { + range: 51..55, + value: Float( + 0.1, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 56..60, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 61..90, + targets: [ + Name( + ExprName { + range: 61..62, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 65..90, + func: Attribute( + ExprAttribute { + range: 65..88, + value: Constant( + ExprConstant { + range: 65..84, + value: Float( + 123456789.12345679, + ), + kind: None, + }, + ), + attr: Identifier { + id: "hex", + range: 85..88, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 91..130, + targets: [ + Name( + ExprName { + range: 91..92, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 95..130, + value: Constant( + ExprConstant { + range: 95..124, + value: Float( + inf, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 126..130, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 131..167, + targets: [ + Name( + ExprName { + range: 131..132, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 135..167, + func: Attribute( + ExprAttribute { + range: 135..165, + value: Constant( + ExprConstant { + range: 135..154, + value: Float( + inf, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 156..165, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 168..187, + targets: [ + Name( + ExprName { + range: 168..169, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 172..187, + value: Constant( + ExprConstant { + range: 172..182, + value: Complex { + real: 0.0, + imag: 123456789.0, + }, + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 183..187, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 188..241, + targets: [ + Name( + ExprName { + range: 188..189, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 192..241, + func: Attribute( + ExprAttribute { + range: 192..220, + value: Constant( + ExprConstant { + range: 192..212, + value: Complex { + real: 0.0, + imag: 123456789.12345679, + }, + kind: None, + }, + ), + attr: Identifier { + id: "__add__", + range: 213..220, + }, + ctx: Load, + }, + ), + args: [ + Call( + ExprCall { + range: 221..240, + func: Attribute( + ExprAttribute { + range: 221..238, + value: Constant( + ExprConstant { + range: 221..227, + value: Int( + 11, + ), + kind: None, + }, + ), + attr: Identifier { + id: "bit_length", + range: 228..238, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + ], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 242..265, + targets: [ + Name( + ExprName { + range: 242..243, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 246..265, + func: Attribute( + ExprAttribute { + range: 246..263, + value: Constant( + ExprConstant { + range: 246..253, + value: Int( + 
727756, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 254..263, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 266..289, + targets: [ + Name( + ExprName { + range: 266..267, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 270..289, + func: Attribute( + ExprAttribute { + range: 270..287, + value: Constant( + ExprConstant { + range: 270..276, + value: Int( + 11, + ), + kind: None, + }, + ), + attr: Identifier { + id: "conjugate", + range: 278..287, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 290..305, + targets: [ + Name( + ExprName { + range: 290..291, + id: "x", + ctx: Store, + }, + ), + ], + value: Attribute( + ExprAttribute { + range: 294..305, + value: Constant( + ExprConstant { + range: 294..299, + value: Int( + 511, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 301..305, + }, + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 306..329, + targets: [ + Name( + ExprName { + range: 306..307, + id: "x", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 310..329, + func: Attribute( + ExprAttribute { + range: 310..327, + value: Constant( + ExprConstant { + range: 310..321, + value: Float( + 6e-9, + ), + kind: None, + }, + ), + attr: Identifier { + id: "hex", + range: 324..327, + }, + ctx: Load, + }, + ), + args: [], + keywords: [], + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 330..344, + targets: [ + Name( + ExprName { + range: 330..331, + id: "x", + ctx: Store, + }, + ), + ], + value: UnaryOp( + ExprUnaryOp { + range: 334..344, + op: USub, + operand: Constant( + ExprConstant { + range: 335..344, + value: Complex { + real: 0.0, + imag: 100.0, + }, + kind: None, + }, + ), + }, + ), + type_comment: None, + }, + ), + If( + StmtIf { + range: 346..366, + test: Attribute( + ExprAttribute { + range: 349..357, + value: Constant( + ExprConstant { + range: 349..351, + value: Int( + 10, + ), + kind: None, + }, + ), + attr: Identifier { + id: "real", + range: 353..357, + }, + ctx: Load, + }, + ), + body: [ + Expr( + StmtExpr { + range: 363..366, + value: Constant( + ExprConstant { + range: 363..366, + value: Ellipsis, + kind: None, + }, + ), + }, + ), + ], + elif_else_clauses: [], + }, + ), + Assign( + StmtAssign { + range: 368..379, + targets: [ + Name( + ExprName { + range: 368..369, + id: "y", + ctx: Store, + }, + ), + ], + value: Subscript( + ExprSubscript { + range: 372..379, + value: Constant( + ExprConstant { + range: 372..375, + value: Int( + 100, + ), + kind: None, + }, + ), + slice: Name( + ExprName { + range: 376..378, + id: "no", + ctx: Load, + }, + ), + ctx: Load, + }, + ), + type_comment: None, + }, + ), + Assign( + StmtAssign { + range: 380..391, + targets: [ + Name( + ExprName { + range: 380..381, + id: "y", + ctx: Store, + }, + ), + ], + value: Call( + ExprCall { + range: 384..391, + func: Constant( + ExprConstant { + range: 384..387, + value: Int( + 100, + ), + kind: None, + }, + ), + args: [ + Name( + ExprName { + range: 388..390, + id: "no", + ctx: Load, + }, + ), + ], + keywords: [], + }, + ), + type_comment: None, + }, + ), +] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap index c48429b1..672b6230 100644 
--- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap +++ b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap @@ -6,9 +6,10 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ClassDef( StmtClassDef { range: 10..29, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 16..19, + }, bases: [], keywords: [], body: [ @@ -25,26 +26,28 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 20..21, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 20..21, + }, bound: None, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 52..76, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 58..61, + }, bases: [], keywords: [], body: [ @@ -61,21 +64,19 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 62..68, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 62..63, + }, bound: Some( Name( ExprName { range: 65..68, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -83,14 +84,16 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 105..138, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 111..114, + }, bases: [], keywords: [], body: [ @@ -107,14 +110,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 115..130, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 115..116, + }, bound: Some( Tuple( ExprTuple { @@ -123,18 +126,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 119..122, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), Name( ExprName { range: 124..129, - id: Identifier( - "bytes", - ), + id: "bytes", ctx: Load, }, ), @@ -146,14 +145,16 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 159..181, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 165..168, + }, bases: [], keywords: [], body: [ @@ -170,35 +171,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 169..170, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 169..170, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 172..173, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 172..173, + }, bound: None, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 200..223, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 206..209, + }, bases: [], keywords: [], body: [ @@ -215,35 +219,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 210..211, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 210..211, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 213..214, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 213..214, + }, bound: None, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 240..261, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + 
range: 246..249, + }, bases: [], keywords: [], body: [ @@ -260,25 +267,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVarTuple( TypeParamTypeVarTuple { range: 250..253, - name: Identifier( - "Ts", - ), + name: Identifier { + id: "Ts", + range: 251..253, + }, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 275..296, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 281..284, + }, bases: [], keywords: [], body: [ @@ -295,25 +304,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ ParamSpec( TypeParamParamSpec { range: 285..288, - name: Identifier( - "P", - ), + name: Identifier { + id: "P", + range: 287..288, + }, }, ), ], + decorator_list: [], }, ), ClassDef( StmtClassDef { range: 312..351, - name: Identifier( - "Foo", - ), + name: Identifier { + id: "Foo", + range: 318..321, + }, bases: [], keywords: [], body: [ @@ -323,30 +334,29 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], - decorator_list: [], type_params: [ TypeVar( TypeParamTypeVar { range: 322..323, - name: Identifier( - "X", - ), + name: Identifier { + id: "X", + range: 322..323, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 325..331, - name: Identifier( - "Y", - ), + name: Identifier { + id: "Y", + range: 325..326, + }, bound: Some( Name( ExprName { range: 328..331, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -356,20 +366,23 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" TypeVarTuple( TypeParamTypeVarTuple { range: 333..335, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 334..335, + }, }, ), ParamSpec( TypeParamParamSpec { range: 337..340, - name: Identifier( - "P", - ), + name: Identifier { + id: "P", + range: 339..340, + }, }, ), ], + decorator_list: [], }, ), ] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap index 2d65a64e..f84851f8 100644 --- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap +++ b/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap @@ -6,20 +6,22 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" FunctionDef( StmtFunctionDef { range: 0..20, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 4..8, + }, args: Arguments { - range: 9..10, + range: 8..11, posonlyargs: [], args: [ ArgWithDefault { range: 9..10, def: Arg { range: 9..10, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 9..10, + }, annotation: None, type_comment: None, }, @@ -46,34 +48,34 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 22..53, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 26..30, + }, args: Arguments { - range: 34..38, + range: 33..39, posonlyargs: [], args: [ ArgWithDefault { range: 34..38, def: Arg { range: 34..38, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 34..35, + }, annotation: Some( Name( ExprName { range: 37..38, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), @@ -106,51 +108,50 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 43..44, - id: Identifier( - "T", 
- ), + id: "T", ctx: Load, }, ), ), - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 31..32, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 31..32, + }, bound: None, }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 55..91, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 59..63, + }, args: Arguments { - range: 72..76, + range: 71..77, posonlyargs: [], args: [ ArgWithDefault { range: 72..76, def: Arg { range: 72..76, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 72..73, + }, annotation: Some( Name( ExprName { range: 75..76, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), @@ -183,28 +184,24 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 81..82, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), ), - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 64..70, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 64..65, + }, bound: Some( Name( ExprName { range: 67..70, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -212,32 +209,33 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 93..138, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 97..101, + }, args: Arguments { - range: 119..123, + range: 118..124, posonlyargs: [], args: [ ArgWithDefault { range: 119..123, def: Arg { range: 119..123, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 119..120, + }, annotation: Some( Name( ExprName { range: 122..123, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), @@ -270,21 +268,19 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 128..129, - id: Identifier( - "T", - ), + id: "T", ctx: Load, }, ), ), - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 102..117, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 102..103, + }, bound: Some( Tuple( ExprTuple { @@ -293,18 +289,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" Name( ExprName { range: 106..109, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), Name( ExprName { range: 111..116, - id: Identifier( - "bytes", - ), + id: "bytes", ctx: Load, }, ), @@ -316,24 +308,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 140..171, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 144..148, + }, args: Arguments { - range: 154..161, + range: 153..162, posonlyargs: [], args: [], vararg: Some( Arg { range: 155..161, - arg: Identifier( - "a", - ), + arg: Identifier { + id: "a", + range: 155..156, + }, annotation: Some( Starred( ExprStarred { @@ -341,9 +336,7 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" value: Name( ExprName { range: 159..161, - id: Identifier( - "Ts", - ), + id: "Ts", ctx: Load, }, ), @@ -373,35 +366,38 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [ TypeVarTuple( TypeParamTypeVarTuple { range: 149..152, - name: Identifier( - "Ts", - ), + name: Identifier { + id: "Ts", + range: 150..152, + }, }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 173..230, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 177..181, + }, 
args: Arguments { - range: 187..220, + range: 186..221, posonlyargs: [], args: [], vararg: Some( Arg { range: 188..200, - arg: Identifier( - "args", - ), + arg: Identifier { + id: "args", + range: 188..192, + }, annotation: Some( Attribute( ExprAttribute { @@ -409,15 +405,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" value: Name( ExprName { range: 194..195, - id: Identifier( - "P", - ), + id: "P", ctx: Load, }, ), - attr: Identifier( - "args", - ), + attr: Identifier { + id: "args", + range: 196..200, + }, ctx: Load, }, ), @@ -429,9 +424,10 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" kwarg: Some( Arg { range: 204..220, - arg: Identifier( - "kwargs", - ), + arg: Identifier { + id: "kwargs", + range: 204..210, + }, annotation: Some( Attribute( ExprAttribute { @@ -439,15 +435,14 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" value: Name( ExprName { range: 212..213, - id: Identifier( - "P", - ), + id: "P", ctx: Load, }, ), - attr: Identifier( - "kwargs", - ), + attr: Identifier { + id: "kwargs", + range: 214..220, + }, ctx: Load, }, ), @@ -472,25 +467,27 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [ ParamSpec( TypeParamParamSpec { range: 182..185, - name: Identifier( - "P", - ), + name: Identifier { + id: "P", + range: 184..185, + }, }, ), ], + type_comment: None, }, ), FunctionDef( StmtFunctionDef { range: 232..273, - name: Identifier( - "func", - ), + name: Identifier { + id: "func", + range: 236..240, + }, args: Arguments { range: 261..263, posonlyargs: [], @@ -508,30 +505,29 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" ], decorator_list: [], returns: None, - type_comment: None, type_params: [ TypeVar( TypeParamTypeVar { range: 241..242, - name: Identifier( - "T", - ), + name: Identifier { + id: "T", + range: 241..242, + }, bound: None, }, ), TypeVar( TypeParamTypeVar { range: 244..250, - name: Identifier( - "U", - ), + name: Identifier { + id: "U", + range: 244..245, + }, bound: Some( Name( ExprName { range: 247..250, - id: Identifier( - "str", - ), + id: "str", ctx: Load, }, ), @@ -541,20 +537,23 @@ expression: "ast::Suite::parse(source, \"\").unwrap()" TypeVarTuple( TypeParamTypeVarTuple { range: 252..255, - name: Identifier( - "Ts", - ), + name: Identifier { + id: "Ts", + range: 253..255, + }, }, ), ParamSpec( TypeParamParamSpec { range: 257..260, - name: Identifier( - "P", - ), + name: Identifier { + id: "P", + range: 259..260, + }, }, ), ], + type_comment: None, }, ), ] diff --git a/parser/src/soft_keywords.rs b/parser/src/soft_keywords.rs index 9abcd395..51278a46 100644 --- a/parser/src/soft_keywords.rs +++ b/parser/src/soft_keywords.rs @@ -134,7 +134,6 @@ where self.start_of_line = next.as_ref().map_or(false, |lex_result| { lex_result.as_ref().map_or(false, |(tok, _)| { - #[cfg(feature = "full-lexer")] if matches!(tok, Tok::NonLogicalNewline | Tok::Comment { .. }) { return self.start_of_line; } diff --git a/parser/src/token.rs b/parser/src/token.rs index ac33be75..d511fb8e 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -3,10 +3,10 @@ //! This module defines the tokens that the lexer recognizes. The tokens are //! loosely based on the token definitions found in the [CPython source]. //! -//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h -use crate::ast::bigint::BigInt; +//! 
[CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h
 use crate::ast::MagicKind;
 use crate::{text_size::TextSize, Mode};
+use num_bigint::BigInt;
 use std::fmt;
 
 /// The set of tokens the Python source code can be tokenized in.
@@ -52,13 +52,11 @@ pub enum Tok {
         kind: MagicKind,
     },
     /// Token value for a comment. These are filtered out of the token stream prior to parsing.
-    #[cfg(feature = "full-lexer")]
     Comment(String),
     /// Token value for a newline.
     Newline,
     /// Token value for a newline that is not a logical line break. These are filtered out of
     /// the token stream prior to parsing.
-    #[cfg(feature = "full-lexer")]
     NonLogicalNewline,
     /// Token value for an indent.
     Indent,
@@ -236,7 +234,7 @@ impl fmt::Display for Tok {
             }
             MagicCommand { kind, value } => write!(f, "{kind}{value}"),
             Newline => f.write_str("Newline"),
-            #[cfg(feature = "full-lexer")]
+
             NonLogicalNewline => f.write_str("NonLogicalNewline"),
             Indent => f.write_str("Indent"),
             Dedent => f.write_str("Dedent"),
@@ -250,7 +248,7 @@ impl fmt::Display for Tok {
             Rsqb => f.write_str("']'"),
             Colon => f.write_str("':'"),
             Comma => f.write_str("','"),
-            #[cfg(feature = "full-lexer")]
+
             Comment(value) => f.write_str(value),
             Semi => f.write_str("';'"),
             Plus => f.write_str("'+'"),

From a179d8ed946da12ca5da6bc24fb28cf4f355dd53 Mon Sep 17 00:00:00 2001
From: Micha Reiser
Date: Sun, 23 Jul 2023 12:58:01 +0200
Subject: [PATCH 2/6] Use single filter call

---
 parser/src/parser.rs | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/parser/src/parser.rs b/parser/src/parser.rs
index 2748dc53..3c9d4a5b 100644
--- a/parser/src/parser.rs
+++ b/parser/src/parser.rs
@@ -412,10 +412,23 @@ pub fn parse_tokens(
 ) -> Result<ast::Mod, ParseError> {
     let lxr = lxr.into_iter();
 
-    let lxr =
-        lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
-
-    parse_filtered_tokens(lxr, mode, source_path)
+    match mode {
+        Mode::Module | Mode::Interactive | Mode::Expression => parse_filtered_tokens(
+            lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)),
+            mode,
+            source_path,
+        ),
+        Mode::Jupyter => parse_filtered_tokens(
+            lxr.filter_ok(|(tok, _)| {
+                !matches!(
+                    tok,
+                    Tok::Comment { .. } | Tok::NonLogicalNewline | Tok::MagicCommand { .. }
+                )
+            }),
+            mode,
+            source_path,
+        ),
+    }
 }
 
 fn parse_filtered_tokens(

From 8b9e22281b529ccdd5c40ab1930337aba2bd4663 Mon Sep 17 00:00:00 2001
From: Micha Reiser
Date: Mon, 24 Jul 2023 09:59:50 +0200
Subject: [PATCH 3/6] Address code review feedback

---
 ast/src/generic.rs              |   2 +-
 ast/src/ranged.rs               |   2 -
 parser/src/function.rs          |   4 +-
 parser/src/gen/parse.rs         | 220 ++++----------
 parser/src/lexer.rs             | 496 ++++++++++++++++----------------
 parser/src/lexer/cursor.rs      |   7 +-
 parser/src/lexer/indentation.rs |  48 ++--
 parser/src/parser.rs            |  72 +++--
 parser/src/python.lalrpop       |   2 +-
 parser/src/python.rs            |  10 +-
 parser/src/string.rs            |   6 +-
 parser/src/token.rs             |   2 -
 12 files changed, 381 insertions(+), 490 deletions(-)

diff --git a/ast/src/generic.rs b/ast/src/generic.rs
index db255fca..6665012f 100644
--- a/ast/src/generic.rs
+++ b/ast/src/generic.rs
@@ -1,6 +1,6 @@
 #![allow(clippy::derive_partial_eq_without_eq)]
 use crate::text_size::TextRange;
-pub use crate::{builtin::*, text_size::TextSize, ConversionFlag, Node};
+pub(crate) use crate::{builtin::*, ConversionFlag, Node};
 use std::fmt::{self, Debug};
 
 // This file was originally generated from asdl by a python script, but we now edit it manually

diff --git a/ast/src/ranged.rs b/ast/src/ranged.rs
index f1d08b91..1893fd1c 100644
--- a/ast/src/ranged.rs
+++ b/ast/src/ranged.rs
@@ -2,8 +2,6 @@
 
 use crate::text_size::{TextRange, TextSize};
 
-pub use crate::builtin::*;
-
 pub trait Ranged {
     fn range(&self) -> TextRange;

diff --git a/parser/src/function.rs b/parser/src/function.rs
index 67749ea3..1f8215ec 100644
--- a/parser/src/function.rs
+++ b/parser/src/function.rs
@@ -10,8 +10,8 @@ use rustc_hash::FxHashSet;
 use rustpython_ast::Ranged;
 
 pub(crate) struct ArgumentList {
-    pub args: Vec<ast::Expr>,
-    pub keywords: Vec<ast::Keyword>,
+    pub(crate) args: Vec<ast::Expr>,
+    pub(crate) keywords: Vec<ast::Keyword>,
 }
 
 // Perform validation of function/lambda arguments in a function definition.
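The gen/parse.rs hunks that follow apply one mechanical change to every generated
impl: the Parse trait's `fn mode() -> Mode` method becomes an associated constant
`const MODE: Mode` (the trait itself changes in the parser/src/parser.rs hunk
further below). A minimal, self-contained sketch of the before/after shape;
`ParseOld`, `ParseNew`, and `StmtPass` are illustrative stand-ins, not names from
the patch:

    #[allow(dead_code)]
    #[derive(Copy, Clone, Debug)]
    enum Mode {
        Module,
        Interactive,
        Expression,
        Jupyter,
    }

    // Before this series: the parse mode was a trait method evaluated at runtime.
    #[allow(dead_code)]
    trait ParseOld {
        fn mode() -> Mode;
    }

    // After: an associated constant resolved at compile time.
    trait ParseNew {
        const MODE: Mode;
    }

    struct StmtPass;

    impl ParseNew for StmtPass {
        const MODE: Mode = Mode::Module;
    }

    fn main() {
        // Read the constant straight off the impl; no method call is involved.
        let mode = <StmtPass as ParseNew>::MODE;
        println!("{mode:?}");
    }
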
diff --git a/parser/src/gen/parse.rs b/parser/src/gen/parse.rs index e56491ae..6c659c1b 100644 --- a/parser/src/gen/parse.rs +++ b/parser/src/gen/parse.rs @@ -1,9 +1,7 @@ // This file was originally generated from asdl by a python script, but we now edit it manually impl Parse for ast::StmtFunctionDef { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, @@ -22,9 +20,7 @@ impl Parse for ast::StmtFunctionDef { } impl Parse for ast::StmtAsyncFunctionDef { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -42,9 +38,7 @@ impl Parse for ast::StmtAsyncFunctionDef { } impl Parse for ast::StmtClassDef { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -62,9 +56,7 @@ impl Parse for ast::StmtClassDef { } impl Parse for ast::StmtReturn { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -82,9 +74,7 @@ impl Parse for ast::StmtReturn { } impl Parse for ast::StmtDelete { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -102,9 +92,7 @@ impl Parse for ast::StmtDelete { } impl Parse for ast::StmtAssign { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -122,9 +110,7 @@ impl Parse for ast::StmtAssign { } impl Parse for ast::StmtTypeAlias { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -142,9 +128,7 @@ impl Parse for ast::StmtTypeAlias { } impl Parse for ast::StmtAugAssign { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -162,9 +146,7 @@ impl Parse for ast::StmtAugAssign { } impl Parse for ast::StmtAnnAssign { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -182,9 +164,7 @@ impl Parse for ast::StmtAnnAssign { } impl Parse for ast::StmtFor { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -202,9 +182,7 @@ impl Parse for ast::StmtFor { } impl Parse for ast::StmtAsyncFor { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -222,9 +200,7 @@ impl Parse for ast::StmtAsyncFor { } impl Parse for ast::StmtWhile { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -242,9 +218,7 @@ impl Parse for ast::StmtWhile { } impl Parse for ast::StmtIf { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -262,9 +236,7 @@ impl Parse for ast::StmtIf { } impl Parse for ast::StmtWith { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -282,9 +254,7 @@ impl Parse for ast::StmtWith { } impl Parse for ast::StmtAsyncWith { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl 
IntoIterator, source_path: &str, @@ -302,9 +272,7 @@ impl Parse for ast::StmtAsyncWith { } impl Parse for ast::StmtMatch { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -322,9 +290,7 @@ impl Parse for ast::StmtMatch { } impl Parse for ast::StmtRaise { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -342,9 +308,7 @@ impl Parse for ast::StmtRaise { } impl Parse for ast::StmtTry { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -362,9 +326,7 @@ impl Parse for ast::StmtTry { } impl Parse for ast::StmtTryStar { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -382,9 +344,7 @@ impl Parse for ast::StmtTryStar { } impl Parse for ast::StmtAssert { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -402,9 +362,7 @@ impl Parse for ast::StmtAssert { } impl Parse for ast::StmtImport { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -422,9 +380,7 @@ impl Parse for ast::StmtImport { } impl Parse for ast::StmtImportFrom { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -442,9 +398,7 @@ impl Parse for ast::StmtImportFrom { } impl Parse for ast::StmtGlobal { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -462,9 +416,7 @@ impl Parse for ast::StmtGlobal { } impl Parse for ast::StmtNonlocal { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -482,9 +434,7 @@ impl Parse for ast::StmtNonlocal { } impl Parse for ast::StmtExpr { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -502,9 +452,7 @@ impl Parse for ast::StmtExpr { } impl Parse for ast::StmtPass { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -522,9 +470,7 @@ impl Parse for ast::StmtPass { } impl Parse for ast::StmtBreak { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -542,9 +488,7 @@ impl Parse for ast::StmtBreak { } impl Parse for ast::StmtContinue { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -562,9 +506,7 @@ impl Parse for ast::StmtContinue { } impl Parse for ast::ExprBoolOp { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -582,9 +524,7 @@ impl Parse for ast::ExprBoolOp { } impl Parse for ast::ExprNamedExpr { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -602,9 +542,7 @@ impl Parse for ast::ExprNamedExpr { } impl Parse for ast::ExprBinOp { - fn mode() -> Mode { - Mode::Expression - } + 
const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -622,9 +560,7 @@ impl Parse for ast::ExprBinOp { } impl Parse for ast::ExprUnaryOp { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -642,9 +578,7 @@ impl Parse for ast::ExprUnaryOp { } impl Parse for ast::ExprLambda { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -662,9 +596,7 @@ impl Parse for ast::ExprLambda { } impl Parse for ast::ExprIfExp { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -682,9 +614,7 @@ impl Parse for ast::ExprIfExp { } impl Parse for ast::ExprDict { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -702,9 +632,7 @@ impl Parse for ast::ExprDict { } impl Parse for ast::ExprSet { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -722,9 +650,7 @@ impl Parse for ast::ExprSet { } impl Parse for ast::ExprListComp { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -742,9 +668,7 @@ impl Parse for ast::ExprListComp { } impl Parse for ast::ExprSetComp { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -762,9 +686,7 @@ impl Parse for ast::ExprSetComp { } impl Parse for ast::ExprDictComp { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -782,9 +704,7 @@ impl Parse for ast::ExprDictComp { } impl Parse for ast::ExprGeneratorExp { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -802,9 +722,7 @@ impl Parse for ast::ExprGeneratorExp { } impl Parse for ast::ExprAwait { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -822,9 +740,7 @@ impl Parse for ast::ExprAwait { } impl Parse for ast::ExprYield { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -842,9 +758,7 @@ impl Parse for ast::ExprYield { } impl Parse for ast::ExprYieldFrom { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -862,9 +776,7 @@ impl Parse for ast::ExprYieldFrom { } impl Parse for ast::ExprCompare { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -882,9 +794,7 @@ impl Parse for ast::ExprCompare { } impl Parse for ast::ExprCall { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -902,9 +812,7 @@ impl Parse for ast::ExprCall { } impl Parse for ast::ExprFormattedValue { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( 
lxr: impl IntoIterator, source_path: &str, @@ -922,9 +830,7 @@ impl Parse for ast::ExprFormattedValue { } impl Parse for ast::ExprJoinedStr { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -942,9 +848,7 @@ impl Parse for ast::ExprJoinedStr { } impl Parse for ast::ExprConstant { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -962,9 +866,7 @@ impl Parse for ast::ExprConstant { } impl Parse for ast::ExprAttribute { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -982,9 +884,7 @@ impl Parse for ast::ExprAttribute { } impl Parse for ast::ExprSubscript { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -1002,9 +902,7 @@ impl Parse for ast::ExprSubscript { } impl Parse for ast::ExprStarred { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -1022,9 +920,7 @@ impl Parse for ast::ExprStarred { } impl Parse for ast::ExprName { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -1042,9 +938,7 @@ impl Parse for ast::ExprName { } impl Parse for ast::ExprList { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -1062,9 +956,7 @@ impl Parse for ast::ExprList { } impl Parse for ast::ExprTuple { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -1082,9 +974,7 @@ impl Parse for ast::ExprTuple { } impl Parse for ast::ExprSlice { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index a8d0400f..c4a79021 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -28,6 +28,7 @@ //! //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html +use std::borrow::Cow; use std::iter::FusedIterator; use std::{char, cmp::Ordering, str::FromStr}; @@ -39,6 +40,7 @@ use unic_ucd_ident::{is_xid_continue, is_xid_start}; use crate::lexer::cursor::{Cursor, EOF_CHAR}; use crate::lexer::indentation::{Indentation, Indentations}; +use crate::text_size::TextLen; use crate::{ soft_keywords::SoftKeywordTransformer, string::FStringErrorType, @@ -56,8 +58,7 @@ pub struct Lexer<'source> { cursor: Cursor<'source>, source: &'source str, - // Are we at the beginning of a line? - at_begin_of_line: bool, + state: State, // Amount of parenthesis. nesting: u32, // Indentation levels. @@ -144,7 +145,7 @@ impl<'source> Lexer<'source> { /// [`lex`] instead. 
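+    ///
+    /// A minimal usage sketch of that entry point (`lex` is the same public
+    /// helper the tests further down in this file call):
+    ///
+    /// ```
+    /// use rustpython_parser::{lexer::lex, Mode};
+    ///
+    /// let tokens: Result<Vec<_>, _> = lex("x = 1", Mode::Module).collect();
+    /// assert!(tokens.is_ok());
+    /// ```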
pub fn new(input: &'source str, mode: Mode) -> Self { let mut lxr = Lexer { - at_begin_of_line: true, + state: State::AfterNewline, nesting: 0, indentations: Indentations::default(), pending_indentation: None, @@ -238,11 +239,11 @@ impl<'source> Lexer<'source> { fn lex_number(&mut self, first: char) -> Result { if first == '0' { if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() { - self.lex_number_radix(first, Radix::Hex) + self.lex_number_radix(Radix::Hex) } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() { - self.lex_number_radix(first, Radix::Octal) + self.lex_number_radix(Radix::Octal) } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() { - self.lex_number_radix(first, Radix::Binary) + self.lex_number_radix(Radix::Binary) } else { self.lex_decimal_number(first) } @@ -252,14 +253,14 @@ impl<'source> Lexer<'source> { } /// Lex a hex/octal/decimal/binary number without a decimal point. - fn lex_number_radix(&mut self, first: char, radix: Radix) -> Result { + fn lex_number_radix(&mut self, radix: Radix) -> Result { #[cfg(debug_assertions)] debug_assert!(matches!( self.cursor.previous().to_ascii_lowercase(), 'x' | 'o' | 'b' )); - let value_text = self.radix_run(Some(first), radix); + let value_text = self.radix_run(None, radix); let value = BigInt::from_str_radix(&value_text, radix.as_u32()).map_err(|e| LexicalError { error: LexicalErrorType::OtherError(format!("{e:?}")), @@ -278,6 +279,7 @@ impl<'source> Lexer<'source> { String::new() } else { self.radix_run(Some(first_digit_or_dot), Radix::Decimal) + .into_owned() }; let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') { @@ -313,7 +315,6 @@ impl<'source> Lexer<'source> { _ => is_float, }; - // If float: if is_float { // Improvement: Use `Cow` instead of pushing to value text let value = f64::from_str(&value_text).map_err(|_| LexicalError { @@ -352,20 +353,35 @@ impl<'source> Lexer<'source> { /// Consume a sequence of numbers with the given radix, /// the digits can be decorated with underscores /// like this: '1_2_3_4' == '1234' - fn radix_run(&mut self, first: Option, radix: Radix) -> String { - let mut value_text = first.map_or(String::new(), |c| c.to_string()); + fn radix_run(&mut self, first: Option, radix: Radix) -> Cow<'source, str> { + let start = if let Some(first) = first { + self.offset() - first.text_len() + } else { + self.offset() + }; + self.cursor.eat_while(|c| radix.is_digit(c)); - loop { - if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { - value_text.push(c); - } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { - self.cursor.bump(); - } else { - break; + let number = &self.source[TextRange::new(start, self.offset())]; + + // Number that contains `_` separators. Remove them from the parsed text. + if radix.is_digit(self.cursor.second()) && self.cursor.eat_char('_') { + let mut value_text = number.to_string(); + + loop { + if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { + value_text.push(c); + } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { + // Skip over `_` + self.cursor.bump(); + } else { + break; + } } - } - value_text + Cow::Owned(value_text) + } else { + Cow::Borrowed(number) + } } /// Lex a single comment. @@ -490,104 +506,91 @@ impl<'source> Lexer<'source> { // This is the main entry point. Call this function to retrieve the next token. // This function is used by the iterator implementation. 
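+    // A sketch of the stream this produces for the two-line input
+    // `if x:\n    pass` (token payloads omitted):
+    // `If`, `Name`, `Colon`, `Newline`, `Indent`, `Pass`, `Newline`, `Dedent`.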
pub fn next_token(&mut self) -> LexResult { - // top loop, keep on processing, until we have something pending. - loop { - // Return dedent tokens until the current indentation level matches the indentation of the next token. - if let Some(indentation) = self.pending_indentation.take() { - if let Ok(Ordering::Greater) = self.indentations.current().try_compare(&indentation) - { - self.pending_indentation = Some(indentation); - self.indentations.pop(); - return Ok((Tok::Dedent, TextRange::empty(self.offset()))); - } - } - - if self.at_begin_of_line && self.nesting == 0 { - if let Some(trivia) = self.eat_logical_line_trivia()? { - break Ok(trivia); - } - } - - self.cursor.start_token(); - if let Some(c) = self.cursor.bump() { - if let Some(normal) = self.consume_normal(c)? { - break Ok(normal); - } - } else { - // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line, - // empty the dedent stack, and finally, return the EndOfFile token. - break self.consume_end(); + // Return dedent tokens until the current indentation level matches the indentation of the next token. + if let Some(indentation) = self.pending_indentation.take() { + if let Ok(Ordering::Greater) = self.indentations.current().try_compare(&indentation) { + self.pending_indentation = Some(indentation); + self.indentations.pop(); + return Ok((Tok::Dedent, TextRange::empty(self.offset()))); } } - } - fn eat_logical_line_trivia(&mut self) -> Result, LexicalError> { let mut indentation = Indentation::root(); - - // Eat over any leading whitespace self.cursor.start_token(); - self.cursor.eat_while(|c| { - if c == ' ' { - indentation = indentation.add_space(); - true - } else if c == '\t' { - indentation = indentation.add_tab(); - true - } else if c == '\x0C' { - indentation = Indentation::root(); - true - } else { - false - } - }); - let token = match self.cursor.first() { - c @ ('%' | '!' | '?' | '/' | ';' | ',') if self.mode == Mode::Jupyter => { - self.cursor.start_token(); - self.cursor.bump(); - let kind = if let Ok(kind) = MagicKind::try_from([c, self.cursor.first()]) { + loop { + match self.cursor.first() { + ' ' => { self.cursor.bump(); - kind - } else { - MagicKind::try_from(c).unwrap() - }; - - self.lex_magic_command(kind) + indentation = indentation.add_space(); + } + '\t' => { + self.cursor.bump(); + indentation = indentation.add_tab(); + } + '\\' => { + self.cursor.bump(); + if self.cursor.eat_char('\r') { + self.cursor.eat_char('\n'); + } else if self.cursor.is_eof() { + return Err(LexicalError { + error: LexicalErrorType::Eof, + location: self.token_start(), + }); + } else if !self.cursor.eat_char('\n') { + return Err(LexicalError { + error: LexicalErrorType::LineContinuationError, + location: self.token_start(), + }); + } + indentation = Indentation::root(); + } + // Form feed + '\x0C' => { + indentation = Indentation::root(); + } + _ => break, } + } - '#' => { - self.cursor.start_token(); - self.cursor.bump(); + if self.state.is_after_newline() { + // Handle indentation if this is a new, not all empty, logical line + if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) { + self.state = State::NonEmptyLogicalLine; - self.lex_comment()? - } + if let Some(spanned) = self.handle_indentation(indentation)? { + // Set to false so that we don't handle indentation on the next call. 
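+                // (`self.state` was already advanced to `NonEmptyLogicalLine`
+                // above, so this line's indentation is not handled a second
+                // time on the next call.)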
- '\n' => { - self.cursor.start_token(); - self.cursor.bump(); - Tok::NonLogicalNewline - } - // `\r` or `\r\n` - '\r' => { - self.cursor.start_token(); - self.cursor.bump(); - self.cursor.eat_char('\n'); - Tok::NonLogicalNewline - } - - EOF_CHAR => { - // handled by consume end of line - return Ok(None); + return Ok(spanned); + } } + } - _ => { - self.at_begin_of_line = false; - - return self.handle_indentation(indentation); + self.cursor.start_token(); + if let Some(c) = self.cursor.bump() { + if c.is_ascii() { + self.consume_ascii_character(c) + } else if is_unicode_identifier_start(c) { + let identifier = self.lex_identifier(c)?; + Ok((identifier, self.token_range())) + } else if is_emoji_presentation(c) { + Ok(( + Tok::Name { + name: c.to_string(), + }, + self.token_range(), + )) + } else { + Err(LexicalError { + error: LexicalErrorType::UnrecognizedToken { tok: c }, + location: self.token_start(), + }) } - }; - - Ok(Some((token, self.token_range()))) + } else { + // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line, + // empty the dedent stack, and finally, return the EndOfFile token. + self.consume_end() + } } fn handle_indentation( @@ -621,28 +624,6 @@ impl<'source> Lexer<'source> { Ok(token) } - // Take a look at the next character, if any, and decide upon the next steps. - fn consume_normal(&mut self, first: char) -> Result, LexicalError> { - if first.is_ascii() { - self.consume_ascii_character(first) - } else if is_unicode_identifier_start(first) { - let identifier = self.lex_identifier(first)?; - Ok(Some((identifier, self.token_range()))) - } else if is_emoji_presentation(first) { - Ok(Some(( - Tok::Name { - name: first.to_string(), - }, - self.token_range(), - ))) - } else { - Err(LexicalError { - error: LexicalErrorType::UnrecognizedToken { tok: first }, - location: self.token_start(), - }) - } - } - fn consume_end(&mut self) -> Result { // We reached end of file. // First of all, we need all nestings to be finished. @@ -654,8 +635,8 @@ impl<'source> Lexer<'source> { } // Next, insert a trailing newline, if required. - if !self.at_begin_of_line { - self.at_begin_of_line = true; + if !self.state.is_new_logical_line() { + self.state = State::AfterNewline; Ok((Tok::Newline, TextRange::empty(self.offset()))) } // Next, flush the indentation stack to zero. @@ -667,11 +648,11 @@ impl<'source> Lexer<'source> { } // Dispatch based on the given character. - fn consume_ascii_character(&mut self, c: char) -> Result, LexicalError> { + fn consume_ascii_character(&mut self, c: char) -> Result { let token = match c { c if is_ascii_identifier_start(c) => self.lex_identifier(c)?, '0'..='9' => self.lex_number(c)?, - '#' => self.lex_comment()?, + '#' => return self.lex_comment().map(|token| (token, self.token_range())), '"' | '\'' => self.lex_string(StringKind::String, c)?, '=' => { if self.cursor.eat_char('=') { @@ -700,6 +681,20 @@ impl<'source> Lexer<'source> { Tok::Star } } + + c @ ('%' | '!' | '?' 
| '/' | ';' | ',') + if self.mode == Mode::Jupyter && self.state.is_new_logical_line() => + { + let kind = if let Ok(kind) = MagicKind::try_from([c, self.cursor.first()]) { + self.cursor.bump(); + kind + } else { + // SAFETY: Safe because `c` has been matched against one of the possible magic command prefix + MagicKind::try_from(c).unwrap() + }; + + self.lex_magic_command(kind) + } '/' => { if self.cursor.eat_char('=') { Tok::SlashEqual @@ -839,46 +834,33 @@ impl<'source> Lexer<'source> { } } '\n' => { - if self.nesting == 0 { - self.at_begin_of_line = true; - Tok::Newline - } else { - Tok::NonLogicalNewline - } + return Ok(( + if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + Tok::Newline + } else { + Tok::NonLogicalNewline + }, + self.token_range(), + )) } '\r' => { self.cursor.eat_char('\n'); - if self.nesting == 0 { - self.at_begin_of_line = true; - Tok::Newline - } else { - Tok::NonLogicalNewline - } - } - ' ' | '\t' | '\x0C' => { - self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C')); - return Ok(None); - } - - '\\' => { - if self.cursor.eat_char('\r') { - self.cursor.eat_char('\n'); - } else if self.cursor.is_eof() { - return Err(LexicalError { - error: LexicalErrorType::Eof, - location: self.token_start(), - }); - } else if !self.cursor.eat_char('\n') { - return Err(LexicalError { - error: LexicalErrorType::LineContinuationError, - location: self.token_start(), - }); - } - return Ok(None); + return Ok(( + if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + Tok::Newline + } else { + Tok::NonLogicalNewline + }, + self.token_range(), + )); } _ => { + self.state = State::Other; + return Err(LexicalError { error: LexicalErrorType::UnrecognizedToken { tok: c }, location: self.token_start(), @@ -886,7 +868,9 @@ impl<'source> Lexer<'source> { } }; - Ok(Some((token, self.token_range()))) + self.state = State::Other; + + Ok((token, self.token_range())) } #[inline] @@ -1037,6 +1021,28 @@ impl std::fmt::Display for LexicalErrorType { } } +#[derive(Copy, Clone, Debug)] +enum State { + /// Lexer is right at the beginning of the file or after a `Newline` token. 
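+    /// For example, in `x = 1\ny = 2` the lexer is in this state right before
+    /// `y`: the indentation of the new line has not been consumed yet.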
+ AfterNewline, + + /// The lexer is at the start of a new logical line but **after** the indentation + NonEmptyLogicalLine, + + /// Inside of a logical line + Other, +} + +impl State { + const fn is_after_newline(self) -> bool { + matches!(self, State::AfterNewline) + } + + const fn is_new_logical_line(self) -> bool { + matches!(self, State::AfterNewline | State::NonEmptyLogicalLine) + } +} + #[derive(Copy, Clone, Debug)] enum Radix { Binary, @@ -1099,12 +1105,12 @@ mod tests { const MAC_EOL: &str = "\r"; const UNIX_EOL: &str = "\n"; - pub fn lex_source(source: &str) -> Vec { + pub(crate) fn lex_source(source: &str) -> Vec { let lexer = lex(source, Mode::Module); lexer.map(|x| x.unwrap().0).collect() } - pub fn lex_jupyter_source(source: &str) -> Vec { + pub(crate) fn lex_jupyter_source(source: &str) -> Vec { let lexer = lex(source, Mode::Jupyter); lexer.map(|x| x.unwrap().0).collect() } @@ -1130,10 +1136,13 @@ mod tests { let tokens = lex_jupyter_source(&source); assert_eq!( tokens, - vec![Tok::MagicCommand { - value: "matplotlib --inline".to_string(), - kind: MagicKind::Magic - },] + vec![ + Tok::MagicCommand { + value: "matplotlib --inline".to_string(), + kind: MagicKind::Magic + }, + Tok::Newline + ] ) } @@ -1157,10 +1166,13 @@ mod tests { let tokens = lex_jupyter_source(&source); assert_eq!( tokens, - vec![Tok::MagicCommand { - value: "matplotlib ".to_string(), - kind: MagicKind::Magic - },] + vec![ + Tok::MagicCommand { + value: "matplotlib ".to_string(), + kind: MagicKind::Magic + }, + Tok::Newline + ] ) } @@ -1190,46 +1202,47 @@ mod tests { value: "".to_string(), kind: MagicKind::Magic, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Magic2, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Shell, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::ShCap, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help2, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Paren, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Quote, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Quote2, }, + Tok::Newline, ] ) } @@ -1249,7 +1262,7 @@ mod tests { /foo 1 2 ,foo 1 2 ;foo 1 2 - !ls +!ls " .trim(); let tokens = lex_jupyter_source(source); @@ -1260,56 +1273,57 @@ mod tests { value: "foo".to_string(), kind: MagicKind::Help, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "foo".to_string(), kind: MagicKind::Help2, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "timeit a = b".to_string(), kind: MagicKind::Magic, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "timeit a % 3".to_string(), kind: MagicKind::Magic, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "matplotlib --inline".to_string(), kind: MagicKind::Magic, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "pwd && ls -a | sed 's/^/\\\\ /'".to_string(), kind: MagicKind::Shell, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "cd /Users/foo/Library/Application\\ Support/".to_string(), kind: MagicKind::ShCap, }, - 
Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Paren, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote2, }, - Tok::NonLogicalNewline, + Tok::Newline, Tok::MagicCommand { value: "ls".to_string(), kind: MagicKind::Shell, }, + Tok::Newline, ] ) } @@ -1656,7 +1670,6 @@ if True: } #[test] - fn test_non_logical_newline_in_string_continuation() { let source = r"( 'a' @@ -1686,7 +1699,6 @@ if True: } #[test] - fn test_logical_newline_line_comment() { let source = "#Hello\n#World\n"; let tokens = lex_source(source); @@ -1739,29 +1751,29 @@ if True: ); } - macro_rules! test_string_continuation { - ($($name:ident: $eol:expr,)*) => { - $( - #[test] - fn $name() { - let source = format!("\"abc\\{}def\"", $eol); - let tokens = lex_source(&source); - assert_eq!( - tokens, - vec![ - str_tok("abc\\\ndef"), - Tok::Newline, - ] - ) - } - )* - } + fn assert_string_continuation_with_eol(eol: &str) { + let source = format!("\"abc\\{}def\"", eol); + let tokens = lex_source(&source); + + assert_eq!( + tokens, + vec![str_tok(&format!("abc\\{}def", eol)), Tok::Newline] + ) + } + + #[test] + fn test_string_continuation_windows_eol() { + assert_string_continuation_with_eol(WINDOWS_EOL); } - test_string_continuation! { - test_string_continuation_windows_eol: WINDOWS_EOL, - test_string_continuation_mac_eol: MAC_EOL, - test_string_continuation_unix_eol: UNIX_EOL, + #[test] + fn test_string_continuation_mac_eol() { + assert_string_continuation_with_eol(MAC_EOL); + } + + #[test] + fn test_string_continuation_unix_eol() { + assert_string_continuation_with_eol(UNIX_EOL); } #[test] @@ -1771,32 +1783,34 @@ if True: assert_eq!(tokens, vec![str_tok(r"\N{EN SPACE}"), Tok::Newline]) } - macro_rules! test_triple_quoted { - ($($name:ident: $eol:expr,)*) => { - $( - #[test] - fn $name() { - let source = format!("\"\"\"{0} test string{0} \"\"\"", $eol); - let tokens = lex_source(&source); - assert_eq!( - tokens, - vec![ - Tok::String { - value: "\n test string\n ".to_owned(), - kind: StringKind::String, - triple_quoted: true, - }, - Tok::Newline, - ] - ) - } - )* - } + fn assert_triple_quoted(eol: &str) { + let source = format!("\"\"\"{0} test string{0} \"\"\"", eol); + let tokens = lex_source(&source); + assert_eq!( + tokens, + vec![ + Tok::String { + value: format!("{0} test string{0} ", eol), + kind: StringKind::String, + triple_quoted: true, + }, + Tok::Newline, + ] + ) } - test_triple_quoted! 
{ - test_triple_quoted_windows_eol: WINDOWS_EOL, - test_triple_quoted_mac_eol: MAC_EOL, - test_triple_quoted_unix_eol: UNIX_EOL, + #[test] + fn triple_quoted_windows_eol() { + assert_triple_quoted(WINDOWS_EOL); + } + + #[test] + fn triple_quoted_unix_eol() { + assert_triple_quoted(UNIX_EOL); + } + + #[test] + fn triple_quoted_macos_eol() { + assert_triple_quoted(MAC_EOL); } } diff --git a/parser/src/lexer/cursor.rs b/parser/src/lexer/cursor.rs index 90f9f7b2..ff1f3b74 100644 --- a/parser/src/lexer/cursor.rs +++ b/parser/src/lexer/cursor.rs @@ -12,7 +12,7 @@ pub(super) struct Cursor<'a> { } impl<'a> Cursor<'a> { - pub fn new(source: &'a str) -> Self { + pub(crate) fn new(source: &'a str) -> Self { Self { source_length: source.text_len(), chars: source.chars(), @@ -41,13 +41,12 @@ impl<'a> Cursor<'a> { chars.next().unwrap_or(EOF_CHAR) } - /// Peeks the third character from the input stream without consuming it. - /// Returns [EOF_CHAR] if the position is past the end of the file. + /// Returns the remaining text to lex. pub(super) fn rest(&self) -> &'a str { self.chars.as_str() } - // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`. + // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`. #[allow(clippy::cast_possible_truncation)] pub(super) fn text_len(&self) -> TextSize { TextSize::new(self.chars.as_str().len() as u32) diff --git a/parser/src/lexer/indentation.rs b/parser/src/lexer/indentation.rs index a268679b..31732e21 100644 --- a/parser/src/lexer/indentation.rs +++ b/parser/src/lexer/indentation.rs @@ -2,6 +2,10 @@ use static_assertions::assert_eq_size; use std::cmp::Ordering; use std::fmt::Debug; +/// The column index of an indentation. +/// +/// A space increments the column by one. A tab adds up to 2 (if tab size is 2) indices, but just one +/// if the column isn't even. #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)] pub(super) struct Column(u32); @@ -11,6 +15,7 @@ impl Column { } } +/// The number of characters in an indentation. Each character accounts for 1. #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)] pub(super) struct Character(u32); @@ -20,6 +25,7 @@ impl Character { } } +/// The [Indentation](https://docs.python.org/3/reference/lexical_analysis.html#indentation) of a logical line. #[derive(Copy, Clone, Debug, Eq, PartialEq, Default)] pub(super) struct Indentation { column: Column, @@ -27,6 +33,8 @@ pub(super) struct Indentation { } impl Indentation { + const TAB_SIZE: u32 = 2; + pub(super) const fn root() -> Self { Self { column: Column::new(0), @@ -51,7 +59,11 @@ impl Indentation { pub(super) fn add_tab(self) -> Self { Self { character: Character(self.character.0 + 1), - column: Column((self.column.0 / 2 + 1) * 2), + // Compute the column index: + // * Adds `TAB_SIZE` if `column` is a multiple of `TAB_SIZE` + // * Rounds `column` up to the next multiple of `TAB_SIZE` otherwise. + // https://github.com/python/cpython/blob/2cf99026d6320f38937257da1ab014fc873a11a6/Parser/tokenizer.c#L1818 + column: Column((self.column.0 / Self::TAB_SIZE + 1) * Self::TAB_SIZE), } } @@ -73,43 +85,27 @@ impl Indentation { #[derive(Debug, Copy, Clone, PartialEq)] pub(super) struct UnexpectedIndentation; -// The indentations stack is used to keep track of the current indentation level. -// Similar to the CPython implementation, the Indentations stack always has at -// least one level which is never popped. See Reference 2.1.8. 
-#[derive(Debug, Clone)] +// The indentations stack is used to keep track of the current indentation level +// [See Indentation](docs.python.org/3/reference/lexical_analysis.html#indentation). +#[derive(Debug, Clone, Default)] pub(super) struct Indentations { stack: Vec, } impl Indentations { - pub fn is_empty(&self) -> bool { - self.stack.len() == 1 - } - - pub fn push(&mut self, indent: Indentation) { + pub(super) fn push(&mut self, indent: Indentation) { debug_assert_eq!(self.current().try_compare(&indent), Ok(Ordering::Less)); self.stack.push(indent); } - pub fn pop(&mut self) -> Option { - if self.is_empty() { - None - } else { - self.stack.pop() - } - } - - pub fn current(&self) -> &Indentation { - self.stack.last().expect("Expected indentation") + pub(super) fn pop(&mut self) -> Option { + self.stack.pop() } -} -impl Default for Indentations { - fn default() -> Self { - Self { - stack: vec![Indentation::root()], - } + pub(super) fn current(&self) -> &Indentation { + static ROOT: Indentation = Indentation::root(); + self.stack.last().unwrap_or(&ROOT) } } diff --git a/parser/src/parser.rs b/parser/src/parser.rs index 3c9d4a5b..2acef8cc 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -57,8 +57,10 @@ pub trait Parse where Self: Sized, { + const MODE: Mode; + fn parse(source: &str, source_path: &str) -> Result { - let tokens = lex(source, Self::mode()); + let tokens = lex(source, Self::MODE); Self::parse_tokens(tokens, source_path) } @@ -72,7 +74,7 @@ where source_path: &str, offset: TextSize, ) -> Result { - let tokens = lex_starts_at(source, Self::mode(), offset); + let tokens = lex_starts_at(source, Self::MODE, offset); Self::parse_tokens(tokens, source_path) } @@ -81,14 +83,10 @@ where lxr: impl IntoIterator, source_path: &str, ) -> Result; - - fn mode() -> Mode; } impl Parse for ast::ModModule { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, @@ -102,9 +100,7 @@ impl Parse for ast::ModModule { } impl Parse for ast::ModExpression { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, @@ -118,9 +114,7 @@ impl Parse for ast::ModExpression { } impl Parse for ast::ModInteractive { - fn mode() -> Mode { - Mode::Interactive - } + const MODE: Mode = Mode::Interactive; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -133,9 +127,7 @@ impl Parse for ast::ModInteractive { } impl Parse for ast::Suite { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, @@ -146,9 +138,7 @@ impl Parse for ast::Suite { } impl Parse for ast::Stmt { - fn mode() -> Mode { - Mode::Module - } + const MODE: Mode = Mode::Module; fn parse_tokens( lxr: impl IntoIterator, @@ -177,9 +167,7 @@ impl Parse for ast::Stmt { } impl Parse for ast::Expr { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, @@ -190,9 +178,7 @@ impl Parse for ast::Expr { } impl Parse for ast::Identifier { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, @@ -214,9 +200,7 @@ impl Parse for ast::Identifier { } impl Parse for ast::Constant { - fn mode() -> Mode { - Mode::Expression - } + const MODE: Mode = Mode::Expression; fn parse_tokens( lxr: impl IntoIterator, @@ -423,16 +407,28 @@ pub fn parse_tokens( mode, source_path, ), - Mode::Jupyter => parse_filtered_tokens( - 
lxr.filter_ok(|(tok, _)| {
-                !matches!(
-                    tok,
-                    Tok::Comment { .. } | Tok::NonLogicalNewline | Tok::MagicCommand { .. }
-                )
-            }),
-            mode,
-            source_path,
-        ),
+        Mode::Jupyter => {
+            let mut after_magic = false;
+            parse_filtered_tokens(
+                lxr.filter_ok(|(tok, _)| match tok {
+                    Tok::Comment(..) | Tok::NonLogicalNewline => {
+                        after_magic = false;
+                        false
+                    }
+                    Tok::Newline => !after_magic,
+                    Tok::MagicCommand { .. } => {
+                        after_magic = true;
+                        false
+                    }
+                    _ => {
+                        after_magic = false;
+                        true
+                    }
+                }),
+                mode,
+                source_path,
+            )
+        }
     }
 }
 
diff --git a/parser/src/python.lalrpop b/parser/src/python.lalrpop
index e95b21bd..68796dcc 100644
--- a/parser/src/python.lalrpop
+++ b/parser/src/python.lalrpop
@@ -20,7 +20,7 @@ grammar(mode: Mode);
 // This is a hack to reduce the amount of lalrpop tables generated:
 // For each public entry point, a full parse table is generated.
 // By having only a single pub function, we reduce this to one.
-pub Top: ast::Mod = {
+pub(crate) Top: ast::Mod = {
     StartModule => ast::ModModule { body, type_ignores: vec![], range: (start..end).into() }.into(),
     StartInteractive => ast::ModInteractive { body, range: (start..end).into() }.into(),
     StartExpression ("\n")* => ast::ModExpression { body: Box::new(body), range: (start..end).into() }.into()

diff --git a/parser/src/python.rs b/parser/src/python.rs
index 5245dcba..b0906d1a 100644
--- a/parser/src/python.rs
+++ b/parser/src/python.rs
@@ -1,5 +1,5 @@
 // auto-generated: "lalrpop 0.20.0"
-// sha3: 6a6e10102ca4897f12ff5ed33ef5aad928e18e86753214e39d3f3495951fc631
+// sha3: 263bb187f0a83dfe2a024fa0eed0ad8cb855da5991584b5040fa7d870fdb84af
 use num_bigint::BigInt;
 use crate::{
     ast::{self as ast, Ranged, MagicKind},
@@ -11490,19 +11490,19 @@ mod __parse__Top {
             _ => panic!("invalid reduction index {}", __reduce_index)
         }
     }
-    pub struct TopParser {
+    pub(crate) struct TopParser {
         _priv: (),
     }
 
     impl TopParser {
-        pub fn new() -> TopParser {
+        pub(crate) fn new() -> TopParser {
             TopParser {
                 _priv: (),
             }
         }
 
         #[allow(dead_code)]
-        pub fn parse<
+        pub(crate) fn parse<
            __TOKEN: __ToTriple<>,
            __TOKENS: IntoIterator<Item = __TOKEN>,
        >(
@@ -30746,7 +30746,7 @@ mod __parse__Top {
        (3, 276)
    }
 }
-pub use self::__parse__Top::TopParser;
+pub(crate) use self::__parse__Top::TopParser;
 
 #[allow(unused_variables)]
 #[allow(clippy::too_many_arguments)]

diff --git a/parser/src/string.rs b/parser/src/string.rs
index cb2b0e19..114cbb50 100644
--- a/parser/src/string.rs
+++ b/parser/src/string.rs
@@ -715,14 +715,14 @@ pub(crate) fn parse_strings(
 #[derive(Debug, PartialEq)]
 struct FStringError {
     /// The type of error that occurred.
-    pub error: FStringErrorType,
+    pub(crate) error: FStringErrorType,
     /// The location of the error.
-    pub location: TextSize,
+    pub(crate) location: TextSize,
 }
 
 impl FStringError {
     /// Creates a new `FStringError` with the given error type and location.
- pub fn new(error: FStringErrorType, location: TextSize) -> Self { + pub(crate) fn new(error: FStringErrorType, location: TextSize) -> Self { Self { error, location } } } diff --git a/parser/src/token.rs b/parser/src/token.rs index d511fb8e..86a86686 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -234,7 +234,6 @@ impl fmt::Display for Tok { } MagicCommand { kind, value } => write!(f, "{kind}{value}"), Newline => f.write_str("Newline"), - NonLogicalNewline => f.write_str("NonLogicalNewline"), Indent => f.write_str("Indent"), Dedent => f.write_str("Dedent"), @@ -248,7 +247,6 @@ impl fmt::Display for Tok { Rsqb => f.write_str("']'"), Colon => f.write_str("':'"), Comma => f.write_str("','"), - Comment(value) => f.write_str(value), Semi => f.write_str("';'"), Plus => f.write_str("'+'"), From 23a50275d1b59ef518aebcfe3fc63ae63129e23b Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Mon, 24 Jul 2023 15:37:22 +0200 Subject: [PATCH 4/6] Fix infinite loop caused by form feed --- parser/src/lexer.rs | 377 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 377 insertions(+) diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index c4a79021..cb57799a 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -547,6 +547,7 @@ impl<'source> Lexer<'source> { } // Form feed '\x0C' => { + self.cursor.bump(); indentation = Indentation::root(); } _ => break, @@ -1813,4 +1814,380 @@ if True: fn triple_quoted_macos_eol() { assert_triple_quoted(MAC_EOL); } + + #[test] + fn regression() { + for token in lex( + r#"# Tests some corner cases with isinstance() and issubclass(). While these +# tests use new style classes and properties, they actually do whitebox +# testing of error conditions uncovered when using extension types. + +import unittest +import sys +import typing +from test import support + + + + +class TestIsInstanceExceptions(unittest.TestCase): + # Test to make sure that an AttributeError when accessing the instance's + # class's bases is masked. This was actually a bug in Python 2.2 and + # 2.2.1 where the exception wasn't caught but it also wasn't being cleared + # (leading to an "undetected error" in the debug build). Set up is, + # isinstance(inst, cls) where: + # + # - cls isn't a type, or a tuple + # - cls has a __bases__ attribute + # - inst has a __class__ attribute + # - inst.__class__ as no __bases__ attribute + # + # Sounds complicated, I know, but this mimics a situation where an + # extension type raises an AttributeError when its __bases__ attribute is + # gotten. In that case, isinstance() should return False. + def test_class_has_no_bases(self): + class I(object): + def getclass(self): + # This must return an object that has no __bases__ attribute + return None + __class__ = property(getclass) + + class C(object): + def getbases(self): + return () + __bases__ = property(getbases) + + self.assertEqual(False, isinstance(I(), C())) + + # Like above except that inst.__class__.__bases__ raises an exception + # other than AttributeError + def test_bases_raises_other_than_attribute_error(self): + class E(object): + def getbases(self): + raise RuntimeError + __bases__ = property(getbases) + + class I(object): + def getclass(self): + return E() + __class__ = property(getclass) + + class C(object): + def getbases(self): + return () + __bases__ = property(getbases) + + self.assertRaises(RuntimeError, isinstance, I(), C()) + + # Here's a situation where getattr(cls, '__bases__') raises an exception. 
+ # If that exception is not AttributeError, it should not get masked + def test_dont_mask_non_attribute_error(self): + class I: pass + + class C(object): + def getbases(self): + raise RuntimeError + __bases__ = property(getbases) + + self.assertRaises(RuntimeError, isinstance, I(), C()) + + # Like above, except that getattr(cls, '__bases__') raises an + # AttributeError, which /should/ get masked as a TypeError + def test_mask_attribute_error(self): + class I: pass + + class C(object): + def getbases(self): + raise AttributeError + __bases__ = property(getbases) + + self.assertRaises(TypeError, isinstance, I(), C()) + + # check that we don't mask non AttributeErrors + # see: http://bugs.python.org/issue1574217 + def test_isinstance_dont_mask_non_attribute_error(self): + class C(object): + def getclass(self): + raise RuntimeError + __class__ = property(getclass) + + c = C() + self.assertRaises(RuntimeError, isinstance, c, bool) + + # test another code path + class D: pass + self.assertRaises(RuntimeError, isinstance, c, D) + + + +# These tests are similar to above, but tickle certain code paths in +# issubclass() instead of isinstance() -- really PyObject_IsSubclass() +# vs. PyObject_IsInstance(). +class TestIsSubclassExceptions(unittest.TestCase): + def test_dont_mask_non_attribute_error(self): + class C(object): + def getbases(self): + raise RuntimeError + __bases__ = property(getbases) + + class S(C): pass + + self.assertRaises(RuntimeError, issubclass, C(), S()) + + def test_mask_attribute_error(self): + class C(object): + def getbases(self): + raise AttributeError + __bases__ = property(getbases) + + class S(C): pass + + self.assertRaises(TypeError, issubclass, C(), S()) + + # Like above, but test the second branch, where the __bases__ of the + # second arg (the cls arg) is tested. This means the first arg must + # return a valid __bases__, and it's okay for it to be a normal -- + # unrelated by inheritance -- class. + def test_dont_mask_non_attribute_error_in_cls_arg(self): + class B: pass + + class C(object): + def getbases(self): + raise RuntimeError + __bases__ = property(getbases) + + self.assertRaises(RuntimeError, issubclass, B, C()) + + def test_mask_attribute_error_in_cls_arg(self): + class B: pass + + class C(object): + def getbases(self): + raise AttributeError + __bases__ = property(getbases) + + self.assertRaises(TypeError, issubclass, B, C()) + + + + +# meta classes for creating abstract classes and instances +class AbstractClass(object): + def __init__(self, bases): + self.bases = bases + + def getbases(self): + return self.bases + __bases__ = property(getbases) + + def __call__(self): + return AbstractInstance(self) + +class AbstractInstance(object): + def __init__(self, klass): + self.klass = klass + + def getclass(self): + return self.klass + __class__ = property(getclass) + +# abstract classes +AbstractSuper = AbstractClass(bases=()) + +AbstractChild = AbstractClass(bases=(AbstractSuper,)) + +# normal classes +class Super: + pass + +class Child(Super): + pass + + +class TestIsInstanceIsSubclass(unittest.TestCase): + # Tests to ensure that isinstance and issubclass work on abstract + # classes and instances. Before the 2.2 release, TypeErrors were + # raised when boolean values should have been returned. The bug was + # triggered by mixing 'normal' classes and instances were with + # 'abstract' classes and instances. This case tries to test all + # combinations. 
+ + def test_isinstance_normal(self): + # normal instances + self.assertEqual(True, isinstance(Super(), Super)) + self.assertEqual(False, isinstance(Super(), Child)) + self.assertEqual(False, isinstance(Super(), AbstractSuper)) + self.assertEqual(False, isinstance(Super(), AbstractChild)) + + self.assertEqual(True, isinstance(Child(), Super)) + self.assertEqual(False, isinstance(Child(), AbstractSuper)) + + def test_isinstance_abstract(self): + # abstract instances + self.assertEqual(True, isinstance(AbstractSuper(), AbstractSuper)) + self.assertEqual(False, isinstance(AbstractSuper(), AbstractChild)) + self.assertEqual(False, isinstance(AbstractSuper(), Super)) + self.assertEqual(False, isinstance(AbstractSuper(), Child)) + + self.assertEqual(True, isinstance(AbstractChild(), AbstractChild)) + self.assertEqual(True, isinstance(AbstractChild(), AbstractSuper)) + self.assertEqual(False, isinstance(AbstractChild(), Super)) + self.assertEqual(False, isinstance(AbstractChild(), Child)) + + def test_isinstance_with_or_union(self): + self.assertTrue(isinstance(Super(), Super | int)) + self.assertFalse(isinstance(None, str | int)) + self.assertTrue(isinstance(3, str | int)) + self.assertTrue(isinstance("", str | int)) + self.assertTrue(isinstance([], typing.List | typing.Tuple)) + self.assertTrue(isinstance(2, typing.List | int)) + self.assertFalse(isinstance(2, typing.List | typing.Tuple)) + self.assertTrue(isinstance(None, int | None)) + self.assertFalse(isinstance(3.14, int | str)) + with self.assertRaises(TypeError): + isinstance(2, list[int]) + with self.assertRaises(TypeError): + isinstance(2, list[int] | int) + with self.assertRaises(TypeError): + isinstance(2, int | str | list[int] | float) + + + + def test_subclass_normal(self): + # normal classes + self.assertEqual(True, issubclass(Super, Super)) + self.assertEqual(False, issubclass(Super, AbstractSuper)) + self.assertEqual(False, issubclass(Super, Child)) + + self.assertEqual(True, issubclass(Child, Child)) + self.assertEqual(True, issubclass(Child, Super)) + self.assertEqual(False, issubclass(Child, AbstractSuper)) + self.assertTrue(issubclass(typing.List, typing.List|typing.Tuple)) + self.assertFalse(issubclass(int, typing.List|typing.Tuple)) + + def test_subclass_abstract(self): + # abstract classes + self.assertEqual(True, issubclass(AbstractSuper, AbstractSuper)) + self.assertEqual(False, issubclass(AbstractSuper, AbstractChild)) + self.assertEqual(False, issubclass(AbstractSuper, Child)) + + self.assertEqual(True, issubclass(AbstractChild, AbstractChild)) + self.assertEqual(True, issubclass(AbstractChild, AbstractSuper)) + self.assertEqual(False, issubclass(AbstractChild, Super)) + self.assertEqual(False, issubclass(AbstractChild, Child)) + + def test_subclass_tuple(self): + # test with a tuple as the second argument classes + self.assertEqual(True, issubclass(Child, (Child,))) + self.assertEqual(True, issubclass(Child, (Super,))) + self.assertEqual(False, issubclass(Super, (Child,))) + self.assertEqual(True, issubclass(Super, (Child, Super))) + self.assertEqual(False, issubclass(Child, ())) + self.assertEqual(True, issubclass(Super, (Child, (Super,)))) + + self.assertEqual(True, issubclass(int, (int, (float, int)))) + self.assertEqual(True, issubclass(str, (str, (Child, str)))) + + def test_subclass_recursion_limit(self): + # make sure that issubclass raises RecursionError before the C stack is + # blown + with support.infinite_recursion(): + self.assertRaises(RecursionError, blowstack, issubclass, str, str) + + def 
test_isinstance_recursion_limit(self): + # make sure that issubclass raises RecursionError before the C stack is + # blown + with support.infinite_recursion(): + self.assertRaises(RecursionError, blowstack, isinstance, '', str) + + def test_subclass_with_union(self): + self.assertTrue(issubclass(int, int | float | int)) + self.assertTrue(issubclass(str, str | Child | str)) + self.assertFalse(issubclass(dict, float|str)) + self.assertFalse(issubclass(object, float|str)) + with self.assertRaises(TypeError): + issubclass(2, Child | Super) + with self.assertRaises(TypeError): + issubclass(int, list[int] | Child) + + def test_issubclass_refcount_handling(self): + # bpo-39382: abstract_issubclass() didn't hold item reference while + # peeking in the bases tuple, in the single inheritance case. + class A: + @property + def __bases__(self): + return (int, ) + + class B: + def __init__(self): + # setting this here increases the chances of exhibiting the bug, + # probably due to memory layout changes. + self.x = 1 + + @property + def __bases__(self): + return (A(), ) + + self.assertEqual(True, issubclass(B(), int)) + + def test_infinite_recursion_in_bases(self): + class X: + @property + def __bases__(self): + return self.__bases__ + with support.infinite_recursion(): + self.assertRaises(RecursionError, issubclass, X(), int) + self.assertRaises(RecursionError, issubclass, int, X()) + self.assertRaises(RecursionError, isinstance, 1, X()) + + def test_infinite_recursion_via_bases_tuple(self): + """Regression test for bpo-30570.""" + class Failure(object): + def __getattr__(self, attr): + return (self, None) + with support.infinite_recursion(): + with self.assertRaises(RecursionError): + issubclass(Failure(), int) + + def test_infinite_cycle_in_bases(self): + """Regression test for bpo-30570.""" + class X: + @property + def __bases__(self): + return (self, self, self) + with support.infinite_recursion(): + self.assertRaises(RecursionError, issubclass, X(), int) + + def test_infinitely_many_bases(self): + """Regression test for bpo-30570.""" + class X: + def __getattr__(self, attr): + self.assertEqual(attr, "__bases__") + class A: + pass + class B: + pass + A.__getattr__ = B.__getattr__ = X.__getattr__ + return (A(), B()) + with support.infinite_recursion(): + self.assertRaises(RecursionError, issubclass, X(), int) + + +def blowstack(fxn, arg, compare_to): + # Make sure that calling isinstance with a deeply nested tuple for its + # argument will raise RecursionError eventually. 
+ tuple_arg = (compare_to,) + for cnt in range(sys.getrecursionlimit()+5): + tuple_arg = (tuple_arg,) + fxn(arg, tuple_arg) + + + +if __name__ == '__main__': + unittest.main()"#, + Mode::Module, + ) { + dbg!(token); + } + } } From fcebb0cd085dd540517e65de9aaebac524adfe0a Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Mon, 24 Jul 2023 15:46:04 +0200 Subject: [PATCH 5/6] Merge with magic assignment lexing --- parser/src/lexer.rs | 474 +++++++++----------------------------------- 1 file changed, 97 insertions(+), 377 deletions(-) diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index cb57799a..da541669 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -659,7 +659,8 @@ impl<'source> Lexer<'source> { if self.cursor.eat_char('=') { Tok::EqEqual } else { - Tok::Equal + self.state = State::AfterEqual; + return Ok((Tok::Equal, self.token_range())); } } '+' => { @@ -683,6 +684,15 @@ impl<'source> Lexer<'source> { } } + c @ ('%' | '!') + if self.mode == Mode::Jupyter + && self.state.is_after_equal() + && self.nesting == 0 => + { + // SAFETY: Safe because `c` has been matched against one of the possible magic command prefix + self.lex_magic_command(MagicKind::try_from(c).unwrap()) + } + c @ ('%' | '!' | '?' | '/' | ';' | ',') if self.mode == Mode::Jupyter && self.state.is_new_logical_line() => { @@ -1030,6 +1040,9 @@ enum State { /// The lexer is at the start of a new logical line but **after** the indentation NonEmptyLogicalLine, + /// Lexer is right after an equal token + AfterEqual, + /// Inside of a logical line Other, } @@ -1042,6 +1055,10 @@ impl State { const fn is_new_logical_line(self) -> bool { matches!(self, State::AfterNewline | State::NonEmptyLogicalLine) } + + const fn is_after_equal(self) -> bool { + matches!(self, State::AfterEqual) + } } #[derive(Copy, Clone, Debug)] @@ -1354,6 +1371,85 @@ if True: ) } + #[test] + fn test_jupyter_magic_assignment() { + let source = r" +pwd = !pwd +foo = %timeit a = b +bar = %timeit a % 3 +baz = %matplotlib \ + inline" + .trim(); + let tokens = lex_jupyter_source(source); + assert_eq!( + tokens, + vec![ + Tok::Name { + name: "pwd".to_string() + }, + Tok::Equal, + Tok::MagicCommand { + value: "pwd".to_string(), + kind: MagicKind::Shell, + }, + Tok::Newline, + Tok::Name { + name: "foo".to_string() + }, + Tok::Equal, + Tok::MagicCommand { + value: "timeit a = b".to_string(), + kind: MagicKind::Magic, + }, + Tok::Newline, + Tok::Name { + name: "bar".to_string() + }, + Tok::Equal, + Tok::MagicCommand { + value: "timeit a % 3".to_string(), + kind: MagicKind::Magic, + }, + Tok::Newline, + Tok::Name { + name: "baz".to_string() + }, + Tok::Equal, + Tok::MagicCommand { + value: "matplotlib inline".to_string(), + kind: MagicKind::Magic, + }, + Tok::Newline, + ] + ) + } + + fn assert_no_jupyter_magic(tokens: &[Tok]) { + for tok in tokens { + match tok { + Tok::MagicCommand { .. } => panic!("Unexpected magic command token: {:?}", tok), + _ => {} + } + } + } + + #[test] + fn test_jupyter_magic_not_an_assignment() { + let source = r" +# Other magic kinds are not valid here (can't test `foo = ?str` because '?' 
is not a valid token) +foo = /func +foo = ;func +foo = ,func + +(foo == %timeit a = b) +(foo := %timeit a = b) +def f(arg=%timeit a = b): + pass" + .trim(); + let tokens = lex_jupyter_source(source); + assert_no_jupyter_magic(&tokens); + } + #[test] fn test_numbers() { let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j"; @@ -1814,380 +1910,4 @@ if True: fn triple_quoted_macos_eol() { assert_triple_quoted(MAC_EOL); } - - #[test] - fn regression() { - for token in lex( - r#"# Tests some corner cases with isinstance() and issubclass(). While these -# tests use new style classes and properties, they actually do whitebox -# testing of error conditions uncovered when using extension types. - -import unittest -import sys -import typing -from test import support - - - - -class TestIsInstanceExceptions(unittest.TestCase): - # Test to make sure that an AttributeError when accessing the instance's - # class's bases is masked. This was actually a bug in Python 2.2 and - # 2.2.1 where the exception wasn't caught but it also wasn't being cleared - # (leading to an "undetected error" in the debug build). Set up is, - # isinstance(inst, cls) where: - # - # - cls isn't a type, or a tuple - # - cls has a __bases__ attribute - # - inst has a __class__ attribute - # - inst.__class__ as no __bases__ attribute - # - # Sounds complicated, I know, but this mimics a situation where an - # extension type raises an AttributeError when its __bases__ attribute is - # gotten. In that case, isinstance() should return False. - def test_class_has_no_bases(self): - class I(object): - def getclass(self): - # This must return an object that has no __bases__ attribute - return None - __class__ = property(getclass) - - class C(object): - def getbases(self): - return () - __bases__ = property(getbases) - - self.assertEqual(False, isinstance(I(), C())) - - # Like above except that inst.__class__.__bases__ raises an exception - # other than AttributeError - def test_bases_raises_other_than_attribute_error(self): - class E(object): - def getbases(self): - raise RuntimeError - __bases__ = property(getbases) - - class I(object): - def getclass(self): - return E() - __class__ = property(getclass) - - class C(object): - def getbases(self): - return () - __bases__ = property(getbases) - - self.assertRaises(RuntimeError, isinstance, I(), C()) - - # Here's a situation where getattr(cls, '__bases__') raises an exception. 
- # If that exception is not AttributeError, it should not get masked - def test_dont_mask_non_attribute_error(self): - class I: pass - - class C(object): - def getbases(self): - raise RuntimeError - __bases__ = property(getbases) - - self.assertRaises(RuntimeError, isinstance, I(), C()) - - # Like above, except that getattr(cls, '__bases__') raises an - # AttributeError, which /should/ get masked as a TypeError - def test_mask_attribute_error(self): - class I: pass - - class C(object): - def getbases(self): - raise AttributeError - __bases__ = property(getbases) - - self.assertRaises(TypeError, isinstance, I(), C()) - - # check that we don't mask non AttributeErrors - # see: http://bugs.python.org/issue1574217 - def test_isinstance_dont_mask_non_attribute_error(self): - class C(object): - def getclass(self): - raise RuntimeError - __class__ = property(getclass) - - c = C() - self.assertRaises(RuntimeError, isinstance, c, bool) - - # test another code path - class D: pass - self.assertRaises(RuntimeError, isinstance, c, D) - - - -# These tests are similar to above, but tickle certain code paths in -# issubclass() instead of isinstance() -- really PyObject_IsSubclass() -# vs. PyObject_IsInstance(). -class TestIsSubclassExceptions(unittest.TestCase): - def test_dont_mask_non_attribute_error(self): - class C(object): - def getbases(self): - raise RuntimeError - __bases__ = property(getbases) - - class S(C): pass - - self.assertRaises(RuntimeError, issubclass, C(), S()) - - def test_mask_attribute_error(self): - class C(object): - def getbases(self): - raise AttributeError - __bases__ = property(getbases) - - class S(C): pass - - self.assertRaises(TypeError, issubclass, C(), S()) - - # Like above, but test the second branch, where the __bases__ of the - # second arg (the cls arg) is tested. This means the first arg must - # return a valid __bases__, and it's okay for it to be a normal -- - # unrelated by inheritance -- class. - def test_dont_mask_non_attribute_error_in_cls_arg(self): - class B: pass - - class C(object): - def getbases(self): - raise RuntimeError - __bases__ = property(getbases) - - self.assertRaises(RuntimeError, issubclass, B, C()) - - def test_mask_attribute_error_in_cls_arg(self): - class B: pass - - class C(object): - def getbases(self): - raise AttributeError - __bases__ = property(getbases) - - self.assertRaises(TypeError, issubclass, B, C()) - - - - -# meta classes for creating abstract classes and instances -class AbstractClass(object): - def __init__(self, bases): - self.bases = bases - - def getbases(self): - return self.bases - __bases__ = property(getbases) - - def __call__(self): - return AbstractInstance(self) - -class AbstractInstance(object): - def __init__(self, klass): - self.klass = klass - - def getclass(self): - return self.klass - __class__ = property(getclass) - -# abstract classes -AbstractSuper = AbstractClass(bases=()) - -AbstractChild = AbstractClass(bases=(AbstractSuper,)) - -# normal classes -class Super: - pass - -class Child(Super): - pass - - -class TestIsInstanceIsSubclass(unittest.TestCase): - # Tests to ensure that isinstance and issubclass work on abstract - # classes and instances. Before the 2.2 release, TypeErrors were - # raised when boolean values should have been returned. The bug was - # triggered by mixing 'normal' classes and instances were with - # 'abstract' classes and instances. This case tries to test all - # combinations. 
- - def test_isinstance_normal(self): - # normal instances - self.assertEqual(True, isinstance(Super(), Super)) - self.assertEqual(False, isinstance(Super(), Child)) - self.assertEqual(False, isinstance(Super(), AbstractSuper)) - self.assertEqual(False, isinstance(Super(), AbstractChild)) - - self.assertEqual(True, isinstance(Child(), Super)) - self.assertEqual(False, isinstance(Child(), AbstractSuper)) - - def test_isinstance_abstract(self): - # abstract instances - self.assertEqual(True, isinstance(AbstractSuper(), AbstractSuper)) - self.assertEqual(False, isinstance(AbstractSuper(), AbstractChild)) - self.assertEqual(False, isinstance(AbstractSuper(), Super)) - self.assertEqual(False, isinstance(AbstractSuper(), Child)) - - self.assertEqual(True, isinstance(AbstractChild(), AbstractChild)) - self.assertEqual(True, isinstance(AbstractChild(), AbstractSuper)) - self.assertEqual(False, isinstance(AbstractChild(), Super)) - self.assertEqual(False, isinstance(AbstractChild(), Child)) - - def test_isinstance_with_or_union(self): - self.assertTrue(isinstance(Super(), Super | int)) - self.assertFalse(isinstance(None, str | int)) - self.assertTrue(isinstance(3, str | int)) - self.assertTrue(isinstance("", str | int)) - self.assertTrue(isinstance([], typing.List | typing.Tuple)) - self.assertTrue(isinstance(2, typing.List | int)) - self.assertFalse(isinstance(2, typing.List | typing.Tuple)) - self.assertTrue(isinstance(None, int | None)) - self.assertFalse(isinstance(3.14, int | str)) - with self.assertRaises(TypeError): - isinstance(2, list[int]) - with self.assertRaises(TypeError): - isinstance(2, list[int] | int) - with self.assertRaises(TypeError): - isinstance(2, int | str | list[int] | float) - - - - def test_subclass_normal(self): - # normal classes - self.assertEqual(True, issubclass(Super, Super)) - self.assertEqual(False, issubclass(Super, AbstractSuper)) - self.assertEqual(False, issubclass(Super, Child)) - - self.assertEqual(True, issubclass(Child, Child)) - self.assertEqual(True, issubclass(Child, Super)) - self.assertEqual(False, issubclass(Child, AbstractSuper)) - self.assertTrue(issubclass(typing.List, typing.List|typing.Tuple)) - self.assertFalse(issubclass(int, typing.List|typing.Tuple)) - - def test_subclass_abstract(self): - # abstract classes - self.assertEqual(True, issubclass(AbstractSuper, AbstractSuper)) - self.assertEqual(False, issubclass(AbstractSuper, AbstractChild)) - self.assertEqual(False, issubclass(AbstractSuper, Child)) - - self.assertEqual(True, issubclass(AbstractChild, AbstractChild)) - self.assertEqual(True, issubclass(AbstractChild, AbstractSuper)) - self.assertEqual(False, issubclass(AbstractChild, Super)) - self.assertEqual(False, issubclass(AbstractChild, Child)) - - def test_subclass_tuple(self): - # test with a tuple as the second argument classes - self.assertEqual(True, issubclass(Child, (Child,))) - self.assertEqual(True, issubclass(Child, (Super,))) - self.assertEqual(False, issubclass(Super, (Child,))) - self.assertEqual(True, issubclass(Super, (Child, Super))) - self.assertEqual(False, issubclass(Child, ())) - self.assertEqual(True, issubclass(Super, (Child, (Super,)))) - - self.assertEqual(True, issubclass(int, (int, (float, int)))) - self.assertEqual(True, issubclass(str, (str, (Child, str)))) - - def test_subclass_recursion_limit(self): - # make sure that issubclass raises RecursionError before the C stack is - # blown - with support.infinite_recursion(): - self.assertRaises(RecursionError, blowstack, issubclass, str, str) - - def 
test_isinstance_recursion_limit(self): - # make sure that issubclass raises RecursionError before the C stack is - # blown - with support.infinite_recursion(): - self.assertRaises(RecursionError, blowstack, isinstance, '', str) - - def test_subclass_with_union(self): - self.assertTrue(issubclass(int, int | float | int)) - self.assertTrue(issubclass(str, str | Child | str)) - self.assertFalse(issubclass(dict, float|str)) - self.assertFalse(issubclass(object, float|str)) - with self.assertRaises(TypeError): - issubclass(2, Child | Super) - with self.assertRaises(TypeError): - issubclass(int, list[int] | Child) - - def test_issubclass_refcount_handling(self): - # bpo-39382: abstract_issubclass() didn't hold item reference while - # peeking in the bases tuple, in the single inheritance case. - class A: - @property - def __bases__(self): - return (int, ) - - class B: - def __init__(self): - # setting this here increases the chances of exhibiting the bug, - # probably due to memory layout changes. - self.x = 1 - - @property - def __bases__(self): - return (A(), ) - - self.assertEqual(True, issubclass(B(), int)) - - def test_infinite_recursion_in_bases(self): - class X: - @property - def __bases__(self): - return self.__bases__ - with support.infinite_recursion(): - self.assertRaises(RecursionError, issubclass, X(), int) - self.assertRaises(RecursionError, issubclass, int, X()) - self.assertRaises(RecursionError, isinstance, 1, X()) - - def test_infinite_recursion_via_bases_tuple(self): - """Regression test for bpo-30570.""" - class Failure(object): - def __getattr__(self, attr): - return (self, None) - with support.infinite_recursion(): - with self.assertRaises(RecursionError): - issubclass(Failure(), int) - - def test_infinite_cycle_in_bases(self): - """Regression test for bpo-30570.""" - class X: - @property - def __bases__(self): - return (self, self, self) - with support.infinite_recursion(): - self.assertRaises(RecursionError, issubclass, X(), int) - - def test_infinitely_many_bases(self): - """Regression test for bpo-30570.""" - class X: - def __getattr__(self, attr): - self.assertEqual(attr, "__bases__") - class A: - pass - class B: - pass - A.__getattr__ = B.__getattr__ = X.__getattr__ - return (A(), B()) - with support.infinite_recursion(): - self.assertRaises(RecursionError, issubclass, X(), int) - - -def blowstack(fxn, arg, compare_to): - # Make sure that calling isinstance with a deeply nested tuple for its - # argument will raise RecursionError eventually. 
- tuple_arg = (compare_to,) - for cnt in range(sys.getrecursionlimit()+5): - tuple_arg = (tuple_arg,) - fxn(arg, tuple_arg) - - - -if __name__ == '__main__': - unittest.main()"#, - Mode::Module, - ) { - dbg!(token); - } - } } From 36e75e1c2a028b3f79325507c3aba8dda4519885 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Mon, 24 Jul 2023 18:04:01 +0200 Subject: [PATCH 6/6] Merge with magic parsing --- ast/src/generic.rs | 2 +- parser/src/lexer.rs | 9 ++++++--- parser/src/parser.rs | 40 +++++----------------------------------- 3 files changed, 12 insertions(+), 39 deletions(-) diff --git a/ast/src/generic.rs b/ast/src/generic.rs index 6665012f..df9575da 100644 --- a/ast/src/generic.rs +++ b/ast/src/generic.rs @@ -1,5 +1,5 @@ #![allow(clippy::derive_partial_eq_without_eq)] -use crate::text_size::TextRange; +use crate::text_size::{TextRange, TextSize}; pub(crate) use crate::{builtin::*, ConversionFlag, Node}; use std::fmt::{self, Debug}; diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index da541669..2df4d498 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -573,8 +573,12 @@ impl<'source> Lexer<'source> { self.consume_ascii_character(c) } else if is_unicode_identifier_start(c) { let identifier = self.lex_identifier(c)?; + self.state = State::Other; + Ok((identifier, self.token_range())) } else if is_emoji_presentation(c) { + self.state = State::Other; + Ok(( Tok::Name { name: c.to_string(), @@ -1426,9 +1430,8 @@ baz = %matplotlib \ fn assert_no_jupyter_magic(tokens: &[Tok]) { for tok in tokens { - match tok { - Tok::MagicCommand { .. } => panic!("Unexpected magic command token: {:?}", tok), - _ => {} + if let Tok::MagicCommand { .. } = tok { + panic!("Unexpected magic command token: {:?}", tok) } } } diff --git a/parser/src/parser.rs b/parser/src/parser.rs index 2acef8cc..b2675f96 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -396,41 +396,11 @@ pub fn parse_tokens( ) -> Result { let lxr = lxr.into_iter(); -<<<<<<< HEAD - let lxr = - lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)); - parse_filtered_tokens(lxr, mode, source_path) -======= - match mode { - Mode::Module | Mode::Interactive | Mode::Expression => parse_filtered_tokens( - lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)), - mode, - source_path, - ), - Mode::Jupyter => { - let mut after_magic = false; - parse_filtered_tokens( - lxr.filter_ok(|(tok, _)| match tok { - Tok::Comment(..) | Tok::NonLogicalNewline => { - after_magic = false; - false - } - Tok::Newline => !after_magic, - Tok::MagicCommand { .. } => { - after_magic = true; - false - } - _ => { - after_magic = false; - true - } - }), - mode, - source_path, - ) - } - } ->>>>>>> 58ac178 (Use single filter call) + parse_filtered_tokens( + lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)), + mode, + source_path, + ) } fn parse_filtered_tokens(
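
A note on the form-feed fix in patch 4: the change is a single `self.cursor.bump()` line buried next to a 370-line regression test, and the failure mode is worth spelling out. The indentation loop peeks at the next character and only breaks on something that starts a token; matching `'\x0C'` without consuming it means the peek returns the same form feed on every iteration. Below is a minimal, self-contained sketch of that invariant; the `Cursor` here is a hypothetical stand-in for the real one in parser/src/lexer/cursor.rs, reduced to the two operations the loop needs.

    struct Cursor<'a> {
        chars: std::str::Chars<'a>,
    }

    impl<'a> Cursor<'a> {
        fn new(source: &'a str) -> Self {
            Self { chars: source.chars() }
        }

        /// Peeks at the next character without consuming it.
        fn first(&self) -> Option<char> {
            self.chars.clone().next()
        }

        /// Consumes the next character.
        fn bump(&mut self) -> Option<char> {
            self.chars.next()
        }
    }

    fn eat_indentation(cursor: &mut Cursor) {
        while let Some(c) = cursor.first() {
            match c {
                ' ' | '\t' => {
                    cursor.bump();
                }
                // The bug: resetting the indentation here without calling
                // `bump()` leaves `first()` returning '\x0C' forever, so the
                // loop never makes progress. The fix consumes the form feed,
                // exactly as the one-line hunk in patch 4 does.
                '\x0C' => {
                    cursor.bump();
                }
                _ => break,
            }
        }
    }

    fn main() {
        let mut cursor = Cursor::new("\x0C    pass");
        eat_indentation(&mut cursor);
        // Prints "pass": both the form feed and the spaces were consumed.
        println!("{}", cursor.chars.as_str());
    }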
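
Patch 5's `State::AfterEqual` is the piece that makes magic assignments lex: `%` and `!` only start a magic command when the previous token was `=` and the lexer is at nesting level 0, which is why the `a % 3` inside the command's value stays plain text rather than becoming a `%` operator. A small usage sketch mirroring the new `test_jupyter_magic_assignment` case, assuming the crate's public `lexer::lex` entry point and the re-exported `Mode`:

    use rustpython_parser::{lexer::lex, Mode};

    fn main() {
        // Expected, per the new test: Name("bar"), Equal, then a single
        // MagicCommand { kind: Magic, value: "timeit a % 3" }, Newline.
        for result in lex("bar = %timeit a % 3", Mode::Jupyter) {
            let (token, range) = result.expect("the example source should lex cleanly");
            println!("{range:?}: {token:?}");
        }
    }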
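
With patch 6 the conflicted `parse_tokens` collapses back to a single `filter_ok` call for every mode: comments and non-logical newlines are stripped, while `MagicCommand` tokens now flow through to the grammar (the new `MagicKind` import in the generated parser points the same way). A hedged end-to-end sketch, assuming the re-exported `parse_tokens` and that the grammar in this series accepts a magic command on the right-hand side of an assignment:

    use rustpython_parser::{lexer::lex, parse_tokens, Mode};

    fn main() {
        let source = "foo = %timeit a % 3";
        // One lexer pass feeds the parser; no Jupyter-specific token
        // filtering happens out here anymore.
        match parse_tokens(lex(source, Mode::Jupyter), Mode::Jupyter, "<cell>") {
            Ok(module) => println!("{module:#?}"),
            Err(err) => eprintln!("parse error: {err}"),
        }
    }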