diff --git a/Cargo.toml b/Cargo.toml index 219221e6..65c534a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,6 @@ num-complex = "0.4.0" num-bigint = "0.4.3" num-traits = "0.2" pyo3 = { version = "0.19.0" } -malachite-bigint = { version = "0.1.0" } memchr = "2.5.0" rand = "0.8.5" serde = "1.0" diff --git a/ast/Cargo.toml b/ast/Cargo.toml index fe869346..b528a686 100644 --- a/ast/Cargo.toml +++ b/ast/Cargo.toml @@ -8,13 +8,11 @@ repository = "https://github.com/RustPython/Parser/" license = "MIT" [features] -default = ["malachite-bigint"] [dependencies] rustpython-parser-core = { workspace = true } rustpython-literal = { workspace = true, optional = true } is-macro = { workspace = true } -num-bigint = { workspace = true, optional = true } -malachite-bigint = { workspace = true, optional = true } +num-bigint = { workspace = true } static_assertions = "1.1.0" diff --git a/ast/src/builtin.rs b/ast/src/builtin.rs index b7fd3c8e..e10b8245 100644 --- a/ast/src/builtin.rs +++ b/ast/src/builtin.rs @@ -2,8 +2,8 @@ use rustpython_parser_core::text_size::TextRange; -use crate::bigint::BigInt; use crate::Ranged; +use num_bigint::BigInt; pub type String = std::string::String; diff --git a/ast/src/lib.rs b/ast/src/lib.rs index 1b12a93e..c4441867 100644 --- a/ast/src/lib.rs +++ b/ast/src/lib.rs @@ -20,9 +20,6 @@ mod generic; mod impls; mod ranged; -#[cfg(feature = "malachite-bigint")] -pub use malachite_bigint as bigint; -#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))] pub use num_bigint as bigint; pub use builtin::*; diff --git a/format/Cargo.toml b/format/Cargo.toml index b11b25db..f271c4eb 100644 --- a/format/Cargo.toml +++ b/format/Cargo.toml @@ -13,8 +13,4 @@ rustpython-literal = { workspace = true } bitflags = "2.3.1" itertools = "0.10.5" num-traits = { workspace = true } -num-bigint = { workspace = true, optional = true } -malachite-bigint = { workspace = true, optional = true } - -[features] -default = ["malachite-bigint"] \ No newline at end of file +num-bigint = { workspace = true } diff --git a/format/src/lib.rs b/format/src/lib.rs index 61de9d55..2a2679c7 100644 --- a/format/src/lib.rs +++ b/format/src/lib.rs @@ -1,6 +1,3 @@ -#[cfg(feature = "malachite-bigint")] -pub use malachite_bigint as bigint; -#[cfg(all(feature = "num-bigint", not(feature = "malachite-bigint")))] pub use num_bigint as bigint; pub use crate::format::*; diff --git a/literal/src/escape.rs b/literal/src/escape.rs index 082248a5..0b86a6d0 100644 --- a/literal/src/escape.rs +++ b/literal/src/escape.rs @@ -386,6 +386,7 @@ impl<'a> Escape for AsciiEscape<'a> { &self.layout } + #[allow(unsafe_code)] fn write_source(&self, formatter: &mut impl std::fmt::Write) -> std::fmt::Result { formatter.write_str(unsafe { // SAFETY: this function must be called only when source is printable ascii characters diff --git a/parser/Cargo.toml b/parser/Cargo.toml index b6c20ff8..b099c4e5 100644 --- a/parser/Cargo.toml +++ b/parser/Cargo.toml @@ -9,11 +9,7 @@ license = "MIT" edition = "2021" [features] -default = ["malachite-bigint"] serde = ["dep:serde", "rustpython-parser-core/serde"] -full-lexer = [] -malachite-bigint = ["dep:malachite-bigint", "rustpython-ast/malachite-bigint"] -num-bigint = ["dep:num-bigint", "rustpython-ast/num-bigint"] [build-dependencies] anyhow = { workspace = true } @@ -25,11 +21,11 @@ tiny-keccak = { version = "2", features = ["sha3"] } rustpython-ast = { workspace = true } rustpython-parser-core = { workspace = true } +bitflags = "2.3.3" itertools = { workspace = true } is-macro = { workspace 
= true } log = { workspace = true } -malachite-bigint = { workspace = true, optional = true } -num-bigint = { workspace = true, optional = true } +num-bigint = { workspace = true } num-traits = { workspace = true } unicode_names2 = { workspace = true } @@ -38,6 +34,7 @@ unic-ucd-ident = "0.9.0" lalrpop-util = { version = "0.20.0", default-features = false } phf = "0.11.1" rustc-hash = "1.1.0" +static_assertions = "1.1.0" serde = { version = "1.0.133", optional = true, default-features = false, features = ["derive"] } [dev-dependencies] diff --git a/parser/src/gen/parse.rs b/parser/src/gen/parse.rs index fafec6a1..3e725015 100644 --- a/parser/src/gen/parse.rs +++ b/parser/src/gen/parse.rs @@ -1,10 +1,7 @@ // This file was originally generated from asdl by a python script, but we now edit it manually impl Parse for ast::StmtFunctionDef { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -24,10 +21,7 @@ impl Parse for ast::StmtFunctionDef { } impl Parse for ast::StmtAsyncFunctionDef { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -47,10 +41,7 @@ impl Parse for ast::StmtAsyncFunctionDef { } impl Parse for ast::StmtClassDef { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -70,10 +61,7 @@ impl Parse for ast::StmtClassDef { } impl Parse for ast::StmtReturn { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -93,10 +81,7 @@ impl Parse for ast::StmtReturn { } impl Parse for ast::StmtDelete { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -116,10 +101,7 @@ impl Parse for ast::StmtDelete { } impl Parse for ast::StmtAssign { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -139,10 +121,7 @@ impl Parse for ast::StmtAssign { } impl Parse for ast::StmtTypeAlias { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -162,10 +141,7 @@ impl Parse for ast::StmtTypeAlias { } impl Parse for ast::StmtAugAssign { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -185,10 +161,7 @@ impl Parse for ast::StmtAugAssign { } impl Parse for ast::StmtAnnAssign { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: 
TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -208,10 +181,7 @@ impl Parse for ast::StmtAnnAssign { } impl Parse for ast::StmtFor { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -231,10 +201,7 @@ impl Parse for ast::StmtFor { } impl Parse for ast::StmtAsyncFor { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -254,10 +221,7 @@ impl Parse for ast::StmtAsyncFor { } impl Parse for ast::StmtWhile { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -277,10 +241,7 @@ impl Parse for ast::StmtWhile { } impl Parse for ast::StmtIf { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -300,10 +261,7 @@ impl Parse for ast::StmtIf { } impl Parse for ast::StmtWith { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -323,10 +281,7 @@ impl Parse for ast::StmtWith { } impl Parse for ast::StmtAsyncWith { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -346,10 +301,7 @@ impl Parse for ast::StmtAsyncWith { } impl Parse for ast::StmtMatch { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -369,10 +321,7 @@ impl Parse for ast::StmtMatch { } impl Parse for ast::StmtRaise { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -392,10 +341,7 @@ impl Parse for ast::StmtRaise { } impl Parse for ast::StmtTry { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -415,10 +361,7 @@ impl Parse for ast::StmtTry { } impl Parse for ast::StmtTryStar { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -438,10 +381,7 @@ impl Parse for ast::StmtTryStar { } impl Parse for ast::StmtAssert { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } 
fn parse_tokens( @@ -461,10 +401,7 @@ impl Parse for ast::StmtAssert { } impl Parse for ast::StmtImport { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -484,10 +421,7 @@ impl Parse for ast::StmtImport { } impl Parse for ast::StmtImportFrom { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -507,10 +441,7 @@ impl Parse for ast::StmtImportFrom { } impl Parse for ast::StmtGlobal { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -530,10 +461,7 @@ impl Parse for ast::StmtGlobal { } impl Parse for ast::StmtNonlocal { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -553,10 +481,7 @@ impl Parse for ast::StmtNonlocal { } impl Parse for ast::StmtExpr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -576,10 +501,7 @@ impl Parse for ast::StmtExpr { } impl Parse for ast::StmtPass { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -599,10 +521,7 @@ impl Parse for ast::StmtPass { } impl Parse for ast::StmtBreak { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -622,10 +541,7 @@ impl Parse for ast::StmtBreak { } impl Parse for ast::StmtContinue { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Stmt::lex_starts_at(source, offset) } fn parse_tokens( @@ -645,10 +561,7 @@ impl Parse for ast::StmtContinue { } impl Parse for ast::ExprBoolOp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -668,10 +581,7 @@ impl Parse for ast::ExprBoolOp { } impl Parse for ast::ExprNamedExpr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -691,10 +601,7 @@ impl Parse for ast::ExprNamedExpr { } impl Parse for ast::ExprBinOp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -714,10 +621,7 @@ impl Parse for 
ast::ExprBinOp { } impl Parse for ast::ExprUnaryOp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -737,10 +641,7 @@ impl Parse for ast::ExprUnaryOp { } impl Parse for ast::ExprLambda { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -760,10 +661,7 @@ impl Parse for ast::ExprLambda { } impl Parse for ast::ExprIfExp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -783,10 +681,7 @@ impl Parse for ast::ExprIfExp { } impl Parse for ast::ExprDict { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -806,10 +701,7 @@ impl Parse for ast::ExprDict { } impl Parse for ast::ExprSet { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -829,10 +721,7 @@ impl Parse for ast::ExprSet { } impl Parse for ast::ExprListComp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -852,10 +741,7 @@ impl Parse for ast::ExprListComp { } impl Parse for ast::ExprSetComp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -875,10 +761,7 @@ impl Parse for ast::ExprSetComp { } impl Parse for ast::ExprDictComp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -898,10 +781,7 @@ impl Parse for ast::ExprDictComp { } impl Parse for ast::ExprGeneratorExp { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -921,10 +801,7 @@ impl Parse for ast::ExprGeneratorExp { } impl Parse for ast::ExprAwait { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -944,10 +821,7 @@ impl Parse for ast::ExprAwait { } impl Parse for ast::ExprYield { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -967,10 +841,7 @@ impl Parse for ast::ExprYield { } impl Parse for ast::ExprYieldFrom { - fn 
lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -990,10 +861,7 @@ impl Parse for ast::ExprYieldFrom { } impl Parse for ast::ExprCompare { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1013,10 +881,7 @@ impl Parse for ast::ExprCompare { } impl Parse for ast::ExprCall { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1036,10 +901,7 @@ impl Parse for ast::ExprCall { } impl Parse for ast::ExprFormattedValue { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1059,10 +921,7 @@ impl Parse for ast::ExprFormattedValue { } impl Parse for ast::ExprJoinedStr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1082,10 +941,7 @@ impl Parse for ast::ExprJoinedStr { } impl Parse for ast::ExprConstant { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1105,10 +961,7 @@ impl Parse for ast::ExprConstant { } impl Parse for ast::ExprAttribute { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1128,10 +981,7 @@ impl Parse for ast::ExprAttribute { } impl Parse for ast::ExprSubscript { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1151,10 +1001,7 @@ impl Parse for ast::ExprSubscript { } impl Parse for ast::ExprStarred { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1174,10 +1021,7 @@ impl Parse for ast::ExprStarred { } impl Parse for ast::ExprName { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1197,10 +1041,7 @@ impl Parse for ast::ExprName { } impl Parse for ast::ExprList { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1220,10 +1061,7 @@ impl Parse for ast::ExprList { } impl Parse for ast::ExprTuple { - fn lex_starts_at( - 
source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -1243,10 +1081,7 @@ impl Parse for ast::ExprTuple { } impl Parse for ast::ExprSlice { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index eced33ce..e160a4c4 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -453,7 +453,6 @@ where } /// Lex a single comment. - #[cfg(feature = "full-lexer")] fn lex_comment(&mut self) -> LexResult { let start_pos = self.get_pos(); let mut value = String::new(); @@ -469,34 +468,12 @@ where } } - #[cfg(feature = "full-lexer")] fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> { let comment = self.lex_comment()?; self.emit(comment); Ok(()) } - /// Discard comment if full-lexer is not enabled. - #[cfg(not(feature = "full-lexer"))] - fn lex_comment(&mut self) { - loop { - match self.window[0] { - Some('\n' | '\r') | None => { - return; - } - Some(_) => {} - } - self.next_char().unwrap(); - } - } - - #[cfg(not(feature = "full-lexer"))] - #[inline] - fn lex_and_emit_comment(&mut self) -> Result<(), LexicalError> { - self.lex_comment(); - Ok(()) - } - /// Lex a single magic command. fn lex_magic_command(&mut self, kind: MagicKind) -> (Tok, TextRange) { let start_pos = self.get_pos(); @@ -713,12 +690,9 @@ where } Some('\n' | '\r') => { // Empty line! - #[cfg(feature = "full-lexer")] let tok_start = self.get_pos(); self.next_char(); - #[cfg(feature = "full-lexer")] let tok_end = self.get_pos(); - #[cfg(feature = "full-lexer")] self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end))); spaces = 0; tabs = 0; @@ -1194,7 +1168,6 @@ where self.at_begin_of_line = true; self.emit((Tok::Newline, TextRange::new(tok_start, tok_end))); } else { - #[cfg(feature = "full-lexer")] self.emit((Tok::NonLogicalNewline, TextRange::new(tok_start, tok_end))); } } @@ -1251,6 +1224,7 @@ where } // Used by single character tokens to advance the window and emit the correct token. 
+ #[allow(unsafe_code)] fn eat_single_char(&mut self, ty: Tok) { let tok_start = self.get_pos(); self.next_char().unwrap_or_else(|| unsafe { @@ -1527,49 +1501,41 @@ mod tests { value: "".to_string(), kind: MagicKind::Magic, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Magic2, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Shell, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::ShCap, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Help2, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Paren, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), kind: MagicKind::Quote, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "".to_string(), @@ -1605,61 +1571,51 @@ mod tests { value: "foo".to_string(), kind: MagicKind::Help, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo".to_string(), kind: MagicKind::Help2, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "timeit a = b".to_string(), kind: MagicKind::Magic, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "timeit a % 3".to_string(), kind: MagicKind::Magic, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "matplotlib --inline".to_string(), kind: MagicKind::Magic, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "pwd && ls -a | sed 's/^/\\\\ /'".to_string(), kind: MagicKind::Shell, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "cd /Users/foo/Library/Application\\ Support/".to_string(), kind: MagicKind::ShCap, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Paren, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "foo 1 2".to_string(), kind: MagicKind::Quote2, }, - #[cfg(feature = "full-lexer")] Tok::NonLogicalNewline, Tok::MagicCommand { value: "ls".to_string(), @@ -1714,7 +1670,6 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] fn $name() { let source = format!(r"99232 # {}", $eol); let tokens = lex_source(&source); @@ -1735,7 +1690,6 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] fn $name() { let source = format!("123 # Foo{}456", $eol); let tokens = lex_source(&source); @@ -1791,7 +1745,6 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] fn $name() { let source = format!("def foo():{} return 99{}{}", $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1829,7 +1782,6 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] fn $name() { let source = format!("def foo():{} if x:{}{} return 99{}{}", $eol, $eol, $eol, $eol, $eol); let 
tokens = lex_source(&source); @@ -1870,7 +1822,6 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] fn $name() { let source = format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol); let tokens = lex_source(&source); @@ -1923,7 +1874,6 @@ mod tests { ($($name:ident: $eol:expr,)*) => { $( #[test] - #[cfg(feature = "full-lexer")] fn $name() { let source = r"x = [ @@ -1986,7 +1936,6 @@ mod tests { } #[test] - #[cfg(feature = "full-lexer")] fn test_non_logical_newline_in_string_continuation() { let source = r"( 'a' @@ -2016,7 +1965,6 @@ mod tests { } #[test] - #[cfg(feature = "full-lexer")] fn test_logical_newline_line_comment() { let source = "#Hello\n#World\n"; let tokens = lex_source(source); @@ -2130,3 +2078,5 @@ mod tests { test_triple_quoted_unix_eol: UNIX_EOL, } } + +static_assertions::assert_eq_size!(Tok, [u8; 40]); diff --git a/parser/src/lexer_v2/compat_adapter.rs b/parser/src/lexer_v2/compat_adapter.rs new file mode 100644 index 00000000..bc860b20 --- /dev/null +++ b/parser/src/lexer_v2/compat_adapter.rs @@ -0,0 +1,162 @@ +use crate::lexer::{LexResult, LexicalError, LexicalErrorType}; +use crate::lexer_v2::token::TokenValue; +use crate::lexer_v2::{Lexer, TokenFlags, TokenKind}; +use crate::text_size::{TextRange, TextSize}; +use crate::Tok; +use num_bigint::BigInt; +use num_traits::Num; +use std::iter::FusedIterator; + +pub struct CompatAdapter<'a> { + offset: TextSize, + lexer: Lexer<'a>, +} + +impl<'a> CompatAdapter<'a> { + pub fn new(offset: TextSize, lexer: Lexer<'a>) -> Self { + Self { offset, lexer } + } +} + +impl<'a> Iterator for CompatAdapter<'a> { + type Item = LexResult; + + fn next(&mut self) -> Option { + let (tok, range) = loop { + let item = self.lexer.next_token(); + let start = self.offset; + self.offset += item.length; + + let tok = match item.kind { + TokenKind::Int => { + let value = match item.value { + TokenValue::BigInt(value) => value, + TokenValue::String(value) => BigInt::from_str_radix(&value, 10).unwrap(), + _ => panic!("Expected bigint"), + }; + Tok::Int { value } + } + TokenKind::Float => Tok::Float { value: 0.0 }, + TokenKind::Complex => Tok::Complex { + real: 0.0, + imag: 0.0, + }, + TokenKind::String => Tok::String { + value: item.value.unwrap_into_string(), + kind: item.flags.as_string_kind(), + triple_quoted: item.flags.contains(TokenFlags::TripleQuoted), + }, + TokenKind::Identifier => Tok::Name { + name: item.value.unwrap_into_string(), + }, + TokenKind::Comment => Tok::Comment(item.value.unwrap_into_string()), + TokenKind::NonLogicalNewline => Tok::NonLogicalNewline, + TokenKind::LineContinuation => continue, + TokenKind::EndOfFile => { + return None; + } + TokenKind::Whitespace => continue, + TokenKind::Newline => Tok::Newline, + TokenKind::Indent => Tok::Indent, + TokenKind::Dedent => Tok::Dedent, + TokenKind::Lpar => Tok::Lpar, + TokenKind::Rpar => Tok::Rpar, + TokenKind::Lsqb => Tok::Lsqb, + TokenKind::Rsqb => Tok::Rsqb, + TokenKind::Colon => Tok::Colon, + TokenKind::Comma => Tok::Comma, + TokenKind::Semi => Tok::Semi, + TokenKind::Plus => Tok::Plus, + TokenKind::Minus => Tok::Minus, + TokenKind::Star => Tok::Star, + TokenKind::Slash => Tok::Slash, + TokenKind::Vbar => Tok::Vbar, + TokenKind::Amper => Tok::Amper, + TokenKind::Less => Tok::Less, + TokenKind::Greater => Tok::Greater, + TokenKind::Equal => Tok::Equal, + TokenKind::Dot => Tok::Dot, + TokenKind::Percent => Tok::Percent, + TokenKind::Lbrace => Tok::Lbrace, + TokenKind::Rbrace => Tok::Rbrace, + TokenKind::EqEqual => 
Tok::EqEqual, + TokenKind::NotEqual => Tok::NotEqual, + TokenKind::LessEqual => Tok::LessEqual, + TokenKind::GreaterEqual => Tok::GreaterEqual, + TokenKind::Tilde => Tok::Tilde, + TokenKind::CircumFlex => Tok::CircumFlex, + TokenKind::LeftShift => Tok::LeftShift, + TokenKind::RightShift => Tok::RightShift, + TokenKind::DoubleStar => Tok::DoubleStar, + TokenKind::DoubleStarEqual => Tok::DoubleStarEqual, + TokenKind::PlusEqual => Tok::PlusEqual, + TokenKind::MinusEqual => Tok::MinusEqual, + TokenKind::StarEqual => Tok::StarEqual, + TokenKind::SlashEqual => Tok::SlashEqual, + TokenKind::PercentEqual => Tok::PercentEqual, + TokenKind::AmperEqual => Tok::AmperEqual, + TokenKind::VbarEqual => Tok::VbarEqual, + TokenKind::CircumflexEqual => Tok::CircumflexEqual, + TokenKind::LeftShiftEqual => Tok::LeftShiftEqual, + TokenKind::RightShiftEqual => Tok::RightShiftEqual, + TokenKind::DoubleSlash => Tok::DoubleSlash, + TokenKind::DoubleSlashEqual => Tok::DoubleSlashEqual, + TokenKind::ColonEqual => Tok::ColonEqual, + TokenKind::At => Tok::At, + TokenKind::AtEqual => Tok::AtEqual, + TokenKind::Rarrow => Tok::Rarrow, + TokenKind::Ellipsis => Tok::Ellipsis, + TokenKind::False => Tok::False, + TokenKind::None => Tok::None, + TokenKind::True => Tok::True, + TokenKind::And => Tok::And, + TokenKind::As => Tok::As, + TokenKind::Assert => Tok::Assert, + TokenKind::Async => Tok::Async, + TokenKind::Await => Tok::Await, + TokenKind::Break => Tok::Break, + TokenKind::Class => Tok::Class, + TokenKind::Continue => Tok::Continue, + TokenKind::Def => Tok::Def, + TokenKind::Del => Tok::Del, + TokenKind::Elif => Tok::Elif, + TokenKind::Else => Tok::Else, + TokenKind::Except => Tok::Except, + TokenKind::Finally => Tok::Finally, + TokenKind::For => Tok::For, + TokenKind::From => Tok::From, + TokenKind::Global => Tok::Global, + TokenKind::If => Tok::If, + TokenKind::Import => Tok::Import, + TokenKind::In => Tok::In, + TokenKind::Is => Tok::Is, + TokenKind::Lambda => Tok::Lambda, + TokenKind::Nonlocal => Tok::Nonlocal, + TokenKind::Not => Tok::Not, + TokenKind::Or => Tok::Or, + TokenKind::Pass => Tok::Pass, + TokenKind::Raise => Tok::Raise, + TokenKind::Return => Tok::Return, + TokenKind::Try => Tok::Try, + TokenKind::While => Tok::While, + TokenKind::With => Tok::With, + TokenKind::Yield => Tok::Yield, + TokenKind::Match => Tok::Match, + TokenKind::Case => Tok::Case, + TokenKind::Type => Tok::Type, + TokenKind::Bogus => { + return Some(Err(LexicalError::new( + LexicalErrorType::OtherError("Compat error".to_string()), + start, + ))) + } + }; + + break (tok, TextRange::new(start, self.offset)); + }; + + Some(Ok((tok, range))) + } +} + +impl<'a> FusedIterator for CompatAdapter<'a> {} diff --git a/parser/src/lexer_v2/cursor.rs b/parser/src/lexer_v2/cursor.rs new file mode 100644 index 00000000..90f9f7b2 --- /dev/null +++ b/parser/src/lexer_v2/cursor.rs @@ -0,0 +1,108 @@ +use crate::text_size::{TextLen, TextSize}; +use std::str::Chars; + +pub(crate) const EOF_CHAR: char = '\0'; + +#[derive(Clone, Debug)] +pub(super) struct Cursor<'a> { + chars: Chars<'a>, + source_length: TextSize, + #[cfg(debug_assertions)] + prev_char: char, +} + +impl<'a> Cursor<'a> { + pub fn new(source: &'a str) -> Self { + Self { + source_length: source.text_len(), + chars: source.chars(), + #[cfg(debug_assertions)] + prev_char: EOF_CHAR, + } + } + + /// Returns the previous token. Useful for debug assertions. 
+    #[cfg(debug_assertions)]
+    pub(super) const fn previous(&self) -> char {
+        self.prev_char
+    }
+
+    /// Peeks the next character from the input stream without consuming it.
+    /// Returns [EOF_CHAR] if the lexer is at the end of the file.
+    pub(super) fn first(&self) -> char {
+        self.chars.clone().next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Peeks the second character from the input stream without consuming it.
+    /// Returns [EOF_CHAR] if the position is past the end of the file.
+    pub(super) fn second(&self) -> char {
+        let mut chars = self.chars.clone();
+        chars.next();
+        chars.next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Peeks the rest of the input stream without consuming it.
+    /// Returns the remaining source text as a string slice.
+    pub(super) fn rest(&self) -> &'a str {
+        self.chars.as_str()
+    }
+
+    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
+    #[allow(clippy::cast_possible_truncation)]
+    pub(super) fn text_len(&self) -> TextSize {
+        TextSize::new(self.chars.as_str().len() as u32)
+    }
+
+    pub(super) fn token_len(&self) -> TextSize {
+        self.source_length - self.text_len()
+    }
+
+    pub(super) fn start_token(&mut self) {
+        self.source_length = self.text_len()
+    }
+
+    pub(super) fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Consumes the next character.
+    pub(super) fn bump(&mut self) -> Option<char> {
+        let prev = self.chars.next()?;
+
+        #[cfg(debug_assertions)]
+        {
+            self.prev_char = prev;
+        }
+
+        Some(prev)
+    }
+
+    pub(super) fn eat_char(&mut self, c: char) -> bool {
+        if self.first() == c {
+            self.bump();
+            true
+        } else {
+            false
+        }
+    }
+
+    pub(super) fn eat_if<F>(&mut self, mut predicate: F) -> Option<char>
+    where
+        F: FnMut(char) -> bool,
+    {
+        if predicate(self.first()) && !self.is_eof() {
+            self.bump()
+        } else {
+            None
+        }
+    }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    pub(super) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // We tried writing an optimized version of this for e.g. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
+    }
+}
diff --git a/parser/src/lexer_v2/indentation.rs b/parser/src/lexer_v2/indentation.rs
new file mode 100644
index 00000000..7cd48454
--- /dev/null
+++ b/parser/src/lexer_v2/indentation.rs
@@ -0,0 +1,125 @@
+use static_assertions::assert_eq_size;
+use std::cmp::Ordering;
+use std::fmt::Debug;
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Column(u32);
+
+impl Column {
+    pub(super) const fn new(column: u32) -> Self {
+        Self(column)
+    }
+}
+
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Default)]
+pub(super) struct Character(u32);
+
+impl Character {
+    pub(super) const fn new(characters: u32) -> Self {
+        Self(characters)
+    }
+}
+
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Default)]
+pub(super) struct Indentation {
+    column: Column,
+    character: Character,
+}
+
+impl Indentation {
+    pub(super) const fn root() -> Self {
+        Self {
+            column: Column::new(0),
+            character: Character::new(0),
+        }
+    }
+
+    pub(super) const fn new(column: Column, character: Character) -> Self {
+        Self { character, column }
+    }
+
+    pub(super) const fn column(&self) -> Column {
+        self.column
+    }
+
+    pub(super) const fn character(&self) -> Character {
+        self.character
+    }
+
+    pub(super) fn try_compare(
+        &self,
+        other: &Indentation,
+    ) -> Result<Ordering, UnexpectedIndentation> {
+        let column_ordering = self.column.cmp(&other.column);
+        let character_ordering = self.character.cmp(&other.character);
+
+        if column_ordering == character_ordering {
+            Ok(column_ordering)
+        } else {
+            Err(UnexpectedIndentation)
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub(super) struct UnexpectedIndentation;
+
+// The indentations stack is used to keep track of the current indentation level.
+// Similar to the CPython implementation, the Indentations stack always has at
+// least one level which is never popped. See Reference 2.1.8.
+#[derive(Debug, Clone)]
+pub(super) struct Indentations {
+    stack: Vec<Indentation>,
+}
+
+impl Indentations {
+    pub fn is_empty(&self) -> bool {
+        self.stack.len() == 1
+    }
+
+    pub fn push(&mut self, indent: Indentation) {
+        debug_assert_eq!(self.current().try_compare(&indent), Ok(Ordering::Less));
+
+        self.stack.push(indent);
+    }
+
+    pub fn pop(&mut self) -> Option<Indentation> {
+        if self.is_empty() {
+            None
+        } else {
+            self.stack.pop()
+        }
+    }
+
+    pub fn current(&self) -> &Indentation {
+        self.stack.last().expect("Expected indentation")
+    }
+}
+
+impl Default for Indentations {
+    fn default() -> Self {
+        Self {
+            stack: vec![Indentation::root()],
+        }
+    }
+}
+
+assert_eq_size!(Indentation, u64);
+
+#[cfg(test)]
+mod tests {
+    use crate::lexer_v2::indentation::{Character, Column, Indentation};
+    use std::cmp::Ordering;
+
+    #[test]
+    fn indentation_try_compare() {
+        let tab = Indentation::new(Column::new(8), Character::new(1));
+
+        assert_eq!(tab.try_compare(&tab), Ok(Ordering::Equal));
+
+        let two_tabs = Indentation::new(Column::new(16), Character::new(2));
+        assert_eq!(two_tabs.try_compare(&tab), Ok(Ordering::Greater));
+        assert_eq!(tab.try_compare(&two_tabs), Ok(Ordering::Less));
+    }
+}
diff --git a/parser/src/lexer_v2/mod.rs b/parser/src/lexer_v2/mod.rs
new file mode 100644
index 00000000..0c99b509
--- /dev/null
+++ b/parser/src/lexer_v2/mod.rs
@@ -0,0 +1,1371 @@
+//! This module takes care of lexing Python source text.
+//!
+//! This means source code is scanned and translated into separate tokens. The rules
+//! governing what is and is not a valid token are defined in the Python reference
+//! guide section on [Lexical analysis].
+//!
+//! The primary function in this module is [`lex`], which takes a string slice
+//! and returns an iterator over the tokens in the source code. The tokens are currently returned
+//! as a `Result`, where [`Spanned`] is a tuple containing the
+//! start and end [`TextSize`] and a [`TokenKind`] denoting the token.
+//!
+//! # Example
+//!
+//! ```
+//! use rustpython_parser::{lexer::lex, Tok, Mode, StringKind};
+//!
+//! let source = "x = 'RustPython'";
+//! let tokens = lex(source, Mode::Module)
+//!     .map(|tok| tok.expect("Failed to lex"))
+//!     .collect::<Vec<_>>();
+//!
+//! for (token, range) in tokens {
+//!     println!(
+//!         "{token:?}@{range:?}",
+//!     );
+//! }
+//! ```
+//!
+//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
+use std::borrow::Cow;
+use std::str::FromStr;
+use std::{char, cmp::Ordering};
+
+use num_bigint::BigInt;
+use num_traits::Num;
+use unic_emoji_char::is_emoji_presentation;
+use unic_ucd_ident::{is_xid_continue, is_xid_start};
+
+pub use token::{Token, TokenFlags, TokenKind};
+
+use crate::lexer_v2::cursor::{Cursor, EOF_CHAR};
+use crate::lexer_v2::indentation::{Character, Column, Indentation, Indentations};
+use crate::lexer_v2::token::StringKind;
+use crate::text_size::{TextRange, TextSize};
+use crate::Mode;
+
+pub(crate) mod compat_adapter;
+mod cursor;
+mod indentation;
+mod token;
+
+#[derive(Debug, Clone)]
+pub struct LexDiagnostic {
+    range: TextRange,
+    message: String,
+}
+
+/// A lexer for Python source code.
+#[derive(Clone, Debug)]
+pub struct Lexer<'source> {
+    cursor: Cursor<'source>,
+
+    source: &'source str,
+
+    // Are we at the beginning of a line?
+    new_logical_line: bool,
+    is_blank: bool,
+    left_parens: u32,
+
+    // Indentation levels.
+    indentations: Indentations,
+    pending_indentation: Option<Indentation>,
+
+    diagnostics: Vec<LexDiagnostic>,
+}
+
+/// Create a new lexer from a source string.
+///
+/// # Examples
+///
+/// ```
+/// use rustpython_parser::{Mode, lexer::lex};
+///
+/// let source = "def hello(): return 'world'";
+/// let lexer = lex(source, Mode::Module);
+///
+/// for token in lexer {
+///     println!("{:?}", token);
+/// }
+/// ```
+#[inline]
+pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
+    lex_starts_at(source, mode, TextSize::default())
+}
+
+/// Create a new lexer from a source string, starting at a given location.
+/// You probably want to use [`lex`] instead.
+pub fn lex_starts_at(source: &str, _mode: Mode, start_offset: TextSize) -> Lexer<'_> {
+    Lexer::new(source, start_offset)
+}
+
+impl<'source> Lexer<'source> {
+    /// Create a new lexer from a source string and a starting location. You probably want to use
+    /// [`lex`] instead.
+    pub fn new(source: &'source str, _start: TextSize) -> Self {
+        let mut lexer = Lexer {
+            new_logical_line: true,
+            is_blank: true,
+            left_parens: 0,
+            indentations: Indentations::default(),
+            cursor: Cursor::new(source),
+            diagnostics: Vec::new(),
+            pending_indentation: None,
+            source,
+        };
+
+        // TODO: Handle possible mismatch between BOM and explicit encoding declaration.
+ // spell-checker:ignore feff + lexer.cursor.eat_char('\u{feff}'); + + lexer + } + + pub fn finish(self) -> Vec { + self.diagnostics + } + + pub fn next_token(&mut self) -> Token<'source> { + if let Some(indentation) = self.pending_indentation.take() { + match self.indentations.current().try_compare(&indentation) { + Ok(Ordering::Greater) => { + self.pending_indentation = Some(indentation); + self.indentations.pop(); + return Token::new(TokenKind::Dedent, TextSize::new(0)); + } + Ok(Ordering::Equal) => { + if indentation.character() != Character::new(0) { + return Token::new(TokenKind::Whitespace, self.cursor.token_len()); + } + } + _ => { + unreachable!("Invalid indentation stack. Parent indentation was smaller than this indentation.") + } + } + } + + if self.cursor.is_eof() { + return if !self.new_logical_line { + self.new_logical_line = true; + Token::new(TokenKind::Newline, TextSize::new(0)) + } else if let Some(dedent) = self.handle_indentation(Indentation::root()) { + dedent + } else { + Token::eof() + }; + } + + // #[cfg(debug_assertions)] + // { + // if self.new_logical_line { + // debug_assert!(matches!( + // self.cursor.previous(), + // '\n' | '\r' | cursor::EOF_CHAR + // )); + // } + // } + + self.cursor.start_token(); + + let first = self.cursor.first(); + + if let Some(trivia) = self.eat_trivia(first) { + return trivia; + } + + self.is_blank = false; + self.cursor.bump(); + self.lex_non_trivia(first) + } + + fn lex_non_trivia(&mut self, first: char) -> Token<'source> { + if first.is_ascii() { + match first { + 'a'..='z' | 'A'..='Z' | '_' => self.lex_identifier(first), + '0'..='9' => self.lex_number(first), + '"' | '\'' => self.lex_string(StringKind::String, first), + '=' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::EqEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Equal, TextSize::new(1)) + } + } + '+' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::PlusEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Plus, TextSize::new(1)) + } + } + '*' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::StarEqual, TextSize::new(2)) + } else if self.cursor.eat_char('*') { + if self.cursor.eat_char('=') { + Token::new(TokenKind::DoubleStarEqual, TextSize::new(3)) + } else { + Token::new(TokenKind::DoubleStar, TextSize::new(2)) + } + } else { + Token::new(TokenKind::Star, TextSize::new(1)) + } + } + '/' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::SlashEqual, TextSize::new(2)) + } else if self.cursor.eat_char('/') { + if self.cursor.eat_char('=') { + Token::new(TokenKind::DoubleSlashEqual, TextSize::new(3)) + } else { + Token::new(TokenKind::DoubleSlash, TextSize::new(2)) + } + } else { + Token::new(TokenKind::Slash, TextSize::new(1)) + } + } + '%' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::PercentEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Percent, TextSize::new(1)) + } + } + '|' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::VbarEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Vbar, TextSize::new(1)) + } + } + '^' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::CircumflexEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::CircumFlex, TextSize::new(1)) + } + } + '&' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::AmperEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Amper, TextSize::new(1)) + } + } + '-' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::MinusEqual, TextSize::new(2)) + } else if 
self.cursor.eat_char('>') { + Token::new(TokenKind::Rarrow, TextSize::new(2)) + } else { + Token::new(TokenKind::Minus, TextSize::new(1)) + } + } + '@' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::AtEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::At, TextSize::new(1)) + } + } + '!' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::NotEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Bogus, TextSize::new(1)) + } + } + '~' => Token::new(TokenKind::Tilde, TextSize::new(1)), + // TODO emit an error token if below zero? No, because we also don't emit an error + // token for extra parens? + '(' => { + self.left_parens = self.left_parens.saturating_add(1); + Token::new(TokenKind::Lpar, TextSize::new(1)) + } + ')' => { + self.left_parens = self.left_parens.saturating_sub(1); + Token::new(TokenKind::Rpar, TextSize::new(1)) + } + '[' => { + self.left_parens = self.left_parens.saturating_add(1); + Token::new(TokenKind::Lsqb, TextSize::new(1)) + } + ']' => { + self.left_parens = self.left_parens.saturating_sub(1); + Token::new(TokenKind::Rsqb, TextSize::new(1)) + } + '{' => { + self.left_parens = self.left_parens.saturating_add(1); + Token::new(TokenKind::Lbrace, TextSize::new(1)) + } + '}' => { + self.left_parens = self.left_parens.saturating_sub(1); + Token::new(TokenKind::Rbrace, TextSize::new(1)) + } + ':' => { + if self.cursor.eat_char('=') { + Token::new(TokenKind::ColonEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Colon, TextSize::new(1)) + } + } + ';' => Token::new(TokenKind::Semi, TextSize::new(1)), + '<' => { + if self.cursor.eat_char('<') { + if self.cursor.eat_char('=') { + Token::new(TokenKind::LeftShiftEqual, TextSize::new(3)) + } else { + Token::new(TokenKind::LeftShift, TextSize::new(2)) + } + } else if self.cursor.eat_char('=') { + Token::new(TokenKind::LessEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Less, TextSize::new(1)) + } + } + '>' => { + if self.cursor.eat_char('>') { + if self.cursor.eat_char('=') { + Token::new(TokenKind::RightShiftEqual, TextSize::new(3)) + } else { + Token::new(TokenKind::RightShift, TextSize::new(2)) + } + } else if self.cursor.eat_char('=') { + Token::new(TokenKind::GreaterEqual, TextSize::new(2)) + } else { + Token::new(TokenKind::Greater, TextSize::new(1)) + } + } + ',' => Token::new(TokenKind::Comma, TextSize::new(1)), + '.' => match self.cursor.first() { + '0'..='9' => self.lex_number('.'), + '.' if self.cursor.second() == '.' => { + self.cursor.bump(); + self.cursor.bump(); + Token::new(TokenKind::Ellipsis, TextSize::new(3)) + } + + _ => Token::new(TokenKind::Dot, TextSize::new(1)), + }, + '#' => self.lex_comment(), + // Line continuation. We should emit a token for the line continuation + '\\' => { + let continuation = if self.cursor.eat_char('\r') { + self.cursor.eat_char('\n'); + true + } else { + self.cursor.eat_char('\n') + }; + + if continuation { + self.new_logical_line = false; + // FIXME The new line should really not be consumed... 
but that's kind of hard + Token::new(TokenKind::LineContinuation, self.cursor.token_len()) + } else { + // TODO emit diagnostic + + Token::new(TokenKind::Bogus, TextSize::new(1)) + } + } + _ => Token::new(TokenKind::Bogus, TextSize::new(1)), + } + } else if is_non_ascii_identifier_start(first) { + self.lex_identifier(first) + } else if is_emoji_presentation(first) { + Token::new(TokenKind::Identifier, self.cursor.token_len()) + } else { + Token::new(TokenKind::Bogus, self.cursor.text_len()) + } + } + + // TODO handle \x0C + + fn eat_trivia(&mut self, first: char) -> Option> { + let token = match first { + prev @ (' ' | '\t') => { + self.cursor.bump(); + + if self.new_logical_line { + let indentation = self.lex_indentation(prev); + + // Indention of an all whitespace line or comment only line. Indention rules don't apply + if matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) { + Token::new(TokenKind::Whitespace, self.cursor.token_len()) + } else { + self.new_logical_line = false; + return self.handle_indentation(indentation); + } + } else { + // Skip over whitespace + self.cursor.eat_while(|c| matches!(c, ' ' | '\t')); + Token::new(TokenKind::Whitespace, self.cursor.token_len()) + } + } + + '#' => { + self.cursor.bump(); + self.lex_comment() + } + + '\n' => { + self.cursor.bump(); + let kind = self.newline_token_kind(); + self.new_logical_line = self.new_logical_line || kind == TokenKind::Newline; + self.is_blank = true; + Token::new(kind, TextSize::new(1)) + } + // `\r` or `\r\n` + '\r' => { + self.cursor.bump(); + let kind = self.newline_token_kind(); + self.new_logical_line = self.new_logical_line || kind == TokenKind::Newline; + self.is_blank = true; + let len = if self.cursor.eat_char('\n') { + TextSize::new(2) + } else { + TextSize::new(1) + }; + Token::new(kind, len) + } + + '\x0C' => { + // Skip over whitespace + self.cursor.bump(); + self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C')); + self.is_blank = true; + Token::new(TokenKind::Whitespace, self.cursor.token_len()) + } + + _ => { + return if self.new_logical_line { + self.new_logical_line = false; + self.handle_indentation(Indentation::root()) + } else { + None + }; + } + }; + + Some(token) + } + + fn newline_token_kind(&self) -> TokenKind { + if self.is_blank || self.left_parens > 0 { + TokenKind::NonLogicalNewline + } else { + TokenKind::Newline + } + } + + fn lex_indentation(&mut self, first: char) -> Indentation { + debug_assert!(self.new_logical_line); + debug_assert!(matches!(first, ' ' | '\t')); + + let mut column = 0u32; + let mut character = 0u32; + + if first == ' ' { + column += 1; + character += 1; + } else { + column += 8; + character += 1; + } + + loop { + match self.cursor.first() { + ' ' => { + column += 1; + } + '\t' => column = (column % 8) + column, + _ => break, + } + + self.cursor.bump(); + character += 1; + } + + Indentation::new(Column::new(column), Character::new(character)) + } + + fn handle_indentation(&mut self, indentation: Indentation) -> Option> { + match self.indentations.current().try_compare(&indentation) { + // Dedent + Ok(Ordering::Greater) => { + self.indentations.pop(); + self.pending_indentation = Some(indentation); + + Some(Token::new(TokenKind::Dedent, TextSize::new(0))) + } + + Ok(Ordering::Equal) => { + if indentation.character() != Character::new(0) { + Some(Token::new(TokenKind::Whitespace, self.cursor.token_len())) + } else { + None + } + } + + // Indent + Ok(Ordering::Less) => { + self.indentations.push(indentation); + Some(Token::new(TokenKind::Indent, 
self.cursor.token_len())) + } + Err(_) => { + self.diagnostics.push(LexDiagnostic { + // TODO add right range + range: self.token_range(), + message: "Unexpected indent".to_string(), + }); + + Some(Token::new(TokenKind::Bogus, self.cursor.token_len())) + } + } + } + + #[inline] + fn token_range(&self) -> TextRange { + let end = self.offset(); + let len = self.cursor.token_len(); + + TextRange::at(end - len, len) + } + + #[inline] + fn offset(&self) -> TextSize { + TextSize::new(self.source.len() as u32) - self.cursor.text_len() + } + + fn lex_comment(&mut self) -> Token<'source> { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), '#'); + + self.cursor.eat_while(|c| !matches!(c, '\n' | '\r')); + + let range = self.token_range(); + let comment = Cow::Borrowed(&self.source[range]); + + Token::new(TokenKind::Comment, self.cursor.token_len()).with_string_value(comment) + } + + /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. + fn lex_identifier(&mut self, first: char) -> Token<'source> { + match self.cursor.first() { + quote @ ('\'' | '"') => { + if let Ok(string_kind) = StringKind::try_from(first) { + self.cursor.bump(); + return self.lex_string(string_kind, quote); + } + } + second @ ('f' | 'F' | 'r' | 'R' | 'b' | 'B') if is_quote(self.cursor.second()) => { + self.cursor.bump(); + + if let Ok(string_kind) = StringKind::try_from([first, second]) { + let quote = self.cursor.bump().unwrap(); + return self.lex_string(string_kind, quote); + } + } + _ => {} + } + + self.cursor.eat_while(is_identifier_continuation); + + let range = self.token_range(); + let text = &self.source[range]; + + let keyword = match text { + "False" => TokenKind::False, + "None" => TokenKind::None, + "True" => TokenKind::True, + "and" => TokenKind::And, + "as" => TokenKind::As, + "assert" => TokenKind::Assert, + "async" => TokenKind::Async, + "await" => TokenKind::Await, + "break" => TokenKind::Break, + "case" => TokenKind::Case, + "class" => TokenKind::Class, + "continue" => TokenKind::Continue, + "def" => TokenKind::Def, + "del" => TokenKind::Del, + "elif" => TokenKind::Elif, + "else" => TokenKind::Else, + "except" => TokenKind::Except, + "finally" => TokenKind::Finally, + "for" => TokenKind::For, + "from" => TokenKind::From, + "global" => TokenKind::Global, + "if" => TokenKind::If, + "import" => TokenKind::Import, + "in" => TokenKind::In, + "is" => TokenKind::Is, + "lambda" => TokenKind::Lambda, + "match" => TokenKind::Match, + "nonlocal" => TokenKind::Nonlocal, + "not" => TokenKind::Not, + "or" => TokenKind::Or, + "pass" => TokenKind::Pass, + "raise" => TokenKind::Raise, + "return" => TokenKind::Return, + "try" => TokenKind::Try, + "type" => TokenKind::Type, + "while" => TokenKind::While, + "with" => TokenKind::With, + "yield" => TokenKind::Yield, + _ => { + return Token::new(TokenKind::Identifier, range.len()) + .with_string_value(Cow::Borrowed(text)) + } + }; + + Token::new(keyword, range.len()) + } + + /// Numeric lexing. The feast can start! + fn lex_number(&mut self, first: char) -> Token<'source> { + if first == '0' { + if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() { + // Hex! (0xdeadbeef) + return self.lex_number_radix(Radix::Hex); + } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() { + // Octal style! (0o377) + return self.lex_number_radix(Radix::Octal); + } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() { + // Binary! 
(0b_1110_0101) + return self.lex_number_radix(Radix::Binary); + } + } + + self.lex_decimal_number(first) + } + + /// Lex a hex/octal/decimal/binary number without a decimal point. + fn lex_number_radix(&mut self, radix: Radix) -> Token<'source> { + #[cfg(debug_assertions)] + debug_assert!(matches!( + self.cursor.previous().to_ascii_lowercase(), + 'x' | 'o' | 'b' + )); + + let value_text = self.radix_run(radix, self.offset()); + + // TODO Create our own `Write` and write the value to it. Returns `Borrowed` if the + // formatted radix is the same as the source text. + let value = match BigInt::from_str_radix(&value_text, radix.as_u32()) { + Ok(value) => value, + Err(_) => { + // TODO emit diagnostic + return Token::new(TokenKind::Bogus, self.cursor.token_len()); + } + }; + + Token::new(TokenKind::Int, self.cursor.token_len()).with_bigint_value(value) + } + + /// Consume a sequence of numbers with the given radix, + /// the digits can be decorated with underscores + /// like this: '1_2_3_4' == '1234' + fn radix_run(&mut self, radix: Radix, start: TextSize) -> Cow<'source, str> { + loop { + if self.eat_digit(radix).is_some() { + // nothing + } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { + break; + } else { + return Cow::Borrowed(&self.source[TextRange::new(start, self.offset())]); + } + } + + let len = self.offset() - start; + let mut cleaned = String::from(&self.source[TextRange::at(start, len)]); + + loop { + if let Some(c) = self.eat_digit(radix) { + cleaned.push(c); + } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { + cleaned.push(self.cursor.second()); + self.cursor.bump(); + self.cursor.bump(); + } else { + break; + } + } + Cow::Owned(cleaned) + } + + /// Consume a single character with the given radix. + fn eat_digit(&mut self, radix: Radix) -> Option { + self.cursor.eat_if(|c| radix.is_digit(c)) + } + + // TODO implement parsing manually + /// Lex a normal number, that is, no octal, hex or binary number. + fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> Token<'source> { + #[cfg(debug_assertions)] + debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.'); + + let start = self.offset() - TextSize::new(1); + let start_is_zero = first_digit_or_dot == '0'; + + // Normal number: + let value_text = self.radix_run(Radix::Decimal, start); + + // If float: + let (is_float, before_exponent) = if self.cursor.eat_char('.') || first_digit_or_dot == '.' + { + if self.cursor.eat_char('_') { + // TODO emit a diagnostic; + return Token::new(TokenKind::Bogus, self.cursor.token_len()); + } + + let after_dot = self.radix_run(Radix::Decimal, self.offset()); + let text = if matches!(value_text, Cow::Borrowed(_)) + && matches!(after_dot, Cow::Borrowed(_)) + { + Cow::Borrowed(&self.source[self.token_range()]) + } else { + Cow::Owned(format!("{}.{}", value_text, after_dot)) + }; + + (true, text) + } else { + (false, value_text) + }; + + let (is_float, number) = match self.cursor.rest().as_bytes() { + [b'e' | b'E', b'0'..=b'9', ..] 
=> { + self.cursor.bump(); + + let exponent = self.radix_run(Radix::Decimal, self.offset()); + + let number = match (before_exponent, exponent) { + (Cow::Borrowed(_), Cow::Borrowed(_)) => { + Cow::Borrowed(&self.source[self.token_range()]) + } + + (Cow::Owned(mut owned), exponent) => { + owned.push('e'); + owned.push_str(&exponent); + Cow::Owned(owned) + } + (before, exponent) => Cow::Owned(format!("{before}e{exponent}")), + }; + + (true, number) + } + [b'e' | b'E', b'+' | b'-', b'0'..=b'9', ..] => { + self.cursor.bump(); + let sign = self.cursor.bump().unwrap(); + let exponent = self.radix_run(Radix::Decimal, self.offset()); + + let number = match (before_exponent, exponent) { + (Cow::Borrowed(_), Cow::Borrowed(_)) => { + Cow::Borrowed(&self.source[self.token_range()]) + } + (Cow::Owned(mut owned), exponent) => { + owned.push('e'); + owned.push(sign); + owned.push_str(&exponent); + Cow::Owned(owned) + } + (before, exponent) => Cow::Owned(format!("{before}e{sign}{exponent}")), + }; + + (true, number) + } + _ => (is_float, before_exponent), + }; + + if is_float { + if self.cursor.eat_char('_') { + // TODO emit a diagnostic; + return Token::new(TokenKind::Bogus, self.cursor.token_len()); + } + + let n = match f64::from_str(&number) { + Ok(n) => n, + Err(_) => { + // TODO emit diagnostic + return Token::new(TokenKind::Bogus, self.cursor.token_len()); + } + }; + + // Parse trailing 'j': + let kind = if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { + TokenKind::Complex + } else { + TokenKind::Float + }; + + return Token::new(kind, self.cursor.token_len()) + .with_string_value(Cow::Owned(n.to_string())); + } + + // Parse trailing 'j': + if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { + // TODO what's the right way to represent these numbers + // TODO handle panic + let imag = f64::from_str(&number).unwrap(); + + Token::new(TokenKind::Complex, self.cursor.token_len()) + .with_string_value(Cow::Owned(imag.to_string())) + } else { + // TODO handle errors? + // leading zeros in decimal integer literals are not permitted + if start_is_zero && number != "0" { + // TODO emit diagnostic + // return Err(LexicalError { + // error: LexicalErrorType::OtherError("Invalid Token".to_owned()), + // location: self.get_pos(), + // }); + Token::new(TokenKind::Bogus, self.cursor.token_len()) + } else { + Token::new(TokenKind::Int, self.cursor.token_len()).with_string_value(number) + } + } + } + + /// Lex a string literal. + fn lex_string(&mut self, kind: StringKind, quote: char) -> Token<'source> { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), quote); + + let mut flags = kind.flags(); + + // If the next two characters are also the quote character, then we have a triple-quoted + // string; consume those two characters and ensure that we require a triple-quote to close + let triple_quoted = if self.cursor.first() == quote && self.cursor.second() == quote { + self.cursor.bump(); + self.cursor.bump(); + flags |= TokenFlags::TripleQuoted; + true + } else { + false + }; + + let value_start = self.offset(); + let mut value_end = None; + while let Some(c) = self.cursor.bump() { + match c { + // TODO remove escape character. + '\\' => { + if self.cursor.eat_char('\r') { + self.cursor.eat_char('\n'); + } else if self.cursor.eat_char('\n') { + // Line continuation + } else { + // TODO handle incorrect escape sequences + self.cursor.bump(); + } + } + + // TODO handle backslash at new line + // TODO validate escape sequences? + + // TODO handle line feed?
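+ // A backslash inside the literal consumes the next character (or a full \r\n, \r, or \n + // line break), so an escaped quote never reaches the closing-quote arm below and an + // escaped line break acts as a line continuation inside the string.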
+ // TODO normalize line breaks to newlines? + '\n' | '\r' if !triple_quoted => { + break; + } + c if c == quote => { + if triple_quoted { + if self.cursor.first() == quote && self.cursor.second() == quote { + self.cursor.bump(); + self.cursor.bump(); + value_end = Some(self.offset() - TextSize::new(3)); + break; + } + } else { + value_end = Some(self.offset() - TextSize::new(1)); + break; + } + } + _ => {} + } + } + + let value_end = match value_end { + Some(end) => end, + None => { + self.diagnostics.push(LexDiagnostic { + range: self.token_range(), + message: "Unterminated string literal".to_string(), + }); + + flags |= TokenFlags::Unterminated; + self.offset() + } + }; + + Token::new(TokenKind::String, self.cursor.token_len()) + .with_string_value(Cow::Borrowed( + &self.source[TextRange::new(value_start, value_end)], + )) + .with_flags(flags) + } +} + +// Checks if the character c is a valid continuation character as described +// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers +fn is_identifier_continuation(c: char) -> bool { + if c.is_ascii() { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9') + } else { + is_xid_continue(c) + } +} + +fn is_non_ascii_identifier_start(c: char) -> bool { + is_xid_start(c) +} + +#[derive(Copy, Clone, Debug)] +enum Radix { + Binary, + Octal, + Hex, + Decimal, +} + +impl Radix { + /// Test if a character is a digit in this radix. + fn is_digit(self, c: char) -> bool { + c.is_digit(self.as_u32()) + } + + const fn as_u32(self) -> u32 { + match self { + Radix::Binary => 2, + Radix::Octal => 8, + Radix::Hex => 16, + Radix::Decimal => 10, + } + } +} + +const fn is_quote(c: char) -> bool { + matches!(c, '\'' | '"') +} + +#[cfg(test)] +mod tests { + use insta::assert_debug_snapshot; + use itertools::Itertools; + + use super::*; + + const WINDOWS_EOL: &str = "\r\n"; + const MAC_EOL: &str = "\r"; + const UNIX_EOL: &str = "\n"; + + pub fn lex_source(source: &str) -> Vec<Token<'_>> { + let mut lexer = lex(source, Mode::Module); + let mut result = vec![]; + + loop { + let next = lexer.next_token(); + + let is_eof = next.kind == TokenKind::EndOfFile; + result.push(next); + + if is_eof { + break; + } + } + + result + } + + #[test] + fn comment() { + let source = r#"# Module comment + +# New line comment + + # Indented comment +"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn identifier() { + let source = r#"x +nonlocal +"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + // TODO special lexing for fstrings rather than doing in the parser, which feels odd.
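+ + // NOTE (editorial sketch, not part of the original change set): a plain assertion-based + // check complementing the snapshot tests above; it only assumes the `lex_source` helper + // and the `TokenKind` variants already used in this module. + #[test] + fn keyword_vs_identifier() { + let tokens = lex_source("match match_"); + + assert_eq!(tokens[0].kind, TokenKind::Match); + assert_eq!(tokens[2].kind, TokenKind::Identifier); + assert_eq!(tokens.last().unwrap().kind, TokenKind::EndOfFile); + }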
+ + #[test] + fn string() { + let source = r#""test" +'test' +"""test""" +'''test''' +r'raw' +u'unicode' +"""a multiline string +that continues here""" + +"An unterminated string +"Recovers here" +"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn binary() { + let source = r#"0b000_0001 +0B000_0000"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn octal() { + let source = r#"0o000_0075 +0O000_0012"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn hex() { + let source = r#"0x000_00a5 +0X000_1b12"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn float() { + let source = r#"0.0123 +.123 +0.123_345 +134344444.333 +1e45 +1E45 +1.3e3 +3.3E4"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn numbers() { + let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j"; + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + #[test] + fn assignment() { + let source = r"a_variable = 99 + 2-0"; + let tokens = lex_source(source); + assert_debug_snapshot!(tokens); + } + + macro_rules! test_indentation_with_eol { + ($($name:ident: $eol:expr,)*) => { + $( + #[test] + fn $name() { + let source = format!("def foo():{} return 99{}{}", $eol, $eol, $eol); + let tokens = lex_source(&source); + assert_debug_snapshot!(tokens); + } + )* + }; + } + + fn eol_test_case(eol: &str) -> String { + format!("def foo():{} return 99{}{}", eol, eol, eol) + } + + #[test] + fn windows_eol() { + let source = eol_test_case(WINDOWS_EOL); + + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn mac_eol() { + let source = eol_test_case(MAC_EOL); + + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn unix_eol() { + let source = eol_test_case(UNIX_EOL); + + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + fn create_double_dedent_with_eol(eol: &str) -> String { + format!("def foo():{eol} if x:{eol}{eol} return 99{eol}{eol}") + } + + #[test] + fn double_dedent_with_eol_windows() { + let source = create_double_dedent_with_eol(WINDOWS_EOL); + let tokens = lex_source(&source); + assert_debug_snapshot!(tokens); + } + + #[test] + fn double_dedent_with_eol_mac() { + let source = create_double_dedent_with_eol(MAC_EOL); + let tokens = lex_source(&source); + assert_debug_snapshot!(tokens); + } + + #[test] + fn double_dedent_with_eol_unix() { + let source = create_double_dedent_with_eol(UNIX_EOL); + let tokens = lex_source(&source); + assert_debug_snapshot!(tokens); + } + + fn create_double_dedent_with_tabs(eol: &str) -> String { + format!("def foo():{eol}\tif x:{eol}{eol}\t return 99{eol}{eol}") + } + + #[test] + fn double_dedent_with_tabs_windows() { + let source = create_double_dedent_with_tabs(WINDOWS_EOL); + let tokens = lex_source(&source); + assert_debug_snapshot!(tokens); + } + + #[test] + fn double_dedent_with_tabs_mac() { + let source = create_double_dedent_with_tabs(MAC_EOL); + let tokens = lex_source(&source); + assert_debug_snapshot!(tokens); + } + + #[test] + fn double_dedent_with_tabs_unix() { + let source = create_double_dedent_with_tabs(UNIX_EOL); + let tokens = lex_source(&source); + assert_debug_snapshot!(tokens); + } + + fn create_newline_in_brackets_code(eol: &str) -> String { + r"x = [ + + 1,2 + ,(3, + 4, + ), { + 5, + 6,\ + 7}] + " + .replace('\n', eol) + 
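// The fixture above is written with '\n' line endings; replacing them with `eol` lets + // the same bracketed source drive the windows/mac/unix variants below.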
} + + #[test] + fn newline_in_brackets_windows() { + let code = create_newline_in_brackets_code(WINDOWS_EOL); + let tokens = lex_source(&code); + assert_debug_snapshot!(tokens); + } + + #[test] + fn newline_in_brackets_mac() { + let code = create_newline_in_brackets_code(MAC_EOL); + let tokens = lex_source(&code); + assert_debug_snapshot!(tokens); + } + + #[test] + fn newline_in_brackets_unix() { + let code = create_newline_in_brackets_code(UNIX_EOL); + let tokens = lex_source(&code); + assert_debug_snapshot!(tokens); + } + + #[test] + fn test_non_logical_newline_in_string_continuation() { + let source = r"( + 'a' + 'b' + + 'c' \ + 'd' + )"; + let tokens = lex_source(source); + assert_debug_snapshot!(tokens); + } + + #[test] + fn logical_newline_line_comment() { + let source = "#Hello\n#World\n"; + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn operators() { + let source = "//////=/ /"; + let tokens = lex_source(source); + assert_debug_snapshot!(tokens); + } + + #[test] + fn string_single_line() { + let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#; + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + fn string_continuation_test_case(eol: &str) -> String { + format!( + "\"abc{}def\"", + eol.chars().map(|c| format!("\\{c}")).join("") + ) + } + + #[test] + fn string_continuation_windows() { + let source = string_continuation_test_case(WINDOWS_EOL); + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn string_continuation_mac() { + let source = string_continuation_test_case(MAC_EOL); + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn string_continuation_unix() { + let source = string_continuation_test_case(UNIX_EOL); + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn test_escape_unicode_name() { + let source = r#""\N{EN SPACE}""#; + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + // assert_eq!(tokens, vec![str_tok(r"\N{EN SPACE}"), TokenKind::Newline]) + } + + fn triple_quoted_test_source(eol: &str) -> String { + format!("\"\"\"{eol} test string{eol} \"\"\"") + } + + #[test] + fn triple_quoted_windows() { + let source = triple_quoted_test_source(WINDOWS_EOL); + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn triple_quoted_mac() { + let source = triple_quoted_test_source(MAC_EOL); + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn triple_quoted_unix() { + let source = triple_quoted_test_source(UNIX_EOL); + let tokens = lex_source(&source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn while_else() { + let source = r#" +while a: + break +else: + continue"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn match_statement() { + let source = r#" +# case test_patma_098 +match x: + case -0j: + y = 0 +# case test_patma_142 +y"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn type_with_continuation() { + let source = r#"# multine definitions +type \ + X = int"#; + + let tokens = lex_source(source); + + assert_debug_snapshot!(tokens); + } + + #[test] + fn regression_1() { + let source = r#" +def __new__(cls): + # ensure that only one instance exists + if not cls.__instance: + cls.__instance = super().__new__(cls) + return cls.__instance"#; + + let tokens = 
lex_source(source); + + assert_debug_snapshot!(tokens); + } +} diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__assignment.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__assignment.snap new file mode 100644 index 00000000..ba825ca0 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__assignment.snap @@ -0,0 +1,118 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Identifier, + length: 10, + value: String( + "a_variable", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Equal, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Plus, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "2", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Minus, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "0", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__binary.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__binary.snap new file mode 100644 index 00000000..658b3c4b --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__binary.snap @@ -0,0 +1,50 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Int, + length: 10, + value: BigInt( + 1, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 10, + value: BigInt( + 0, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__comment.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__comment.snap new file mode 100644 index 00000000..0c0d6e90 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__comment.snap @@ -0,0 +1,92 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Comment, + length: 16, + value: String( + "# Module comment", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comment, + length: 18, + value: String( + "# New line comment", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + 
value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comment, + length: 18, + value: String( + "# Indented comment", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_mac.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_mac.snap new file mode 100644 index 00000000..fe4b222a --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_mac.snap @@ -0,0 +1,196 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: If, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_unix.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_unix.snap new file mode 100644 index 00000000..fe4b222a --- /dev/null +++ 
b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_unix.snap @@ -0,0 +1,196 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: If, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_windows.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_windows.snap new file mode 100644 index 00000000..d1f23360 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_eol_windows.snap @@ -0,0 +1,196 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 
1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: If, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_mac.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_mac.snap new file mode 100644 index 00000000..fe4b222a --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_mac.snap @@ -0,0 +1,196 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: If, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: 
TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_unix.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_unix.snap new file mode 100644 index 00000000..fe4b222a --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_unix.snap @@ -0,0 +1,196 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: If, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_windows.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_windows.snap new file mode 100644 index 00000000..d1f23360 --- 
/dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__double_dedent_with_tabs_windows.snap @@ -0,0 +1,196 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: If, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__escape_unicode_name.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__escape_unicode_name.snap new file mode 100644 index 00000000..89c709f2 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__escape_unicode_name.snap @@ -0,0 +1,32 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 14, + value: String( + "\\N{EN SPACE}", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__float.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__float.snap new file mode 100644 index 00000000..06fcb43b --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__float.snap @@ -0,0 +1,158 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + 
Token { + kind: Float, + length: 6, + value: String( + "0.0123", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 4, + value: String( + "0.123", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 9, + value: String( + "0.123345", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 13, + value: String( + "134344444.333", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 4, + value: String( + "1000000000000000000000000000000000000000000000", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 4, + value: String( + "1000000000000000000000000000000000000000000000", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 5, + value: String( + "1300", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 5, + value: String( + "33000", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__hex.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__hex.snap new file mode 100644 index 00000000..b57e3c74 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__hex.snap @@ -0,0 +1,50 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Int, + length: 10, + value: BigInt( + 165, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 10, + value: BigInt( + 6930, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__identifier.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__identifier.snap new file mode 100644 index 00000000..5d0e2fb0 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__identifier.snap @@ -0,0 +1,48 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Nonlocal, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + 
flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__logical_newline_line_comment.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__logical_newline_line_comment.snap new file mode 100644 index 00000000..790776c7 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__logical_newline_line_comment.snap @@ -0,0 +1,50 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Comment, + length: 6, + value: String( + "#Hello", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comment, + length: 6, + value: String( + "#World", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__mac_eol.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__mac_eol.snap new file mode 100644 index 00000000..6be29d79 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__mac_eol.snap @@ -0,0 +1,130 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__match_statement.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__match_statement.snap new file mode 100644 index 00000000..588b5b54 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__match_statement.snap @@ -0,0 +1,252 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comment, + length: 21, + value: String( + "# case test_patma_098", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: 
NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Match, + length: 5, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Case, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Minus, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Complex, + length: 2, + value: String( + "0", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "y", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Equal, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "0", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comment, + length: 21, + value: String( + "# case test_patma_142", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "y", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_mac.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_mac.snap new file mode 100644 index 00000000..e9359032 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_mac.snap @@ -0,0 +1,366 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Equal, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lsqb, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), 
+ }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "1", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "2", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "3", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "4", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lbrace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "5", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "6", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: LineContinuation, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "7", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rbrace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rsqb, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: 
None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_unix.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_unix.snap new file mode 100644 index 00000000..e9359032 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_unix.snap @@ -0,0 +1,366 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Equal, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lsqb, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "1", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "2", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "3", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "4", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lbrace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "5", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, 
+ value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "6", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: LineContinuation, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "7", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rbrace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rsqb, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_windows.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_windows.snap new file mode 100644 index 00000000..7493c660 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__newline_in_brackets_windows.snap @@ -0,0 +1,366 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Identifier, + length: 1, + value: String( + "x", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Equal, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lsqb, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "1", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "2", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "3", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: 
Int, + length: 1, + value: String( + "4", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lbrace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "5", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "6", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comma, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: LineContinuation, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "7", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rbrace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rsqb, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__non_logical_newline_in_string_continuation.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__non_logical_newline_in_string_continuation.snap new file mode 100644 index 00000000..f68c1742 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__non_logical_newline_in_string_continuation.snap @@ -0,0 +1,174 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 3, + value: String( + "a", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 3, + value: String( + "b", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + 
length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 3, + value: String( + "c", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: LineContinuation, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 3, + value: String( + "d", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__numbers.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__numbers.snap new file mode 100644 index 00000000..7e26a5b9 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__numbers.snap @@ -0,0 +1,212 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Int, + length: 4, + value: BigInt( + 47, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 4, + value: BigInt( + 10, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 6, + value: BigInt( + 13, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 1, + value: String( + "0", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 3, + value: String( + "123", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 13, + value: String( + "1234567890", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 3, + value: String( + "0.2", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 4, + value: String( + "100", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Float, + length: 5, + value: String( + "2100", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Complex, + length: 2, + value: String( + "2", + ), + flags: 
TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Complex, + length: 4, + value: String( + "2.2", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__octal.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__octal.snap new file mode 100644 index 00000000..3010f6dd --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__octal.snap @@ -0,0 +1,50 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Int, + length: 10, + value: BigInt( + 61, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 10, + value: BigInt( + 10, + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__operators.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__operators.snap new file mode 100644 index 00000000..948e2768 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__operators.snap @@ -0,0 +1,70 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: DoubleSlash, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: DoubleSlash, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: DoubleSlashEqual, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Slash, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Slash, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__regression_1.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__regression_1.snap new file mode 100644 index 00000000..bcc99138 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__regression_1.snap @@ -0,0 +1,414 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 7, + value: String( + "__new__", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "cls", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + 
value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Comment, + length: 38, + value: String( + "# ensure that only one instance exists", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: If, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Not, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "cls", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dot, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 10, + value: String( + "__instance", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "cls", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dot, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 10, + value: String( + "__instance", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Equal, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 5, + value: String( + "super", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dot, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 7, + value: String( + "__new__", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "cls", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: 
String( + "cls", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dot, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 10, + value: String( + "__instance", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string.snap new file mode 100644 index 00000000..8c940ef0 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string.snap @@ -0,0 +1,176 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 6, + value: String( + "test", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 6, + value: String( + "test", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 10, + value: String( + "test", + ), + flags: TokenFlags( + TripleQuoted, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 10, + value: String( + "test", + ), + flags: TokenFlags( + TripleQuoted, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 6, + value: String( + "raw", + ), + flags: TokenFlags( + Raw, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 10, + value: String( + "unicode", + ), + flags: TokenFlags( + Unicode, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 44, + value: String( + "a multiline string\nthat continues here", + ), + flags: TokenFlags( + TripleQuoted, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 24, + value: String( + "An unterminated string\n", + ), + flags: TokenFlags( + Unterminated, + ), + }, + Token { + kind: String, + length: 15, + value: String( + "Recovers here", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_mac.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_mac.snap new file mode 100644 index 00000000..111c8201 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_mac.snap @@ -0,0 +1,32 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 10, + value: String( + "abc\\\rdef", + ), + flags: TokenFlags( + 0x0, + ), 
+ }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_unix.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_unix.snap new file mode 100644 index 00000000..585206c6 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_unix.snap @@ -0,0 +1,32 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 10, + value: String( + "abc\\\ndef", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_windows.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_windows.snap new file mode 100644 index 00000000..92784e32 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_continuation_windows.snap @@ -0,0 +1,32 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 12, + value: String( + "abc\\\r\\\ndef", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_single_line.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_single_line.snap new file mode 100644 index 00000000..4980ed40 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__string_single_line.snap @@ -0,0 +1,176 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 8, + value: String( + "double", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 8, + value: String( + "single", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 8, + value: String( + "can\\'t", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 6, + value: String( + "\\\\\\\"", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 8, + value: String( + "\\t\\r\\n", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 4, + value: String( + "\\g", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 8, + value: String( + "raw\\'", + ), + flags: TokenFlags( + Raw, + ), + }, + Token { + kind: 
Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 6, + value: String( + "\\420", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: String, + length: 9, + value: String( + "\\200\\0a", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_mac.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_mac.snap new file mode 100644 index 00000000..5e90cb34 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_mac.snap @@ -0,0 +1,32 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 21, + value: String( + "\r test string\r ", + ), + flags: TokenFlags( + TripleQuoted, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_unix.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_unix.snap new file mode 100644 index 00000000..e817669d --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_unix.snap @@ -0,0 +1,32 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 21, + value: String( + "\n test string\n ", + ), + flags: TokenFlags( + TripleQuoted, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_windows.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_windows.snap new file mode 100644 index 00000000..3b9684bd --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__triple_quoted_windows.snap @@ -0,0 +1,32 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: String, + length: 23, + value: String( + "\r\n test string\r\n ", + ), + flags: TokenFlags( + TripleQuoted, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__type_with_continuation.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__type_with_continuation.snap new file mode 100644 index 00000000..9df85e49 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__type_with_continuation.snap @@ -0,0 +1,116 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Comment, + length: 21, + value: String( + "# multine definitions", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), 
+ }, + Token { + kind: Type, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: LineContinuation, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "X", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Equal, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "int", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__unix_eol.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__unix_eol.snap new file mode 100644 index 00000000..6be29d79 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__unix_eol.snap @@ -0,0 +1,130 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__while_else.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__while_else.snap new file mode 100644 index 00000000..923741b0 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__while_else.snap @@ -0,0 +1,152 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: NonLogicalNewline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: While, + length: 5, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: 
Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 1, + value: String( + "a", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Break, + length: 5, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Else, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 4, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Continue, + length: 8, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__windows_eol.snap b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__windows_eol.snap new file mode 100644 index 00000000..d6ee6de4 --- /dev/null +++ b/parser/src/lexer_v2/snapshots/rustpython_parser__lexer_v2__tests__windows_eol.snap @@ -0,0 +1,130 @@ +--- +source: parser/src/lexer_v2/mod.rs +expression: tokens +--- +[ + Token { + kind: Def, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Identifier, + length: 3, + value: String( + "foo", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Lpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Rpar, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Colon, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Indent, + length: 3, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Return, + length: 6, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Whitespace, + length: 1, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Int, + length: 2, + value: String( + "99", + ), + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Newline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: NonLogicalNewline, + length: 2, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: Dedent, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, + Token { + kind: EndOfFile, + length: 0, + value: None, + flags: TokenFlags( + 0x0, + ), + }, +] diff --git a/parser/src/lexer_v2/token.rs b/parser/src/lexer_v2/token.rs new file mode 100644 index 00000000..f21f2638 --- /dev/null +++ b/parser/src/lexer_v2/token.rs @@ -0,0 +1,499 @@ +//! 
Token type for Python source code created by the lexer and consumed by the parser. +//! +//! This module defines the tokens that the lexer recognizes. The tokens are +//! loosely based on the token definitions found in the [CPython source]. +//! +//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h +use crate::text_size::TextSize; +use bitflags::bitflags; +use num_bigint::BigInt; +use std::borrow::Cow; +use std::error::Error; +use std::fmt; +use std::fmt::{Debug, Display, Formatter}; + +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Token<'source> { + pub kind: TokenKind, + pub length: TextSize, + pub value: TokenValue<'source>, + pub flags: TokenFlags, +} + +#[derive(Debug, Clone, Eq, PartialEq, Default)] +pub enum TokenValue<'source> { + #[default] + None, + String(Cow<'source, str>), + BigInt(BigInt), +} + +impl<'source> TokenValue<'source> { + pub(crate) fn unwrap_into_string(self) -> String { + match self { + TokenValue::String(value) => value.into_owned(), + TokenValue::None | TokenValue::BigInt(_) => panic!("Expected string"), + } + } + + pub(crate) fn unwrap_into_bigint(self) -> BigInt { + match self { + TokenValue::BigInt(value) => value, + TokenValue::String(_) | TokenValue::None => { + panic!("Expected bigint") + } + } + } +} + +impl<'source> Token<'source> { + pub const fn new(kind: TokenKind, length: TextSize) -> Self { + Self { + kind, + length, + flags: TokenFlags::empty(), + value: TokenValue::None, + } + } + + pub const fn eof() -> Self { + Self::new(TokenKind::EndOfFile, TextSize::new(0)) + } + + pub fn with_string_value(mut self, value: Cow<'source, str>) -> Self { + self.value = TokenValue::String(value); + self + } + + pub fn with_bigint_value(mut self, value: BigInt) -> Self { + self.value = TokenValue::BigInt(value); + self + } + + pub fn with_flags(mut self, flags: TokenFlags) -> Self { + self.flags = flags; + self + } +} + +bitflags! { + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub struct TokenFlags: u8 { + const Unterminated = 0x0000_0001; + + // TODO(micha): Consider storing the parsed Text for all Literals and then re-parsing the + // triple-quoted, fstring, bytes (and values for numbers) on demand. Can avoid many heap allocations + // if we use stack allocated strings / imstr. + // Strings + const TripleQuoted = 0b0000_0010; + + /// A f-string literal, with a `f` or `F` prefix. + const FString = 0b0000_0100; + + /// A byte string literal, with a `b` or `B` prefix. + const Bytes = 0b0000_1000; + + /// A raw string literal, with a `r` or `R` prefix. + const Raw = 0b0001_0000; + + /// A unicode string literal, with a `u` or `U` prefix. + const Unicode = 0b1000_0000; + } +} + +impl TokenFlags { + pub fn as_string_kind(self) -> crate::StringKind { + if self.contains(TokenFlags::Bytes) { + if self.contains(TokenFlags::Raw) { + crate::StringKind::RawBytes + } else { + crate::StringKind::Bytes + } + } else if self.contains(TokenFlags::FString) { + if self.contains(TokenFlags::Raw) { + crate::StringKind::RawFString + } else { + crate::StringKind::FString + } + } else if self.contains(TokenFlags::Unicode) { + crate::StringKind::Unicode + } else if self.contains(TokenFlags::Raw) { + crate::StringKind::RawString + } else { + crate::StringKind::String + } + } +} + +/// The set of tokens the Python source code can be tokenized in. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)] +#[repr(u8)] +pub enum TokenKind { + /** Literals **/ + /// Token value for an integer. 
+ Int, + /// Token value for a floating point number. + Float, + /// Token value for a complex number. + Complex, + /// Token value for a string. + String, + + /// Token value for a name, commonly known as an identifier. + Identifier, + + /** Trivia */ + /// Token value for a comment. These are filtered out of the token stream prior to parsing. + Comment, + /// Token value for a newline that is not a logical line break. These are filtered out of + /// the token stream prior to parsing. + NonLogicalNewline, + + LineContinuation, + + EndOfFile, + + Whitespace, + + /* Semantic Whitespace */ + /// Token value for a newline. + Newline, + + /// Token value for an indent. + Indent, + /// Token value for a dedent. + Dedent, + + /* Punctuation */ + /// Token value for a left parenthesis `(`. + Lpar, + /// Token value for a right parenthesis `)`. + Rpar, + /// Token value for a left square bracket `[`. + Lsqb, + /// Token value for a right square bracket `]`. + Rsqb, + /// Token value for a colon `:`. + Colon, + /// Token value for a comma `,`. + Comma, + /// Token value for a semicolon `;`. + Semi, + /// Token value for plus `+`. + Plus, + /// Token value for minus `-`. + Minus, + /// Token value for star `*`. + Star, + /// Token value for slash `/`. + Slash, + /// Token value for vertical bar `|`. + Vbar, + /// Token value for ampersand `&`. + Amper, + /// Token value for less than `<`. + Less, + /// Token value for greater than `>`. + Greater, + /// Token value for equal `=`. + Equal, + /// Token value for dot `.`. + Dot, + /// Token value for percent `%`. + Percent, + /// Token value for left bracket `{`. + Lbrace, + /// Token value for right bracket `}`. + Rbrace, + /// Token value for double equal `==`. + EqEqual, + /// Token value for not equal `!=`. + NotEqual, + /// Token value for less than or equal `<=`. + LessEqual, + /// Token value for greater than or equal `>=`. + GreaterEqual, + /// Token value for tilde `~`. + Tilde, + /// Token value for caret `^`. + CircumFlex, + /// Token value for left shift `<<`. + LeftShift, + /// Token value for right shift `>>`. + RightShift, + /// Token value for double star `**`. + DoubleStar, + /// Token value for double star equal `**=`. + DoubleStarEqual, + /// Token value for plus equal `+=`. + PlusEqual, + /// Token value for minus equal `-=`. + MinusEqual, + /// Token value for star equal `*=`. + StarEqual, + /// Token value for slash equal `/=`. + SlashEqual, + /// Token value for percent equal `%=`. + PercentEqual, + /// Token value for ampersand equal `&=`. + AmperEqual, + /// Token value for vertical bar equal `|=`. + VbarEqual, + /// Token value for caret equal `^=`. + CircumflexEqual, + /// Token value for left shift equal `<<=`. + LeftShiftEqual, + /// Token value for right shift equal `>>=`. + RightShiftEqual, + /// Token value for double slash `//`. + DoubleSlash, + /// Token value for double slash equal `//=`. + DoubleSlashEqual, + /// Token value for colon equal `:=`. + ColonEqual, + /// Token value for at `@`. + At, + /// Token value for at equal `@=`. + AtEqual, + /// Token value for arrow `->`. + Rarrow, + /// Token value for ellipsis `...`. + Ellipsis, + + // Self documenting. 
+ // Keywords: + False, + None, + True, + + And, + As, + Assert, + Async, + Await, + Break, + Class, + Continue, + Def, + Del, + Elif, + Else, + Except, + Finally, + For, + From, + Global, + If, + Import, + In, + Is, + Lambda, + Nonlocal, + Not, + Or, + Pass, + Raise, + Return, + Try, + While, + With, + Yield, + + // Contextual keywords + Match, + Case, + Type, + + // Ruff specific tokens + Bogus, +} + +impl TokenKind { + #[inline(always)] + pub fn is_keyword(self) -> bool { + self >= TokenKind::False && self <= TokenKind::Case + } + + #[inline(always)] + pub fn is_contextual_keyword(self) -> bool { + self >= TokenKind::Match && self <= TokenKind::Case + } + + #[inline(always)] + pub fn is_non_contextual_keyword(self) -> bool { + self.is_keyword() && !self.is_contextual_keyword() + } + + #[inline(always)] + pub fn is_punctuation(self) -> bool { + self >= TokenKind::Lpar && self <= TokenKind::Ellipsis + } + + #[inline(always)] + pub fn is_literal(self) -> bool { + matches!( + self, + TokenKind::Int | TokenKind::Float | TokenKind::Complex | TokenKind::String + ) + } + + #[inline(always)] + pub const fn is_trivia(self) -> bool { + matches!( + self, + TokenKind::Comment + | TokenKind::Whitespace + | TokenKind::NonLogicalNewline + | TokenKind::EndOfFile + ) + } +} + +/// The kind of string literal as described in the [String and Bytes literals] +/// section of the Python reference. +/// +/// [String and Bytes literals]: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals +#[derive(PartialEq, Eq, Debug, Clone, Hash, Copy)] // TODO: is_macro::Is +pub enum StringKind { + /// A normal string literal with no prefix. + String, + /// A f-string literal, with a `f` or `F` prefix. + FString, + /// A byte string literal, with a `b` or `B` prefix. + Bytes, + /// A raw string literal, with a `r` or `R` prefix. + RawString, + /// A raw f-string literal, with a `rf`/`fr` or `rF`/`Fr` or `Rf`/`fR` or `RF`/`FR` prefix. + RawFString, + /// A raw byte string literal, with a `rb`/`br` or `rB`/`Br` or `Rb`/`bR` or `RB`/`BR` prefix. + RawBytes, + /// A unicode string literal, with a `u` or `U` prefix. 
+ Unicode, +} + +impl TryFrom<char> for StringKind { + type Error = UnexpectedStringPrefixError; + + fn try_from(ch: char) -> Result<Self, Self::Error> { + match ch { + 'r' | 'R' => Ok(StringKind::RawString), + 'f' | 'F' => Ok(StringKind::FString), + 'u' | 'U' => Ok(StringKind::Unicode), + 'b' | 'B' => Ok(StringKind::Bytes), + c => Err(UnexpectedStringPrefixError { + first: c, + second: None, + }), + } + } +} + +impl TryFrom<[char; 2]> for StringKind { + type Error = UnexpectedStringPrefixError; + + fn try_from(chars: [char; 2]) -> Result<Self, Self::Error> { + match chars { + ['r' | 'R', 'f' | 'F'] => Ok(StringKind::RawFString), + ['f' | 'F', 'r' | 'R'] => Ok(StringKind::RawFString), + ['r' | 'R', 'b' | 'B'] => Ok(StringKind::RawBytes), + ['b' | 'B', 'r' | 'R'] => Ok(StringKind::RawBytes), + [c1, c2] => Err(UnexpectedStringPrefixError { + first: c1, + second: Some(c2), + }), + } + } +} + +pub struct UnexpectedStringPrefixError { + first: char, + second: Option<char>, +} + +impl Display for UnexpectedStringPrefixError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if let Some(second) = self.second { + write!(f, "Unexpected string prefix: {}{second}", self.first) + } else { + write!(f, "Unexpected string prefix: {}", self.first) + } + } +} + +impl Debug for UnexpectedStringPrefixError { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + std::fmt::Display::fmt(self, f) + } +} + +impl Error for UnexpectedStringPrefixError {} + +impl fmt::Display for StringKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use StringKind::*; + match self { + String => f.write_str(""), + FString => f.write_str("f"), + Bytes => f.write_str("b"), + RawString => f.write_str("r"), + RawFString => f.write_str("rf"), + RawBytes => f.write_str("rb"), + Unicode => f.write_str("u"), + } + } +} + +impl StringKind { + /// Returns true if the string is a raw string, i.e. one of + /// [`StringKind::RawString`] or [`StringKind::RawFString`] or [`StringKind::RawBytes`]. + pub fn is_raw(&self) -> bool { + use StringKind::{RawBytes, RawFString, RawString}; + matches!(self, RawString | RawFString | RawBytes) + } + + /// Returns true if the string is an f-string, i.e. one of + /// [`StringKind::FString`] or [`StringKind::RawFString`]. + pub fn is_any_fstring(&self) -> bool { + use StringKind::{FString, RawFString}; + matches!(self, FString | RawFString) + } + + /// Returns true if the string is a byte string, i.e. one of + /// [`StringKind::Bytes`] or [`StringKind::RawBytes`]. + pub fn is_any_bytes(&self) -> bool { + use StringKind::{Bytes, RawBytes}; + matches!(self, Bytes | RawBytes) + } + + /// Returns true if the string is a unicode string, i.e. [`StringKind::Unicode`]. + pub fn is_unicode(&self) -> bool { + matches!(self, StringKind::Unicode) + } + + /// Returns the number of characters in the prefix.
+ pub fn prefix_len(&self) -> TextSize { + use StringKind::*; + let len = match self { + String => 0, + RawString | FString | Unicode | Bytes => 1, + RawFString | RawBytes => 2, + }; + len.into() + } + + pub(crate) fn flags(&self) -> TokenFlags { + match self { + StringKind::String => TokenFlags::empty(), + StringKind::FString => TokenFlags::FString, + StringKind::Bytes => TokenFlags::Bytes, + StringKind::RawString => TokenFlags::Raw, + StringKind::RawFString => TokenFlags::Raw | TokenFlags::FString, + StringKind::RawBytes => TokenFlags::Raw | TokenFlags::Bytes, + StringKind::Unicode => TokenFlags::Unicode, + } + } +} + +static_assertions::assert_eq_size!(TokenKind, [u8; 1]); +static_assertions::assert_eq_size!(Cow<'_, str>, [u8; 24]); +static_assertions::assert_eq_size!(Token, [u8; 40]); diff --git a/parser/src/lib.rs b/parser/src/lib.rs index 5b56e54a..ea41fea9 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -119,6 +119,7 @@ mod function; // Skip flattening lexer to distinguish from full parser mod context; pub mod lexer; +mod lexer_v2; mod parser; mod soft_keywords; mod string; diff --git a/parser/src/parser.rs b/parser/src/parser.rs index aed39489..5f1f5a22 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -15,7 +15,7 @@ use crate::{ ast::{self, Ranged}, lexer::{self, LexResult, LexicalError, LexicalErrorType}, - python, + lexer_v2, python, text_size::TextSize, token::Tok, Mode, @@ -23,7 +23,8 @@ use crate::{ use itertools::Itertools; use std::iter; -use crate::{lexer::Lexer, soft_keywords::SoftKeywordTransformer}; +use crate::lexer_v2::compat_adapter::CompatAdapter; +use crate::{soft_keywords::SoftKeywordTransformer}; pub(super) use lalrpop_util::ParseError as LalrpopError; /// Parse Python code string to implementor's type. @@ -68,15 +69,11 @@ where offset: TextSize, ) -> Result { let lxr = Self::lex_starts_at(source, offset); - #[cfg(feature = "full-lexer")] let lxr = lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. 
} | Tok::NonLogicalNewline)); Self::parse_tokens(lxr, source_path) } - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer>; + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer; fn parse_tokens( lxr: impl IntoIterator, source_path: &str, @@ -84,11 +81,11 @@ where } impl Parse for ast::ModModule { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - lexer::lex_starts_at(source, Mode::Module, offset) + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { + SoftKeywordTransformer::new( + CompatAdapter::new(offset, lexer_v2::Lexer::new(source, offset)), + Mode::Module, + ) } fn parse_tokens( lxr: impl IntoIterator, @@ -102,11 +99,11 @@ impl Parse for ast::ModModule { } impl Parse for ast::ModExpression { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - lexer::lex_starts_at(source, Mode::Expression, offset) + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { + SoftKeywordTransformer::new( + CompatAdapter::new(offset, lexer_v2::Lexer::new(source, offset)), + Mode::Expression, + ) } fn parse_tokens( lxr: impl IntoIterator, @@ -120,11 +117,11 @@ impl Parse for ast::ModExpression { } impl Parse for ast::ModInteractive { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { - lexer::lex_starts_at(source, Mode::Interactive, offset) + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { + SoftKeywordTransformer::new( + CompatAdapter::new(offset, lexer_v2::Lexer::new(source, offset)), + Mode::Interactive, + ) } fn parse_tokens( lxr: impl IntoIterator, @@ -138,10 +135,7 @@ impl Parse for ast::ModInteractive { } impl Parse for ast::Suite { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::ModModule::lex_starts_at(source, offset) } fn parse_tokens( @@ -153,10 +147,7 @@ impl Parse for ast::Suite { } impl Parse for ast::Stmt { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::ModModule::lex_starts_at(source, offset) } fn parse_tokens( @@ -186,10 +177,7 @@ impl Parse for ast::Stmt { } impl Parse for ast::Expr { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::ModExpression::lex_starts_at(source, offset) } fn parse_tokens( @@ -201,10 +189,7 @@ impl Parse for ast::Expr { } impl Parse for ast::Identifier { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -227,10 +212,7 @@ impl Parse for ast::Identifier { } impl Parse for ast::Constant { - fn lex_starts_at( - source: &str, - offset: TextSize, - ) -> SoftKeywordTransformer> { + fn lex_starts_at(source: &str, offset: TextSize) -> SoftKeywordTransformer { ast::Expr::lex_starts_at(source, offset) } fn parse_tokens( @@ -429,7 +411,6 @@ pub fn parse_tokens( source_path: &str, ) -> Result { let lxr = lxr.into_iter(); - #[cfg(feature = "full-lexer")] let lxr = lxr.filter_ok(|(tok, _)| !matches!(tok, Tok::Comment { .. 
} | Tok::NonLogicalNewline)); if mode == Mode::Jupyter { diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap deleted file mode 100644 index c48429b1..00000000 --- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_class_generic_types.snap +++ /dev/null @@ -1,375 +0,0 @@ ---- -source: parser/src/parser.rs -expression: "ast::Suite::parse(source, \"\").unwrap()" ---- -[ - ClassDef( - StmtClassDef { - range: 10..29, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Expr( - StmtExpr { - range: 26..29, - value: Constant( - ExprConstant { - range: 26..29, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 20..21, - name: Identifier( - "T", - ), - bound: None, - }, - ), - ], - }, - ), - ClassDef( - StmtClassDef { - range: 52..76, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Expr( - StmtExpr { - range: 73..76, - value: Constant( - ExprConstant { - range: 73..76, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 62..68, - name: Identifier( - "T", - ), - bound: Some( - Name( - ExprName { - range: 65..68, - id: Identifier( - "str", - ), - ctx: Load, - }, - ), - ), - }, - ), - ], - }, - ), - ClassDef( - StmtClassDef { - range: 105..138, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Expr( - StmtExpr { - range: 135..138, - value: Constant( - ExprConstant { - range: 135..138, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 115..130, - name: Identifier( - "T", - ), - bound: Some( - Tuple( - ExprTuple { - range: 118..130, - elts: [ - Name( - ExprName { - range: 119..122, - id: Identifier( - "str", - ), - ctx: Load, - }, - ), - Name( - ExprName { - range: 124..129, - id: Identifier( - "bytes", - ), - ctx: Load, - }, - ), - ], - ctx: Load, - }, - ), - ), - }, - ), - ], - }, - ), - ClassDef( - StmtClassDef { - range: 159..181, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Expr( - StmtExpr { - range: 178..181, - value: Constant( - ExprConstant { - range: 178..181, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 169..170, - name: Identifier( - "T", - ), - bound: None, - }, - ), - TypeVar( - TypeParamTypeVar { - range: 172..173, - name: Identifier( - "U", - ), - bound: None, - }, - ), - ], - }, - ), - ClassDef( - StmtClassDef { - range: 200..223, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Expr( - StmtExpr { - range: 220..223, - value: Constant( - ExprConstant { - range: 220..223, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 210..211, - name: Identifier( - "T", - ), - bound: None, - }, - ), - TypeVar( - TypeParamTypeVar { - range: 213..214, - name: Identifier( - "U", - ), - bound: None, - }, - ), - ], - }, - ), - ClassDef( - StmtClassDef { - range: 240..261, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Expr( - StmtExpr { - range: 258..261, - value: Constant( - ExprConstant { - range: 258..261, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - 
decorator_list: [], - type_params: [ - TypeVarTuple( - TypeParamTypeVarTuple { - range: 250..253, - name: Identifier( - "Ts", - ), - }, - ), - ], - }, - ), - ClassDef( - StmtClassDef { - range: 275..296, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Expr( - StmtExpr { - range: 293..296, - value: Constant( - ExprConstant { - range: 293..296, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - type_params: [ - ParamSpec( - TypeParamParamSpec { - range: 285..288, - name: Identifier( - "P", - ), - }, - ), - ], - }, - ), - ClassDef( - StmtClassDef { - range: 312..351, - name: Identifier( - "Foo", - ), - bases: [], - keywords: [], - body: [ - Pass( - StmtPass { - range: 347..351, - }, - ), - ], - decorator_list: [], - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 322..323, - name: Identifier( - "X", - ), - bound: None, - }, - ), - TypeVar( - TypeParamTypeVar { - range: 325..331, - name: Identifier( - "Y", - ), - bound: Some( - Name( - ExprName { - range: 328..331, - id: Identifier( - "str", - ), - ctx: Load, - }, - ), - ), - }, - ), - TypeVarTuple( - TypeParamTypeVarTuple { - range: 333..335, - name: Identifier( - "U", - ), - }, - ), - ParamSpec( - TypeParamParamSpec { - range: 337..340, - name: Identifier( - "P", - ), - }, - ), - ], - }, - ), -] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap b/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap deleted file mode 100644 index 2d65a64e..00000000 --- a/parser/src/snapshots/rustpython_parser__parser__tests__parse_function_definition.snap +++ /dev/null @@ -1,560 +0,0 @@ ---- -source: parser/src/parser.rs -expression: "ast::Suite::parse(source, \"\").unwrap()" ---- -[ - FunctionDef( - StmtFunctionDef { - range: 0..20, - name: Identifier( - "func", - ), - args: Arguments { - range: 9..10, - posonlyargs: [], - args: [ - ArgWithDefault { - range: 9..10, - def: Arg { - range: 9..10, - arg: Identifier( - "a", - ), - annotation: None, - type_comment: None, - }, - default: None, - }, - ], - vararg: None, - kwonlyargs: [], - kwarg: None, - }, - body: [ - Expr( - StmtExpr { - range: 17..20, - value: Constant( - ExprConstant { - range: 17..20, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - returns: None, - type_comment: None, - type_params: [], - }, - ), - FunctionDef( - StmtFunctionDef { - range: 22..53, - name: Identifier( - "func", - ), - args: Arguments { - range: 34..38, - posonlyargs: [], - args: [ - ArgWithDefault { - range: 34..38, - def: Arg { - range: 34..38, - arg: Identifier( - "a", - ), - annotation: Some( - Name( - ExprName { - range: 37..38, - id: Identifier( - "T", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - }, - default: None, - }, - ], - vararg: None, - kwonlyargs: [], - kwarg: None, - }, - body: [ - Expr( - StmtExpr { - range: 50..53, - value: Constant( - ExprConstant { - range: 50..53, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - returns: Some( - Name( - ExprName { - range: 43..44, - id: Identifier( - "T", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 31..32, - name: Identifier( - "T", - ), - bound: None, - }, - ), - ], - }, - ), - FunctionDef( - StmtFunctionDef { - range: 55..91, - name: Identifier( - "func", - ), - args: Arguments { - range: 72..76, - posonlyargs: [], - args: [ - ArgWithDefault { - range: 72..76, - def: Arg { - range: 
72..76, - arg: Identifier( - "a", - ), - annotation: Some( - Name( - ExprName { - range: 75..76, - id: Identifier( - "T", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - }, - default: None, - }, - ], - vararg: None, - kwonlyargs: [], - kwarg: None, - }, - body: [ - Expr( - StmtExpr { - range: 88..91, - value: Constant( - ExprConstant { - range: 88..91, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - returns: Some( - Name( - ExprName { - range: 81..82, - id: Identifier( - "T", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 64..70, - name: Identifier( - "T", - ), - bound: Some( - Name( - ExprName { - range: 67..70, - id: Identifier( - "str", - ), - ctx: Load, - }, - ), - ), - }, - ), - ], - }, - ), - FunctionDef( - StmtFunctionDef { - range: 93..138, - name: Identifier( - "func", - ), - args: Arguments { - range: 119..123, - posonlyargs: [], - args: [ - ArgWithDefault { - range: 119..123, - def: Arg { - range: 119..123, - arg: Identifier( - "a", - ), - annotation: Some( - Name( - ExprName { - range: 122..123, - id: Identifier( - "T", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - }, - default: None, - }, - ], - vararg: None, - kwonlyargs: [], - kwarg: None, - }, - body: [ - Expr( - StmtExpr { - range: 135..138, - value: Constant( - ExprConstant { - range: 135..138, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - returns: Some( - Name( - ExprName { - range: 128..129, - id: Identifier( - "T", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 102..117, - name: Identifier( - "T", - ), - bound: Some( - Tuple( - ExprTuple { - range: 105..117, - elts: [ - Name( - ExprName { - range: 106..109, - id: Identifier( - "str", - ), - ctx: Load, - }, - ), - Name( - ExprName { - range: 111..116, - id: Identifier( - "bytes", - ), - ctx: Load, - }, - ), - ], - ctx: Load, - }, - ), - ), - }, - ), - ], - }, - ), - FunctionDef( - StmtFunctionDef { - range: 140..171, - name: Identifier( - "func", - ), - args: Arguments { - range: 154..161, - posonlyargs: [], - args: [], - vararg: Some( - Arg { - range: 155..161, - arg: Identifier( - "a", - ), - annotation: Some( - Starred( - ExprStarred { - range: 158..161, - value: Name( - ExprName { - range: 159..161, - id: Identifier( - "Ts", - ), - ctx: Load, - }, - ), - ctx: Load, - }, - ), - ), - type_comment: None, - }, - ), - kwonlyargs: [], - kwarg: None, - }, - body: [ - Expr( - StmtExpr { - range: 168..171, - value: Constant( - ExprConstant { - range: 168..171, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - returns: None, - type_comment: None, - type_params: [ - TypeVarTuple( - TypeParamTypeVarTuple { - range: 149..152, - name: Identifier( - "Ts", - ), - }, - ), - ], - }, - ), - FunctionDef( - StmtFunctionDef { - range: 173..230, - name: Identifier( - "func", - ), - args: Arguments { - range: 187..220, - posonlyargs: [], - args: [], - vararg: Some( - Arg { - range: 188..200, - arg: Identifier( - "args", - ), - annotation: Some( - Attribute( - ExprAttribute { - range: 194..200, - value: Name( - ExprName { - range: 194..195, - id: Identifier( - "P", - ), - ctx: Load, - }, - ), - attr: Identifier( - "args", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - }, - ), - kwonlyargs: [], - kwarg: Some( - Arg { - range: 204..220, - arg: Identifier( - "kwargs", - ), - annotation: Some( - Attribute( - ExprAttribute { - range: 
212..220, - value: Name( - ExprName { - range: 212..213, - id: Identifier( - "P", - ), - ctx: Load, - }, - ), - attr: Identifier( - "kwargs", - ), - ctx: Load, - }, - ), - ), - type_comment: None, - }, - ), - }, - body: [ - Expr( - StmtExpr { - range: 227..230, - value: Constant( - ExprConstant { - range: 227..230, - value: Ellipsis, - kind: None, - }, - ), - }, - ), - ], - decorator_list: [], - returns: None, - type_comment: None, - type_params: [ - ParamSpec( - TypeParamParamSpec { - range: 182..185, - name: Identifier( - "P", - ), - }, - ), - ], - }, - ), - FunctionDef( - StmtFunctionDef { - range: 232..273, - name: Identifier( - "func", - ), - args: Arguments { - range: 261..263, - posonlyargs: [], - args: [], - vararg: None, - kwonlyargs: [], - kwarg: None, - }, - body: [ - Pass( - StmtPass { - range: 269..273, - }, - ), - ], - decorator_list: [], - returns: None, - type_comment: None, - type_params: [ - TypeVar( - TypeParamTypeVar { - range: 241..242, - name: Identifier( - "T", - ), - bound: None, - }, - ), - TypeVar( - TypeParamTypeVar { - range: 244..250, - name: Identifier( - "U", - ), - bound: Some( - Name( - ExprName { - range: 247..250, - id: Identifier( - "str", - ), - ctx: Load, - }, - ), - ), - }, - ), - TypeVarTuple( - TypeParamTypeVarTuple { - range: 252..255, - name: Identifier( - "Ts", - ), - }, - ), - ParamSpec( - TypeParamParamSpec { - range: 257..260, - name: Identifier( - "P", - ), - }, - ), - ], - }, - ), -] diff --git a/parser/src/snapshots/rustpython_parser__parser__tests__patma.snap b/parser/src/snapshots/rustpython_parser__parser__tests__patma.snap index 04b97fe9..c7cce6e9 100644 --- a/parser/src/snapshots/rustpython_parser__parser__tests__patma.snap +++ b/parser/src/snapshots/rustpython_parser__parser__tests__patma.snap @@ -972,7 +972,7 @@ expression: parse_ast ExprConstant { range: 755..759, value: Float( - 0.25, + 0.0, ), kind: None, }, @@ -983,7 +983,7 @@ expression: parse_ast range: 762..767, value: Complex { real: 0.0, - imag: 1.75, + imag: 0.0, }, kind: None, }, @@ -2164,7 +2164,7 @@ expression: parse_ast ExprConstant { range: 1637..1641, value: Float( - 0.25, + 0.0, ), kind: None, }, @@ -2177,7 +2177,7 @@ expression: parse_ast range: 1644..1649, value: Complex { real: 0.0, - imag: 1.75, + imag: 0.0, }, kind: None, }, diff --git a/parser/src/soft_keywords.rs b/parser/src/soft_keywords.rs index 9abcd395..51278a46 100644 --- a/parser/src/soft_keywords.rs +++ b/parser/src/soft_keywords.rs @@ -134,7 +134,6 @@ where self.start_of_line = next.as_ref().map_or(false, |lex_result| { lex_result.as_ref().map_or(false, |(tok, _)| { - #[cfg(feature = "full-lexer")] if matches!(tok, Tok::NonLogicalNewline | Tok::Comment { .. }) { return self.start_of_line; } diff --git a/parser/src/token.rs b/parser/src/token.rs index 1fd78251..edf6d018 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -51,13 +51,11 @@ pub enum Tok { kind: MagicKind, }, /// Token value for a comment. These are filtered out of the token stream prior to parsing. - #[cfg(feature = "full-lexer")] Comment(String), /// Token value for a newline. Newline, /// Token value for a newline that is not a logical line break. These are filtered out of /// the token stream prior to parsing. - #[cfg(feature = "full-lexer")] NonLogicalNewline, /// Token value for an indent. 
Indent, @@ -235,7 +233,6 @@ impl fmt::Display for Tok { } MagicCommand { kind, value } => write!(f, "{kind}{value}"), Newline => f.write_str("Newline"), - #[cfg(feature = "full-lexer")] NonLogicalNewline => f.write_str("NonLogicalNewline"), Indent => f.write_str("Indent"), Dedent => f.write_str("Dedent"), @@ -249,7 +246,6 @@ impl fmt::Display for Tok { Rsqb => f.write_str("']'"), Colon => f.write_str("':'"), Comma => f.write_str("','"), - #[cfg(feature = "full-lexer")] Comment(value) => f.write_str(value), Semi => f.write_str("';'"), Plus => f.write_str("'+'"),