From 0578b17a4bf4718c8f740b20af073d47afe6bf2c Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Fri, 31 May 2024 08:51:49 +0530 Subject: [PATCH] Re-order lexer methods --- crates/ruff_python_parser/src/lexer.rs | 1628 ++++++++++++------------ 1 file changed, 814 insertions(+), 814 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 8933e4cb748a42..5b5bb3d213f0a6 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -125,431 +125,597 @@ impl<'src> Lexer<'src> { self.current_flags } - /// Helper function to push the given error and return the [`TokenKind::Unknown`] token. - fn push_error(&mut self, error: LexicalError) -> TokenKind { - self.errors.push(error); - TokenKind::Unknown + /// Takes the token value corresponding to the current token out of the lexer, replacing it + /// with the default value. + /// + /// All the subsequent call to this method without moving the lexer would always return the + /// default value which is [`TokenValue::None`]. + pub(crate) fn take_value(&mut self) -> TokenValue { + std::mem::take(&mut self.current_value) } - /// Try lexing the single character string prefix, updating the token flags accordingly. - /// Returns `true` if it matches. - fn try_single_char_prefix(&mut self, first: char) -> bool { - match first { - 'f' | 'F' => self.current_flags |= TokenFlags::F_STRING, - 'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING, - 'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING, - 'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE, - 'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE, - _ => return false, - } - true + /// Lex the next token. + pub fn next_token(&mut self) -> TokenKind { + self.cursor.start_token(); + self.current_value = TokenValue::None; + self.current_flags = TokenFlags::empty(); + self.current_kind = self.lex_token(); + self.current_range = self.token_range(); + self.current_kind } - /// Try lexing the double character string prefix, updating the token flags accordingly. - /// Returns `true` if it matches. - fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool { - match value { - ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => { - self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE; - } - ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => { - self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE; - } - ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => { - self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE; + fn lex_token(&mut self) -> TokenKind { + if let Some(fstring) = self.fstrings.current() { + if !fstring.is_in_expression(self.nesting) { + if let Some(token) = self.lex_fstring_middle_or_end() { + if matches!(token, TokenKind::FStringEnd) { + self.fstrings.pop(); + } + return token; + } } - ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => { - self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE; + } + // Return dedent tokens until the current indentation level matches the indentation of the next token. 
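+        //
+        // For example, the line `c` below closes two indented blocks at once,
+        // so two `Dedent` tokens are emitted before `c` is lexed:
+        //
+        // ```py
+        // if a:
+        //     if b:
+        //         pass
+        // c
+        // ```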
+ else if let Some(indentation) = self.pending_indentation.take() { + match self.indentations.current().try_compare(indentation) { + Ok(Ordering::Greater) => { + self.pending_indentation = Some(indentation); + if self.indentations.dedent_one(indentation).is_err() { + return self.push_error(LexicalError::new( + LexicalErrorType::IndentationError, + self.token_range(), + )); + } + return TokenKind::Dedent; + } + Ok(_) => {} + Err(_) => { + return self.push_error(LexicalError::new( + LexicalErrorType::IndentationError, + self.token_range(), + )); + } } - _ => return false, } - true - } - /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier(&mut self, first: char) -> TokenKind { - // Detect potential string like rb'' b'' f'' u'' r'' - let quote = match (first, self.cursor.first()) { - (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| { - self.cursor.bump(); - quote - }), - (_, second) if is_quote(self.cursor.second()) => { - self.try_double_char_prefix([first, second]).then(|| { - self.cursor.bump(); - // SAFETY: Safe because of the `is_quote` check in this match arm's guard - self.cursor.bump().unwrap() - }) + if self.state.is_after_newline() { + if let Some(indentation) = self.eat_indentation() { + return indentation; } - _ => None, - }; - - if let Some(quote) = quote { - if self.current_flags.is_f_string() { - return self.lex_fstring_start(quote); + } else { + if let Err(error) = self.skip_whitespace() { + return self.push_error(error); } - - return self.lex_string(quote); } - // Keep track of whether the identifier is ASCII-only or not. - // - // This is important because Python applies NFKC normalization to - // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers. - // We need to therefore do the same in our lexer, but applying NFKC normalization - // unconditionally is extremely expensive. If we know an identifier is ASCII-only, - // (by far the most common case), we can skip NFKC normalization of the identifier. 
-        let mut is_ascii = first.is_ascii();
-        self.cursor
-            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
-
-        let text = self.token_text();
+        // The lexer might've skipped whitespaces, so update the start offset
+        self.cursor.start_token();
 
-        if !is_ascii {
-            self.current_value = TokenValue::Name(text.nfkc().collect::<String>().into_boxed_str());
-            return TokenKind::Name;
-        }
+        if let Some(c) = self.cursor.bump() {
+            if c.is_ascii() {
+                self.consume_ascii_character(c)
+            } else if is_unicode_identifier_start(c) {
+                let identifier = self.lex_identifier(c);
+                self.state = State::Other;
 
-        match text {
-            "False" => TokenKind::False,
-            "None" => TokenKind::None,
-            "True" => TokenKind::True,
-            "and" => TokenKind::And,
-            "as" => TokenKind::As,
-            "assert" => TokenKind::Assert,
-            "async" => TokenKind::Async,
-            "await" => TokenKind::Await,
-            "break" => TokenKind::Break,
-            "case" => TokenKind::Case,
-            "class" => TokenKind::Class,
-            "continue" => TokenKind::Continue,
-            "def" => TokenKind::Def,
-            "del" => TokenKind::Del,
-            "elif" => TokenKind::Elif,
-            "else" => TokenKind::Else,
-            "except" => TokenKind::Except,
-            "finally" => TokenKind::Finally,
-            "for" => TokenKind::For,
-            "from" => TokenKind::From,
-            "global" => TokenKind::Global,
-            "if" => TokenKind::If,
-            "import" => TokenKind::Import,
-            "in" => TokenKind::In,
-            "is" => TokenKind::Is,
-            "lambda" => TokenKind::Lambda,
-            "match" => TokenKind::Match,
-            "nonlocal" => TokenKind::Nonlocal,
-            "not" => TokenKind::Not,
-            "or" => TokenKind::Or,
-            "pass" => TokenKind::Pass,
-            "raise" => TokenKind::Raise,
-            "return" => TokenKind::Return,
-            "try" => TokenKind::Try,
-            "type" => TokenKind::Type,
-            "while" => TokenKind::While,
-            "with" => TokenKind::With,
-            "yield" => TokenKind::Yield,
-            _ => {
-                self.current_value = TokenValue::Name(text.to_string().into_boxed_str());
-                TokenKind::Name
+                identifier
+            } else {
+                self.push_error(LexicalError::new(
+                    LexicalErrorType::UnrecognizedToken { tok: c },
+                    self.token_range(),
+                ))
             }
+        } else {
+            // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
+            // empty the dedent stack, and finally, return the EndOfFile token.
+            self.consume_end()
         }
     }
 
-    /// Numeric lexing. The feast can start!
-    fn lex_number(&mut self, first: char) -> TokenKind {
-        if first == '0' {
-            if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
-                self.lex_number_radix(Radix::Hex)
-            } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
-                self.lex_number_radix(Radix::Octal)
-            } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
-                self.lex_number_radix(Radix::Binary)
-            } else {
-                self.lex_decimal_number(first)
+    fn eat_indentation(&mut self) -> Option<TokenKind> {
+        let mut indentation = Indentation::root();
+
+        loop {
+            match self.cursor.first() {
+                ' ' => {
+                    self.cursor.bump();
+                    indentation = indentation.add_space();
+                }
+                '\t' => {
+                    self.cursor.bump();
+                    indentation = indentation.add_tab();
+                }
+                '\\' => {
+                    self.cursor.bump();
+                    if self.cursor.eat_char('\r') {
+                        self.cursor.eat_char('\n');
+                    } else if self.cursor.is_eof() {
+                        return Some(self.push_error(LexicalError::new(
+                            LexicalErrorType::Eof,
+                            self.token_range(),
+                        )));
+                    } else if !self.cursor.eat_char('\n') {
+                        return Some(self.push_error(LexicalError::new(
+                            LexicalErrorType::LineContinuationError,
+                            self.token_range(),
+                        )));
+                    }
+                    indentation = Indentation::root();
+                }
+                // Form feed
+                '\x0C' => {
+                    self.cursor.bump();
+                    indentation = Indentation::root();
+                }
+                _ => break,
             }
-        } else {
-            self.lex_decimal_number(first)
         }
-    }
 
-    /// Lex a hex/octal/decimal/binary number without a decimal point.
-    fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
-        #[cfg(debug_assertions)]
-        debug_assert!(matches!(
-            self.cursor.previous().to_ascii_lowercase(),
-            'x' | 'o' | 'b'
-        ));
+        // Handle indentation if this is a new, non-blank logical line
+        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
+            self.state = State::NonEmptyLogicalLine;
 
-        // Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`).
-        let mut number = LexedText::new(self.offset(), self.source);
-        self.radix_run(&mut number, radix);
+            // The state is now `NonEmptyLogicalLine`, so indentation won't be
+            // handled again on the next call.
+            return self.handle_indentation(indentation);
+        }
 
-        // Extract the entire number, including the base prefix (e.g., `0x9D5`).
-        let token = &self.source[self.token_range()];
+        None
+    }
 
-        let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
-            Ok(int) => int,
-            Err(err) => {
-                return self.push_error(LexicalError::new(
-                    LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
-                    self.token_range(),
-                ));
-            }
-        };
-        self.current_value = TokenValue::Int(value);
-        TokenKind::Int
-    }
-
-    /// Lex a normal number, that is, no octal, hex or binary number.
-    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
-        #[cfg(debug_assertions)]
-        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
-        let start_is_zero = first_digit_or_dot == '0';
+    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
+        let token = match self.indentations.current().try_compare(indentation) {
+            // Dedent
+            Ok(Ordering::Greater) => {
+                self.pending_indentation = Some(indentation);
 
-        let mut number = LexedText::new(self.token_start(), self.source);
-        if first_digit_or_dot != '.' {
-            number.push(first_digit_or_dot);
-            self.radix_run(&mut number, Radix::Decimal);
-        };
+                if self.indentations.dedent_one(indentation).is_err() {
+                    return Some(self.push_error(LexicalError::new(
+                        LexicalErrorType::IndentationError,
+                        self.token_range(),
+                    )));
+                };
 
-        let is_float = if first_digit_or_dot == '.'
|| self.cursor.eat_char('.') { - number.push('.'); + // The lexer might've eaten some whitespaces to calculate the `indentation`. For + // example: + // + // ```py + // if first: + // if second: + // pass + // foo + // # ^ + // ``` + // + // Here, the cursor is at `^` and the `indentation` contains the whitespaces before + // the `pass` token. + self.cursor.start_token(); - if self.cursor.eat_char('_') { - return self.push_error(LexicalError::new( - LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()), - TextRange::new(self.offset() - TextSize::new(1), self.offset()), - )); + Some(TokenKind::Dedent) } - self.radix_run(&mut number, Radix::Decimal); - true - } else { - // Normal number: - false - }; - - let is_float = match self.cursor.rest().as_bytes() { - [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => { - // 'e' | 'E' - number.push(self.cursor.bump().unwrap()); - - if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) { - number.push(sign); - } - - self.radix_run(&mut number, Radix::Decimal); + Ok(Ordering::Equal) => None, - true + // Indent + Ok(Ordering::Less) => { + self.indentations.indent(indentation); + Some(TokenKind::Indent) + } + Err(_) => { + return Some(self.push_error(LexicalError::new( + LexicalErrorType::IndentationError, + self.token_range(), + ))); } - _ => is_float, }; - if is_float { - // Improvement: Use `Cow` instead of pushing to value text - let Ok(value) = f64::from_str(number.as_str()) else { - return self.push_error(LexicalError::new( - LexicalErrorType::OtherError( - "Invalid decimal literal".to_string().into_boxed_str(), - ), - self.token_range(), - )); - }; + token + } - // Parse trailing 'j': - if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { - self.current_value = TokenValue::Complex { - real: 0.0, - imag: value, - }; - TokenKind::Complex - } else { - self.current_value = TokenValue::Float(value); - TokenKind::Float - } - } else { - // Parse trailing 'j': - if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { - let imag = f64::from_str(number.as_str()).unwrap(); - self.current_value = TokenValue::Complex { real: 0.0, imag }; - TokenKind::Complex - } else { - let value = match Int::from_str(number.as_str()) { - Ok(value) => { - if start_is_zero && value.as_u8() != Some(0) { - // Leading zeros in decimal integer literals are not permitted. 
- return self.push_error(LexicalError::new( - LexicalErrorType::OtherError( - "Invalid decimal integer literal" - .to_string() - .into_boxed_str(), - ), - self.token_range(), - )); - } - value - } - Err(err) => { - return self.push_error(LexicalError::new( - LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()), + fn skip_whitespace(&mut self) -> Result<(), LexicalError> { + loop { + match self.cursor.first() { + ' ' => { + self.cursor.bump(); + } + '\t' => { + self.cursor.bump(); + } + '\\' => { + self.cursor.bump(); + if self.cursor.eat_char('\r') { + self.cursor.eat_char('\n'); + } else if self.cursor.is_eof() { + return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range())); + } else if !self.cursor.eat_char('\n') { + return Err(LexicalError::new( + LexicalErrorType::LineContinuationError, self.token_range(), - )) + )); } - }; - self.current_value = TokenValue::Int(value); - TokenKind::Int + } + // Form feed + '\x0C' => { + self.cursor.bump(); + } + _ => break, } } + + Ok(()) } - /// Consume a sequence of numbers with the given radix, - /// the digits can be decorated with underscores - /// like this: '`1_2_3_4`' == '1234' - fn radix_run(&mut self, number: &mut LexedText, radix: Radix) { - loop { - if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { - number.push(c); + // Dispatch based on the given character. + fn consume_ascii_character(&mut self, c: char) -> TokenKind { + let token = match c { + c if is_ascii_identifier_start(c) => self.lex_identifier(c), + '0'..='9' => self.lex_number(c), + '#' => return self.lex_comment(), + '\'' | '"' => self.lex_string(c), + '=' => { + if self.cursor.eat_char('=') { + TokenKind::EqEqual + } else { + self.state = State::AfterEqual; + return TokenKind::Equal; + } } - // Number that contains `_` separators. Remove them from the parsed text. - else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { - // Skip over `_` - self.cursor.bump(); - number.skip_char(); - } else { - break; + '+' => { + if self.cursor.eat_char('=') { + TokenKind::PlusEqual + } else { + TokenKind::Plus + } + } + '*' => { + if self.cursor.eat_char('=') { + TokenKind::StarEqual + } else if self.cursor.eat_char('*') { + if self.cursor.eat_char('=') { + TokenKind::DoubleStarEqual + } else { + TokenKind::DoubleStar + } + } else { + TokenKind::Star + } } - } - } - /// Lex a single comment. - fn lex_comment(&mut self) -> TokenKind { - #[cfg(debug_assertions)] - debug_assert_eq!(self.cursor.previous(), '#'); + c @ ('%' | '!') + if self.mode == Mode::Ipython + && self.state.is_after_equal() + && self.nesting == 0 => + { + // SAFETY: Safe because `c` has been matched against one of the possible escape command token + self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap()) + } - let bytes = self.cursor.rest().as_bytes(); - let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); - self.cursor.skip_bytes(offset); + c @ ('%' | '!' | '?' | '/' | ';' | ',') + if self.mode == Mode::Ipython && self.state.is_new_logical_line() => + { + let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) { + self.cursor.bump(); + kind + } else { + // SAFETY: Safe because `c` has been matched against one of the possible escape command token + IpyEscapeKind::try_from(c).unwrap() + }; - TokenKind::Comment - } + self.lex_ipython_escape_command(kind) + } - /// Lex a single IPython escape command. 
- fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind { - let mut value = String::new(); + '?' if self.mode == Mode::Ipython => TokenKind::Question, - loop { - match self.cursor.first() { - '\\' => { - // Only skip the line continuation if it is followed by a newline - // otherwise it is a normal backslash which is part of the magic command: - // - // Skip this backslash - // v - // !pwd \ - // && ls -a | sed 's/^/\\ /' - // ^^ - // Don't skip these backslashes - if self.cursor.second() == '\r' { - self.cursor.bump(); - self.cursor.bump(); - self.cursor.eat_char('\n'); - continue; - } else if self.cursor.second() == '\n' { - self.cursor.bump(); - self.cursor.bump(); - continue; + '/' => { + if self.cursor.eat_char('=') { + TokenKind::SlashEqual + } else if self.cursor.eat_char('/') { + if self.cursor.eat_char('=') { + TokenKind::DoubleSlashEqual + } else { + TokenKind::DoubleSlash } - - self.cursor.bump(); - value.push('\\'); + } else { + TokenKind::Slash } - // Help end escape commands are those that end with 1 or 2 question marks. - // Here, we're only looking for a subset of help end escape commands which - // are the ones that has the escape token at the start of the line as well. - // On the other hand, we're not looking for help end escape commands that - // are strict in the sense that the escape token is only at the end. For example, - // - // * `%foo?` is recognized as a help end escape command but not as a strict one. - // * `foo?` is recognized as a strict help end escape command which is not - // lexed here but is identified at the parser level. - // - // Help end escape commands implemented in the IPython codebase using regex: - // https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462 - '?' => { - self.cursor.bump(); - let mut question_count = 1u32; - while self.cursor.eat_char('?') { - question_count += 1; + } + '%' => { + if self.cursor.eat_char('=') { + TokenKind::PercentEqual + } else { + TokenKind::Percent + } + } + '|' => { + if self.cursor.eat_char('=') { + TokenKind::VbarEqual + } else { + TokenKind::Vbar + } + } + '^' => { + if self.cursor.eat_char('=') { + TokenKind::CircumflexEqual + } else { + TokenKind::CircumFlex + } + } + '&' => { + if self.cursor.eat_char('=') { + TokenKind::AmperEqual + } else { + TokenKind::Amper + } + } + '-' => { + if self.cursor.eat_char('=') { + TokenKind::MinusEqual + } else if self.cursor.eat_char('>') { + TokenKind::Rarrow + } else { + TokenKind::Minus + } + } + '@' => { + if self.cursor.eat_char('=') { + TokenKind::AtEqual + } else { + TokenKind::At + } + } + '!' 
=> { + if self.cursor.eat_char('=') { + TokenKind::NotEqual + } else { + TokenKind::Exclamation + } + } + '~' => TokenKind::Tilde, + '(' => { + self.nesting += 1; + TokenKind::Lpar + } + ')' => { + self.nesting = self.nesting.saturating_sub(1); + TokenKind::Rpar + } + '[' => { + self.nesting += 1; + TokenKind::Lsqb + } + ']' => { + self.nesting = self.nesting.saturating_sub(1); + TokenKind::Rsqb + } + '{' => { + self.nesting += 1; + TokenKind::Lbrace + } + '}' => { + if let Some(fstring) = self.fstrings.current_mut() { + if fstring.nesting() == self.nesting { + return self.push_error(LexicalError::new( + LexicalErrorType::FStringError(FStringErrorType::SingleRbrace), + self.token_range(), + )); } - - // The original implementation in the IPython codebase is based on regex which - // means that it's strict in the sense that it won't recognize a help end escape: - // * If there's any whitespace before the escape token (e.g. `%foo ?`) - // * If there are more than 2 question mark tokens (e.g. `%foo???`) - // which is what we're doing here as well. In that case, we'll continue with - // the prefixed escape token. - // - // Now, the whitespace and empty value check also makes sure that an empty - // command (e.g. `%?` or `? ??`, no value after/between the escape tokens) - // is not recognized as a help end escape command. So, `%?` and `? ??` are - // `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??` - // tokens. - if question_count > 2 - || value.chars().last().map_or(true, is_python_whitespace) - || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR) - { - // Not a help end escape command, so continue with the lexing. - value.reserve(question_count as usize); - for _ in 0..question_count { - value.push('?'); - } - continue; + fstring.try_end_format_spec(self.nesting); + } + self.nesting = self.nesting.saturating_sub(1); + TokenKind::Rbrace + } + ':' => { + if self + .fstrings + .current_mut() + .is_some_and(|fstring| fstring.try_start_format_spec(self.nesting)) + { + TokenKind::Colon + } else if self.cursor.eat_char('=') { + TokenKind::ColonEqual + } else { + TokenKind::Colon + } + } + ';' => TokenKind::Semi, + '<' => { + if self.cursor.eat_char('<') { + if self.cursor.eat_char('=') { + TokenKind::LeftShiftEqual + } else { + TokenKind::LeftShift + } + } else if self.cursor.eat_char('=') { + TokenKind::LessEqual + } else { + TokenKind::Less + } + } + '>' => { + if self.cursor.eat_char('>') { + if self.cursor.eat_char('=') { + TokenKind::RightShiftEqual + } else { + TokenKind::RightShift + } + } else if self.cursor.eat_char('=') { + TokenKind::GreaterEqual + } else { + TokenKind::Greater + } + } + ',' => TokenKind::Comma, + '.' => { + if self.cursor.first().is_ascii_digit() { + self.lex_decimal_number('.') + } else if self.cursor.eat_char2('.', '.') { + TokenKind::Ellipsis + } else { + TokenKind::Dot + } + } + '\n' => { + return if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + TokenKind::Newline + } else { + if let Some(fstring) = self.fstrings.current_mut() { + fstring.try_end_format_spec(self.nesting); } + TokenKind::NonLogicalNewline + } + } + '\r' => { + self.cursor.eat_char('\n'); - if escape_kind.is_help() { - // If we've recognize this as a help end escape command, then - // any question mark token / whitespaces at the start are not - // considered as part of the value. - // - // For example, `??foo?` is recognized as `IpyEscapeKind::Help` and - // `value` is `foo` instead of `??foo`. 
- value = value.trim_start_matches([' ', '?']).to_string(); - } else if escape_kind.is_magic() { - // Between `%` and `?` (at the end), the `?` takes priority - // over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help` - // and `value` is `%foo` instead of `foo`. So, we need to - // insert the magic escape token at the start. - value.insert_str(0, escape_kind.as_str()); + return if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + TokenKind::Newline + } else { + if let Some(fstring) = self.fstrings.current_mut() { + fstring.try_end_format_spec(self.nesting); } + TokenKind::NonLogicalNewline + }; + } - let kind = match question_count { - 1 => IpyEscapeKind::Help, - 2 => IpyEscapeKind::Help2, - _ => unreachable!("`question_count` is always 1 or 2"), - }; + _ => { + self.state = State::Other; - self.current_value = TokenValue::IpyEscapeCommand { - kind, - value: value.into_boxed_str(), - }; + return self.push_error(LexicalError::new( + LexicalErrorType::UnrecognizedToken { tok: c }, + self.token_range(), + )); + } + }; - return TokenKind::IpyEscapeCommand; - } - '\n' | '\r' | EOF_CHAR => { - self.current_value = TokenValue::IpyEscapeCommand { - kind: escape_kind, - value: value.into_boxed_str(), - }; + self.state = State::Other; + + token + } + + /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. + fn lex_identifier(&mut self, first: char) -> TokenKind { + // Detect potential string like rb'' b'' f'' u'' r'' + let quote = match (first, self.cursor.first()) { + (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| { + self.cursor.bump(); + quote + }), + (_, second) if is_quote(self.cursor.second()) => { + self.try_double_char_prefix([first, second]).then(|| { + self.cursor.bump(); + // SAFETY: Safe because of the `is_quote` check in this match arm's guard + self.cursor.bump().unwrap() + }) + } + _ => None, + }; + + if let Some(quote) = quote { + if self.current_flags.is_f_string() { + return self.lex_fstring_start(quote); + } + + return self.lex_string(quote); + } + + // Keep track of whether the identifier is ASCII-only or not. + // + // This is important because Python applies NFKC normalization to + // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers. + // We need to therefore do the same in our lexer, but applying NFKC normalization + // unconditionally is extremely expensive. If we know an identifier is ASCII-only, + // (by far the most common case), we can skip NFKC normalization of the identifier. 
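+        //
+        // As an example of what NFKC folding does: it maps the ligature `ﬁ`
+        // (U+FB01) to `fi`, so the identifiers `ﬁnd` and `find` name the
+        // same binding.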
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
+
+        let text = self.token_text();
+
+        if !is_ascii {
+            self.current_value = TokenValue::Name(text.nfkc().collect::<String>().into_boxed_str());
+            return TokenKind::Name;
+        }
+
+        match text {
+            "False" => TokenKind::False,
+            "None" => TokenKind::None,
+            "True" => TokenKind::True,
+            "and" => TokenKind::And,
+            "as" => TokenKind::As,
+            "assert" => TokenKind::Assert,
+            "async" => TokenKind::Async,
+            "await" => TokenKind::Await,
+            "break" => TokenKind::Break,
+            "case" => TokenKind::Case,
+            "class" => TokenKind::Class,
+            "continue" => TokenKind::Continue,
+            "def" => TokenKind::Def,
+            "del" => TokenKind::Del,
+            "elif" => TokenKind::Elif,
+            "else" => TokenKind::Else,
+            "except" => TokenKind::Except,
+            "finally" => TokenKind::Finally,
+            "for" => TokenKind::For,
+            "from" => TokenKind::From,
+            "global" => TokenKind::Global,
+            "if" => TokenKind::If,
+            "import" => TokenKind::Import,
+            "in" => TokenKind::In,
+            "is" => TokenKind::Is,
+            "lambda" => TokenKind::Lambda,
+            "match" => TokenKind::Match,
+            "nonlocal" => TokenKind::Nonlocal,
+            "not" => TokenKind::Not,
+            "or" => TokenKind::Or,
+            "pass" => TokenKind::Pass,
+            "raise" => TokenKind::Raise,
+            "return" => TokenKind::Return,
+            "try" => TokenKind::Try,
+            "type" => TokenKind::Type,
+            "while" => TokenKind::While,
+            "with" => TokenKind::With,
+            "yield" => TokenKind::Yield,
+            _ => {
+                self.current_value = TokenValue::Name(text.to_string().into_boxed_str());
+                TokenKind::Name
+            }
+        }
+    }
 
+    /// Try lexing the single character string prefix, updating the token flags accordingly.
+    /// Returns `true` if it matches.
+    fn try_single_char_prefix(&mut self, first: char) -> bool {
+        match first {
+            'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
+            'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
+            'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
+            'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
+            'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
+            _ => return false,
+        }
+        true
+    }
+
+    /// Try lexing the double character string prefix, updating the token flags accordingly.
+    /// Returns `true` if it matches.
+    fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
+        match value {
+            ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
+                self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
+            }
+            ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
+                self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
+            }
+            ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
+                self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
+            }
+            ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
+                self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
+            }
+            _ => return false,
+        }
+        true
+    }
 
     /// Lex a f-string start token.
@@ -704,8 +870,8 @@ impl<'src> Lexer<'src> {
         };
         self.current_value = TokenValue::FStringMiddle(value.into_boxed_str());
-        self.current_flags = fstring.flags();
 
+        self.current_flags = fstring.flags();
         Some(TokenKind::FStringMiddle)
     }
 
@@ -820,485 +986,322 @@ impl<'src> Lexer<'src> {
         TokenKind::String
     }
 
-    /// Lex the next token.
- pub fn next_token(&mut self) -> TokenKind { - self.cursor.start_token(); - self.current_value = TokenValue::None; - self.current_flags = TokenFlags::empty(); - self.current_kind = self.lex_token(); - self.current_range = self.token_range(); - self.current_kind - } - - fn lex_token(&mut self) -> TokenKind { - if let Some(fstring) = self.fstrings.current() { - if !fstring.is_in_expression(self.nesting) { - if let Some(token) = self.lex_fstring_middle_or_end() { - if matches!(token, TokenKind::FStringEnd) { - self.fstrings.pop(); - } - return token; - } - } - } - // Return dedent tokens until the current indentation level matches the indentation of the next token. - else if let Some(indentation) = self.pending_indentation.take() { - match self.indentations.current().try_compare(indentation) { - Ok(Ordering::Greater) => { - self.pending_indentation = Some(indentation); - if self.indentations.dedent_one(indentation).is_err() { - return self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, - self.token_range(), - )); - } - return TokenKind::Dedent; - } - Ok(_) => {} - Err(_) => { - return self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, - self.token_range(), - )); - } - } - } - - if self.state.is_after_newline() { - if let Some(indentation) = self.eat_indentation() { - return indentation; - } - } else { - if let Err(error) = self.skip_whitespace() { - return self.push_error(error); - } - } - - // The lexer might've skipped whitespaces, so update the start offset - self.cursor.start_token(); - - if let Some(c) = self.cursor.bump() { - if c.is_ascii() { - self.consume_ascii_character(c) - } else if is_unicode_identifier_start(c) { - let identifier = self.lex_identifier(c); - self.state = State::Other; - - identifier + /// Numeric lexing. The feast can start! + fn lex_number(&mut self, first: char) -> TokenKind { + if first == '0' { + if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() { + self.lex_number_radix(Radix::Hex) + } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() { + self.lex_number_radix(Radix::Octal) + } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() { + self.lex_number_radix(Radix::Binary) } else { - self.push_error(LexicalError::new( - LexicalErrorType::UnrecognizedToken { tok: c }, - self.token_range(), - )) + self.lex_decimal_number(first) } } else { - // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line, - // empty the dedent stack, and finally, return the EndOfFile token. 
- self.consume_end() - } - } - - fn skip_whitespace(&mut self) -> Result<(), LexicalError> { - loop { - match self.cursor.first() { - ' ' => { - self.cursor.bump(); - } - '\t' => { - self.cursor.bump(); - } - '\\' => { - self.cursor.bump(); - if self.cursor.eat_char('\r') { - self.cursor.eat_char('\n'); - } else if self.cursor.is_eof() { - return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range())); - } else if !self.cursor.eat_char('\n') { - return Err(LexicalError::new( - LexicalErrorType::LineContinuationError, - self.token_range(), - )); - } - } - // Form feed - '\x0C' => { - self.cursor.bump(); - } - _ => break, - } - } - - Ok(()) - } - - fn eat_indentation(&mut self) -> Option { - let mut indentation = Indentation::root(); - - loop { - match self.cursor.first() { - ' ' => { - self.cursor.bump(); - indentation = indentation.add_space(); - } - '\t' => { - self.cursor.bump(); - indentation = indentation.add_tab(); - } - '\\' => { - self.cursor.bump(); - if self.cursor.eat_char('\r') { - self.cursor.eat_char('\n'); - } else if self.cursor.is_eof() { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::Eof, - self.token_range(), - ))); - } else if !self.cursor.eat_char('\n') { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::LineContinuationError, - self.token_range(), - ))); - } - indentation = Indentation::root(); - } - // Form feed - '\x0C' => { - self.cursor.bump(); - indentation = Indentation::root(); - } - _ => break, - } - } - - // Handle indentation if this is a new, not all empty, logical line - if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) { - self.state = State::NonEmptyLogicalLine; - - // Set to false so that we don't handle indentation on the next call. - return self.handle_indentation(indentation); + self.lex_decimal_number(first) } - - None } - fn handle_indentation(&mut self, indentation: Indentation) -> Option { - let token = match self.indentations.current().try_compare(indentation) { - // Dedent - Ok(Ordering::Greater) => { - self.pending_indentation = Some(indentation); - - if self.indentations.dedent_one(indentation).is_err() { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, - self.token_range(), - ))); - }; - - // The lexer might've eaten some whitespaces to calculate the `indentation`. For - // example: - // - // ```py - // if first: - // if second: - // pass - // foo - // # ^ - // ``` - // - // Here, the cursor is at `^` and the `indentation` contains the whitespaces before - // the `pass` token. - self.cursor.start_token(); - - Some(TokenKind::Dedent) - } + /// Lex a hex/octal/decimal/binary number without a decimal point. + fn lex_number_radix(&mut self, radix: Radix) -> TokenKind { + #[cfg(debug_assertions)] + debug_assert!(matches!( + self.cursor.previous().to_ascii_lowercase(), + 'x' | 'o' | 'b' + )); - Ok(Ordering::Equal) => None, + // Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`). + let mut number = LexedText::new(self.offset(), self.source); + self.radix_run(&mut number, radix); - // Indent - Ok(Ordering::Less) => { - self.indentations.indent(indentation); - Some(TokenKind::Indent) - } - Err(_) => { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, + // Extract the entire number, including the base prefix (e.g., `0x9D5`). 
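+        // The base prefix and any `_` separators never reach `number`: for
+        // `0x_9D_5`, `number` holds the digits `9D5` while `token` keeps the
+        // full original text.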
+ let token = &self.source[self.token_range()]; + + let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) { + Ok(int) => int, + Err(err) => { + return self.push_error(LexicalError::new( + LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()), self.token_range(), - ))); + )); } }; - - token + self.current_value = TokenValue::Int(value); + TokenKind::Int } - fn consume_end(&mut self) -> TokenKind { - // We reached end of file. - // First of all, we need all nestings to be finished. - if self.nesting > 0 { - // Reset the nesting to avoid going into infinite loop. - self.nesting = 0; - return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range())); - } + /// Lex a normal number, that is, no octal, hex or binary number. + fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind { + #[cfg(debug_assertions)] + debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.'); + let start_is_zero = first_digit_or_dot == '0'; - // Next, insert a trailing newline, if required. - if !self.state.is_new_logical_line() { - self.state = State::AfterNewline; - TokenKind::Newline - } - // Next, flush the indentation stack to zero. - else if self.indentations.dedent().is_some() { - TokenKind::Dedent - } else { - TokenKind::EndOfFile - } - } + let mut number = LexedText::new(self.token_start(), self.source); + if first_digit_or_dot != '.' { + number.push(first_digit_or_dot); + self.radix_run(&mut number, Radix::Decimal); + }; - // Dispatch based on the given character. - fn consume_ascii_character(&mut self, c: char) -> TokenKind { - let token = match c { - c if is_ascii_identifier_start(c) => self.lex_identifier(c), - '0'..='9' => self.lex_number(c), - '#' => return self.lex_comment(), - '\'' | '"' => self.lex_string(c), - '=' => { - if self.cursor.eat_char('=') { - TokenKind::EqEqual - } else { - self.state = State::AfterEqual; - return TokenKind::Equal; - } - } - '+' => { - if self.cursor.eat_char('=') { - TokenKind::PlusEqual - } else { - TokenKind::Plus - } - } - '*' => { - if self.cursor.eat_char('=') { - TokenKind::StarEqual - } else if self.cursor.eat_char('*') { - if self.cursor.eat_char('=') { - TokenKind::DoubleStarEqual - } else { - TokenKind::DoubleStar - } - } else { - TokenKind::Star - } - } + let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') { + number.push('.'); - c @ ('%' | '!') - if self.mode == Mode::Ipython - && self.state.is_after_equal() - && self.nesting == 0 => - { - // SAFETY: Safe because `c` has been matched against one of the possible escape command token - self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap()) + if self.cursor.eat_char('_') { + return self.push_error(LexicalError::new( + LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()), + TextRange::new(self.offset() - TextSize::new(1), self.offset()), + )); } - c @ ('%' | '!' | '?' | '/' | ';' | ',') - if self.mode == Mode::Ipython && self.state.is_new_logical_line() => - { - let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) { - self.cursor.bump(); - kind - } else { - // SAFETY: Safe because `c` has been matched against one of the possible escape command token - IpyEscapeKind::try_from(c).unwrap() - }; - - self.lex_ipython_escape_command(kind) - } + self.radix_run(&mut number, Radix::Decimal); + true + } else { + // Normal number: + false + }; - '?' 
if self.mode == Mode::Ipython => TokenKind::Question, + let is_float = match self.cursor.rest().as_bytes() { + [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => { + // 'e' | 'E' + number.push(self.cursor.bump().unwrap()); - '/' => { - if self.cursor.eat_char('=') { - TokenKind::SlashEqual - } else if self.cursor.eat_char('/') { - if self.cursor.eat_char('=') { - TokenKind::DoubleSlashEqual - } else { - TokenKind::DoubleSlash - } - } else { - TokenKind::Slash - } - } - '%' => { - if self.cursor.eat_char('=') { - TokenKind::PercentEqual - } else { - TokenKind::Percent - } - } - '|' => { - if self.cursor.eat_char('=') { - TokenKind::VbarEqual - } else { - TokenKind::Vbar - } - } - '^' => { - if self.cursor.eat_char('=') { - TokenKind::CircumflexEqual - } else { - TokenKind::CircumFlex - } - } - '&' => { - if self.cursor.eat_char('=') { - TokenKind::AmperEqual - } else { - TokenKind::Amper - } - } - '-' => { - if self.cursor.eat_char('=') { - TokenKind::MinusEqual - } else if self.cursor.eat_char('>') { - TokenKind::Rarrow - } else { - TokenKind::Minus - } - } - '@' => { - if self.cursor.eat_char('=') { - TokenKind::AtEqual - } else { - TokenKind::At - } - } - '!' => { - if self.cursor.eat_char('=') { - TokenKind::NotEqual - } else { - TokenKind::Exclamation + if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) { + number.push(sign); } + + self.radix_run(&mut number, Radix::Decimal); + + true } - '~' => TokenKind::Tilde, - '(' => { - self.nesting += 1; - TokenKind::Lpar - } - ')' => { - self.nesting = self.nesting.saturating_sub(1); - TokenKind::Rpar - } - '[' => { - self.nesting += 1; - TokenKind::Lsqb - } - ']' => { - self.nesting = self.nesting.saturating_sub(1); - TokenKind::Rsqb - } - '{' => { - self.nesting += 1; - TokenKind::Lbrace + _ => is_float, + }; + + if is_float { + // Improvement: Use `Cow` instead of pushing to value text + let Ok(value) = f64::from_str(number.as_str()) else { + return self.push_error(LexicalError::new( + LexicalErrorType::OtherError( + "Invalid decimal literal".to_string().into_boxed_str(), + ), + self.token_range(), + )); + }; + + // Parse trailing 'j': + if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { + self.current_value = TokenValue::Complex { + real: 0.0, + imag: value, + }; + TokenKind::Complex + } else { + self.current_value = TokenValue::Float(value); + TokenKind::Float } - '}' => { - if let Some(fstring) = self.fstrings.current_mut() { - if fstring.nesting() == self.nesting { + } else { + // Parse trailing 'j': + if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { + let imag = f64::from_str(number.as_str()).unwrap(); + self.current_value = TokenValue::Complex { real: 0.0, imag }; + TokenKind::Complex + } else { + let value = match Int::from_str(number.as_str()) { + Ok(value) => { + if start_is_zero && value.as_u8() != Some(0) { + // Leading zeros in decimal integer literals are not permitted. 
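+                            // For example, `0`, `00`, and `0_0` all lex as
+                            // integer zero, but `01` is rejected here because
+                            // its value is non-zero.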
+ return self.push_error(LexicalError::new( + LexicalErrorType::OtherError( + "Invalid decimal integer literal" + .to_string() + .into_boxed_str(), + ), + self.token_range(), + )); + } + value + } + Err(err) => { return self.push_error(LexicalError::new( - LexicalErrorType::FStringError(FStringErrorType::SingleRbrace), + LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()), self.token_range(), - )); + )) } - fstring.try_end_format_spec(self.nesting); - } - self.nesting = self.nesting.saturating_sub(1); - TokenKind::Rbrace + }; + self.current_value = TokenValue::Int(value); + TokenKind::Int } - ':' => { - if self - .fstrings - .current_mut() - .is_some_and(|fstring| fstring.try_start_format_spec(self.nesting)) - { - TokenKind::Colon - } else if self.cursor.eat_char('=') { - TokenKind::ColonEqual - } else { - TokenKind::Colon - } + } + } + + /// Consume a sequence of numbers with the given radix, + /// the digits can be decorated with underscores + /// like this: '`1_2_3_4`' == '1234' + fn radix_run(&mut self, number: &mut LexedText, radix: Radix) { + loop { + if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { + number.push(c); } - ';' => TokenKind::Semi, - '<' => { - if self.cursor.eat_char('<') { - if self.cursor.eat_char('=') { - TokenKind::LeftShiftEqual - } else { - TokenKind::LeftShift - } - } else if self.cursor.eat_char('=') { - TokenKind::LessEqual - } else { - TokenKind::Less - } + // Number that contains `_` separators. Remove them from the parsed text. + else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { + // Skip over `_` + self.cursor.bump(); + number.skip_char(); + } else { + break; } - '>' => { - if self.cursor.eat_char('>') { - if self.cursor.eat_char('=') { - TokenKind::RightShiftEqual - } else { - TokenKind::RightShift + } + } + + /// Lex a single comment. + fn lex_comment(&mut self) -> TokenKind { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), '#'); + + let bytes = self.cursor.rest().as_bytes(); + let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); + self.cursor.skip_bytes(offset); + + TokenKind::Comment + } + + /// Lex a single IPython escape command. + fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind { + let mut value = String::new(); + + loop { + match self.cursor.first() { + '\\' => { + // Only skip the line continuation if it is followed by a newline + // otherwise it is a normal backslash which is part of the magic command: + // + // Skip this backslash + // v + // !pwd \ + // && ls -a | sed 's/^/\\ /' + // ^^ + // Don't skip these backslashes + if self.cursor.second() == '\r' { + self.cursor.bump(); + self.cursor.bump(); + self.cursor.eat_char('\n'); + continue; + } else if self.cursor.second() == '\n' { + self.cursor.bump(); + self.cursor.bump(); + continue; } - } else if self.cursor.eat_char('=') { - TokenKind::GreaterEqual - } else { - TokenKind::Greater - } - } - ',' => TokenKind::Comma, - '.' => { - if self.cursor.first().is_ascii_digit() { - self.lex_decimal_number('.') - } else if self.cursor.eat_char2('.', '.') { - TokenKind::Ellipsis - } else { - TokenKind::Dot + + self.cursor.bump(); + value.push('\\'); } - } - '\n' => { - return if self.nesting == 0 && !self.state.is_new_logical_line() { - self.state = State::AfterNewline; - TokenKind::Newline - } else { - if let Some(fstring) = self.fstrings.current_mut() { - fstring.try_end_format_spec(self.nesting); + // Help end escape commands are those that end with 1 or 2 question marks. 
+                // Here, we're only looking for a subset of help end escape commands which
+                // are the ones that have the escape token at the start of the line as well.
+                // On the other hand, we're not looking for help end escape commands that
+                // are strict in the sense that the escape token is only at the end. For example,
+                //
+                // * `%foo?` is recognized as a help end escape command but not as a strict one.
+                // * `foo?` is recognized as a strict help end escape command which is not
+                //   lexed here but is identified at the parser level.
+                //
+                // Help end escape commands are implemented in the IPython codebase using regex:
+                // https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462
+                '?' => {
+                    self.cursor.bump();
+                    let mut question_count = 1u32;
+                    while self.cursor.eat_char('?') {
+                        question_count += 1;
+                    }
+
+                    // The original implementation in the IPython codebase is based on regex which
+                    // means that it's strict in the sense that it won't recognize a help end escape:
+                    // * If there's any whitespace before the escape token (e.g. `%foo ?`)
+                    // * If there are more than 2 question mark tokens (e.g. `%foo???`)
+                    // which is the behavior we follow here as well. In that case, we'll continue with
+                    // the prefixed escape token.
+                    //
+                    // Now, the whitespace and empty value check also makes sure that an empty
+                    // command (e.g. `%?` or `? ??`, no value after/between the escape tokens)
+                    // is not recognized as a help end escape command. So, `%?` and `? ??` are
+                    // `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??`
+                    // tokens.
+                    if question_count > 2
+                        || value.chars().last().map_or(true, is_python_whitespace)
+                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
+                    {
+                        // Not a help end escape command, so continue with the lexing.
+                        value.reserve(question_count as usize);
+                        for _ in 0..question_count {
+                            value.push('?');
+                        }
+                        continue;
+                    }
+
+                    if escape_kind.is_help() {
+                        // If we've recognized this as a help end escape command, then
+                        // any question mark token / whitespaces at the start are not
+                        // considered as part of the value.
+                        //
+                        // For example, `??foo?` is recognized as `IpyEscapeKind::Help` and
+                        // `value` is `foo` instead of `??foo`.
+                        value = value.trim_start_matches([' ', '?']).to_string();
+                    } else if escape_kind.is_magic() {
+                        // Between `%` and `?` (at the end), the `?` takes priority
+                        // over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help`
+                        // and `value` is `%foo` instead of `foo`. So, we need to
+                        // insert the magic escape token at the start.
+ value.insert_str(0, escape_kind.as_str()); + } - return self.push_error(LexicalError::new( - LexicalErrorType::UnrecognizedToken { tok: c }, - self.token_range(), - )); + let kind = match question_count { + 1 => IpyEscapeKind::Help, + 2 => IpyEscapeKind::Help2, + _ => unreachable!("`question_count` is always 1 or 2"), + }; + + self.current_value = TokenValue::IpyEscapeCommand { + kind, + value: value.into_boxed_str(), + }; + + return TokenKind::IpyEscapeCommand; + } + '\n' | '\r' | EOF_CHAR => { + self.current_value = TokenValue::IpyEscapeCommand { + kind: escape_kind, + value: value.into_boxed_str(), + }; + + return TokenKind::IpyEscapeCommand; + } + c => { + self.cursor.bump(); + value.push(c); + } } - }; + } + } - self.state = State::Other; + fn consume_end(&mut self) -> TokenKind { + // We reached end of file. + // First of all, we need all nestings to be finished. + if self.nesting > 0 { + // Reset the nesting to avoid going into infinite loop. + self.nesting = 0; + return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range())); + } - token + // Next, insert a trailing newline, if required. + if !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + TokenKind::Newline + } + // Next, flush the indentation stack to zero. + else if self.indentations.dedent().is_some() { + TokenKind::Dedent + } else { + TokenKind::EndOfFile + } } #[inline] @@ -1327,13 +1330,10 @@ impl<'src> Lexer<'src> { self.token_range().start() } - /// Takes the token value corresponding to the current token out of the lexer, replacing it - /// with the default value. - /// - /// All the subsequent call to this method without moving the lexer would always return the - /// default value which is [`TokenValue::None`]. - pub(crate) fn take_value(&mut self) -> TokenValue { - std::mem::take(&mut self.current_value) + /// Helper function to push the given error and return the [`TokenKind::Unknown`] token. + fn push_error(&mut self, error: LexicalError) -> TokenKind { + self.errors.push(error); + TokenKind::Unknown } /// Creates a checkpoint to which the lexer can later return to using [`Self::rewind`].