From 0578b17a4bf4718c8f740b20af073d47afe6bf2c Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Fri, 31 May 2024 08:51:49 +0530 Subject: [PATCH] Re-order lexer methods --- crates/ruff_python_parser/src/lexer.rs | 1628 ++++++++++++------------ 1 file changed, 814 insertions(+), 814 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 8933e4cb748a42..5b5bb3d213f0a6 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -125,431 +125,597 @@ impl<'src> Lexer<'src> { self.current_flags } - /// Helper function to push the given error and return the [`TokenKind::Unknown`] token. - fn push_error(&mut self, error: LexicalError) -> TokenKind { - self.errors.push(error); - TokenKind::Unknown + /// Takes the token value corresponding to the current token out of the lexer, replacing it + /// with the default value. + /// + /// All the subsequent call to this method without moving the lexer would always return the + /// default value which is [`TokenValue::None`]. + pub(crate) fn take_value(&mut self) -> TokenValue { + std::mem::take(&mut self.current_value) } - /// Try lexing the single character string prefix, updating the token flags accordingly. - /// Returns `true` if it matches. - fn try_single_char_prefix(&mut self, first: char) -> bool { - match first { - 'f' | 'F' => self.current_flags |= TokenFlags::F_STRING, - 'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING, - 'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING, - 'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE, - 'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE, - _ => return false, - } - true + /// Lex the next token. + pub fn next_token(&mut self) -> TokenKind { + self.cursor.start_token(); + self.current_value = TokenValue::None; + self.current_flags = TokenFlags::empty(); + self.current_kind = self.lex_token(); + self.current_range = self.token_range(); + self.current_kind } - /// Try lexing the double character string prefix, updating the token flags accordingly. - /// Returns `true` if it matches. - fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool { - match value { - ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => { - self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE; - } - ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => { - self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE; - } - ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => { - self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE; + fn lex_token(&mut self) -> TokenKind { + if let Some(fstring) = self.fstrings.current() { + if !fstring.is_in_expression(self.nesting) { + if let Some(token) = self.lex_fstring_middle_or_end() { + if matches!(token, TokenKind::FStringEnd) { + self.fstrings.pop(); + } + return token; + } } - ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => { - self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE; + } + // Return dedent tokens until the current indentation level matches the indentation of the next token. 
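+        //
+        // For example, the line `c` below closes two indented blocks at once,
+        // so two `Dedent` tokens are emitted before `c` is lexed:
+        //
+        // ```py
+        // if a:
+        //     if b:
+        //         pass
+        // c
+        // ```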
+ else if let Some(indentation) = self.pending_indentation.take() { + match self.indentations.current().try_compare(indentation) { + Ok(Ordering::Greater) => { + self.pending_indentation = Some(indentation); + if self.indentations.dedent_one(indentation).is_err() { + return self.push_error(LexicalError::new( + LexicalErrorType::IndentationError, + self.token_range(), + )); + } + return TokenKind::Dedent; + } + Ok(_) => {} + Err(_) => { + return self.push_error(LexicalError::new( + LexicalErrorType::IndentationError, + self.token_range(), + )); + } } - _ => return false, } - true - } - /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. - fn lex_identifier(&mut self, first: char) -> TokenKind { - // Detect potential string like rb'' b'' f'' u'' r'' - let quote = match (first, self.cursor.first()) { - (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| { - self.cursor.bump(); - quote - }), - (_, second) if is_quote(self.cursor.second()) => { - self.try_double_char_prefix([first, second]).then(|| { - self.cursor.bump(); - // SAFETY: Safe because of the `is_quote` check in this match arm's guard - self.cursor.bump().unwrap() - }) + if self.state.is_after_newline() { + if let Some(indentation) = self.eat_indentation() { + return indentation; } - _ => None, - }; - - if let Some(quote) = quote { - if self.current_flags.is_f_string() { - return self.lex_fstring_start(quote); + } else { + if let Err(error) = self.skip_whitespace() { + return self.push_error(error); } - - return self.lex_string(quote); } - // Keep track of whether the identifier is ASCII-only or not. - // - // This is important because Python applies NFKC normalization to - // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers. - // We need to therefore do the same in our lexer, but applying NFKC normalization - // unconditionally is extremely expensive. If we know an identifier is ASCII-only, - // (by far the most common case), we can skip NFKC normalization of the identifier. 
-        let mut is_ascii = first.is_ascii();
-        self.cursor
-            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
-
-        let text = self.token_text();
+        // The lexer might've skipped whitespaces, so update the start offset
+        self.cursor.start_token();
 
-        if !is_ascii {
-            self.current_value = TokenValue::Name(text.nfkc().collect::<String>().into_boxed_str());
-            return TokenKind::Name;
-        }
+        if let Some(c) = self.cursor.bump() {
+            if c.is_ascii() {
+                self.consume_ascii_character(c)
+            } else if is_unicode_identifier_start(c) {
+                let identifier = self.lex_identifier(c);
+                self.state = State::Other;
 
-        match text {
-            "False" => TokenKind::False,
-            "None" => TokenKind::None,
-            "True" => TokenKind::True,
-            "and" => TokenKind::And,
-            "as" => TokenKind::As,
-            "assert" => TokenKind::Assert,
-            "async" => TokenKind::Async,
-            "await" => TokenKind::Await,
-            "break" => TokenKind::Break,
-            "case" => TokenKind::Case,
-            "class" => TokenKind::Class,
-            "continue" => TokenKind::Continue,
-            "def" => TokenKind::Def,
-            "del" => TokenKind::Del,
-            "elif" => TokenKind::Elif,
-            "else" => TokenKind::Else,
-            "except" => TokenKind::Except,
-            "finally" => TokenKind::Finally,
-            "for" => TokenKind::For,
-            "from" => TokenKind::From,
-            "global" => TokenKind::Global,
-            "if" => TokenKind::If,
-            "import" => TokenKind::Import,
-            "in" => TokenKind::In,
-            "is" => TokenKind::Is,
-            "lambda" => TokenKind::Lambda,
-            "match" => TokenKind::Match,
-            "nonlocal" => TokenKind::Nonlocal,
-            "not" => TokenKind::Not,
-            "or" => TokenKind::Or,
-            "pass" => TokenKind::Pass,
-            "raise" => TokenKind::Raise,
-            "return" => TokenKind::Return,
-            "try" => TokenKind::Try,
-            "type" => TokenKind::Type,
-            "while" => TokenKind::While,
-            "with" => TokenKind::With,
-            "yield" => TokenKind::Yield,
-            _ => {
-                self.current_value = TokenValue::Name(text.to_string().into_boxed_str());
-                TokenKind::Name
+                identifier
+            } else {
+                self.push_error(LexicalError::new(
+                    LexicalErrorType::UnrecognizedToken { tok: c },
+                    self.token_range(),
+                ))
             }
+        } else {
+            // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line,
+            // empty the dedent stack, and finally, return the EndOfFile token.
+            self.consume_end()
         }
     }
 
-    /// Numeric lexing. The feast can start!
-    fn lex_number(&mut self, first: char) -> TokenKind {
-        if first == '0' {
-            if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() {
-                self.lex_number_radix(Radix::Hex)
-            } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() {
-                self.lex_number_radix(Radix::Octal)
-            } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() {
-                self.lex_number_radix(Radix::Binary)
-            } else {
-                self.lex_decimal_number(first)
+    fn eat_indentation(&mut self) -> Option<TokenKind> {
+        let mut indentation = Indentation::root();
+
+        loop {
+            match self.cursor.first() {
+                ' ' => {
+                    self.cursor.bump();
+                    indentation = indentation.add_space();
+                }
+                '\t' => {
+                    self.cursor.bump();
+                    indentation = indentation.add_tab();
+                }
+                '\\' => {
+                    self.cursor.bump();
+                    if self.cursor.eat_char('\r') {
+                        self.cursor.eat_char('\n');
+                    } else if self.cursor.is_eof() {
+                        return Some(self.push_error(LexicalError::new(
+                            LexicalErrorType::Eof,
+                            self.token_range(),
+                        )));
+                    } else if !self.cursor.eat_char('\n') {
+                        return Some(self.push_error(LexicalError::new(
+                            LexicalErrorType::LineContinuationError,
+                            self.token_range(),
+                        )));
+                    }
+                    indentation = Indentation::root();
+                }
+                // Form feed
+                '\x0C' => {
+                    self.cursor.bump();
+                    indentation = Indentation::root();
+                }
+                _ => break,
             }
-        } else {
-            self.lex_decimal_number(first)
         }
-    }
 
-    /// Lex a hex/octal/decimal/binary number without a decimal point.
-    fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
-        #[cfg(debug_assertions)]
-        debug_assert!(matches!(
-            self.cursor.previous().to_ascii_lowercase(),
-            'x' | 'o' | 'b'
-        ));
+        // Handle indentation if this is a new, non-blank logical line
+        if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
+            self.state = State::NonEmptyLogicalLine;
 
-        // Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`).
-        let mut number = LexedText::new(self.offset(), self.source);
-        self.radix_run(&mut number, radix);
+            // The state is now `NonEmptyLogicalLine`, so indentation won't be
+            // handled again on the next call.
+            return self.handle_indentation(indentation);
+        }
 
-        // Extract the entire number, including the base prefix (e.g., `0x9D5`).
-        let token = &self.source[self.token_range()];
+        None
+    }
 
-        let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) {
-            Ok(int) => int,
-            Err(err) => {
-                return self.push_error(LexicalError::new(
-                    LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()),
-                    self.token_range(),
-                ));
-            }
-        };
-        self.current_value = TokenValue::Int(value);
-        TokenKind::Int
-    }
-
-    /// Lex a normal number, that is, no octal, hex or binary number.
-    fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
-        #[cfg(debug_assertions)]
-        debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');
-        let start_is_zero = first_digit_or_dot == '0';
+    fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
+        let token = match self.indentations.current().try_compare(indentation) {
+            // Dedent
+            Ok(Ordering::Greater) => {
+                self.pending_indentation = Some(indentation);
 
-        let mut number = LexedText::new(self.token_start(), self.source);
-        if first_digit_or_dot != '.' {
-            number.push(first_digit_or_dot);
-            self.radix_run(&mut number, Radix::Decimal);
-        };
+                if self.indentations.dedent_one(indentation).is_err() {
+                    return Some(self.push_error(LexicalError::new(
+                        LexicalErrorType::IndentationError,
+                        self.token_range(),
+                    )));
+                };
 
-        let is_float = if first_digit_or_dot == '.'
|| self.cursor.eat_char('.') { - number.push('.'); + // The lexer might've eaten some whitespaces to calculate the `indentation`. For + // example: + // + // ```py + // if first: + // if second: + // pass + // foo + // # ^ + // ``` + // + // Here, the cursor is at `^` and the `indentation` contains the whitespaces before + // the `pass` token. + self.cursor.start_token(); - if self.cursor.eat_char('_') { - return self.push_error(LexicalError::new( - LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()), - TextRange::new(self.offset() - TextSize::new(1), self.offset()), - )); + Some(TokenKind::Dedent) } - self.radix_run(&mut number, Radix::Decimal); - true - } else { - // Normal number: - false - }; - - let is_float = match self.cursor.rest().as_bytes() { - [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => { - // 'e' | 'E' - number.push(self.cursor.bump().unwrap()); - - if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) { - number.push(sign); - } - - self.radix_run(&mut number, Radix::Decimal); + Ok(Ordering::Equal) => None, - true + // Indent + Ok(Ordering::Less) => { + self.indentations.indent(indentation); + Some(TokenKind::Indent) + } + Err(_) => { + return Some(self.push_error(LexicalError::new( + LexicalErrorType::IndentationError, + self.token_range(), + ))); } - _ => is_float, }; - if is_float { - // Improvement: Use `Cow` instead of pushing to value text - let Ok(value) = f64::from_str(number.as_str()) else { - return self.push_error(LexicalError::new( - LexicalErrorType::OtherError( - "Invalid decimal literal".to_string().into_boxed_str(), - ), - self.token_range(), - )); - }; + token + } - // Parse trailing 'j': - if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { - self.current_value = TokenValue::Complex { - real: 0.0, - imag: value, - }; - TokenKind::Complex - } else { - self.current_value = TokenValue::Float(value); - TokenKind::Float - } - } else { - // Parse trailing 'j': - if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { - let imag = f64::from_str(number.as_str()).unwrap(); - self.current_value = TokenValue::Complex { real: 0.0, imag }; - TokenKind::Complex - } else { - let value = match Int::from_str(number.as_str()) { - Ok(value) => { - if start_is_zero && value.as_u8() != Some(0) { - // Leading zeros in decimal integer literals are not permitted. 
- return self.push_error(LexicalError::new( - LexicalErrorType::OtherError( - "Invalid decimal integer literal" - .to_string() - .into_boxed_str(), - ), - self.token_range(), - )); - } - value - } - Err(err) => { - return self.push_error(LexicalError::new( - LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()), + fn skip_whitespace(&mut self) -> Result<(), LexicalError> { + loop { + match self.cursor.first() { + ' ' => { + self.cursor.bump(); + } + '\t' => { + self.cursor.bump(); + } + '\\' => { + self.cursor.bump(); + if self.cursor.eat_char('\r') { + self.cursor.eat_char('\n'); + } else if self.cursor.is_eof() { + return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range())); + } else if !self.cursor.eat_char('\n') { + return Err(LexicalError::new( + LexicalErrorType::LineContinuationError, self.token_range(), - )) + )); } - }; - self.current_value = TokenValue::Int(value); - TokenKind::Int + } + // Form feed + '\x0C' => { + self.cursor.bump(); + } + _ => break, } } + + Ok(()) } - /// Consume a sequence of numbers with the given radix, - /// the digits can be decorated with underscores - /// like this: '`1_2_3_4`' == '1234' - fn radix_run(&mut self, number: &mut LexedText, radix: Radix) { - loop { - if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { - number.push(c); + // Dispatch based on the given character. + fn consume_ascii_character(&mut self, c: char) -> TokenKind { + let token = match c { + c if is_ascii_identifier_start(c) => self.lex_identifier(c), + '0'..='9' => self.lex_number(c), + '#' => return self.lex_comment(), + '\'' | '"' => self.lex_string(c), + '=' => { + if self.cursor.eat_char('=') { + TokenKind::EqEqual + } else { + self.state = State::AfterEqual; + return TokenKind::Equal; + } } - // Number that contains `_` separators. Remove them from the parsed text. - else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { - // Skip over `_` - self.cursor.bump(); - number.skip_char(); - } else { - break; + '+' => { + if self.cursor.eat_char('=') { + TokenKind::PlusEqual + } else { + TokenKind::Plus + } + } + '*' => { + if self.cursor.eat_char('=') { + TokenKind::StarEqual + } else if self.cursor.eat_char('*') { + if self.cursor.eat_char('=') { + TokenKind::DoubleStarEqual + } else { + TokenKind::DoubleStar + } + } else { + TokenKind::Star + } } - } - } - /// Lex a single comment. - fn lex_comment(&mut self) -> TokenKind { - #[cfg(debug_assertions)] - debug_assert_eq!(self.cursor.previous(), '#'); + c @ ('%' | '!') + if self.mode == Mode::Ipython + && self.state.is_after_equal() + && self.nesting == 0 => + { + // SAFETY: Safe because `c` has been matched against one of the possible escape command token + self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap()) + } - let bytes = self.cursor.rest().as_bytes(); - let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); - self.cursor.skip_bytes(offset); + c @ ('%' | '!' | '?' | '/' | ';' | ',') + if self.mode == Mode::Ipython && self.state.is_new_logical_line() => + { + let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) { + self.cursor.bump(); + kind + } else { + // SAFETY: Safe because `c` has been matched against one of the possible escape command token + IpyEscapeKind::try_from(c).unwrap() + }; - TokenKind::Comment - } + self.lex_ipython_escape_command(kind) + } - /// Lex a single IPython escape command. 
- fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind { - let mut value = String::new(); + '?' if self.mode == Mode::Ipython => TokenKind::Question, - loop { - match self.cursor.first() { - '\\' => { - // Only skip the line continuation if it is followed by a newline - // otherwise it is a normal backslash which is part of the magic command: - // - // Skip this backslash - // v - // !pwd \ - // && ls -a | sed 's/^/\\ /' - // ^^ - // Don't skip these backslashes - if self.cursor.second() == '\r' { - self.cursor.bump(); - self.cursor.bump(); - self.cursor.eat_char('\n'); - continue; - } else if self.cursor.second() == '\n' { - self.cursor.bump(); - self.cursor.bump(); - continue; + '/' => { + if self.cursor.eat_char('=') { + TokenKind::SlashEqual + } else if self.cursor.eat_char('/') { + if self.cursor.eat_char('=') { + TokenKind::DoubleSlashEqual + } else { + TokenKind::DoubleSlash } - - self.cursor.bump(); - value.push('\\'); + } else { + TokenKind::Slash } - // Help end escape commands are those that end with 1 or 2 question marks. - // Here, we're only looking for a subset of help end escape commands which - // are the ones that has the escape token at the start of the line as well. - // On the other hand, we're not looking for help end escape commands that - // are strict in the sense that the escape token is only at the end. For example, - // - // * `%foo?` is recognized as a help end escape command but not as a strict one. - // * `foo?` is recognized as a strict help end escape command which is not - // lexed here but is identified at the parser level. - // - // Help end escape commands implemented in the IPython codebase using regex: - // https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462 - '?' => { - self.cursor.bump(); - let mut question_count = 1u32; - while self.cursor.eat_char('?') { - question_count += 1; + } + '%' => { + if self.cursor.eat_char('=') { + TokenKind::PercentEqual + } else { + TokenKind::Percent + } + } + '|' => { + if self.cursor.eat_char('=') { + TokenKind::VbarEqual + } else { + TokenKind::Vbar + } + } + '^' => { + if self.cursor.eat_char('=') { + TokenKind::CircumflexEqual + } else { + TokenKind::CircumFlex + } + } + '&' => { + if self.cursor.eat_char('=') { + TokenKind::AmperEqual + } else { + TokenKind::Amper + } + } + '-' => { + if self.cursor.eat_char('=') { + TokenKind::MinusEqual + } else if self.cursor.eat_char('>') { + TokenKind::Rarrow + } else { + TokenKind::Minus + } + } + '@' => { + if self.cursor.eat_char('=') { + TokenKind::AtEqual + } else { + TokenKind::At + } + } + '!' 
=> { + if self.cursor.eat_char('=') { + TokenKind::NotEqual + } else { + TokenKind::Exclamation + } + } + '~' => TokenKind::Tilde, + '(' => { + self.nesting += 1; + TokenKind::Lpar + } + ')' => { + self.nesting = self.nesting.saturating_sub(1); + TokenKind::Rpar + } + '[' => { + self.nesting += 1; + TokenKind::Lsqb + } + ']' => { + self.nesting = self.nesting.saturating_sub(1); + TokenKind::Rsqb + } + '{' => { + self.nesting += 1; + TokenKind::Lbrace + } + '}' => { + if let Some(fstring) = self.fstrings.current_mut() { + if fstring.nesting() == self.nesting { + return self.push_error(LexicalError::new( + LexicalErrorType::FStringError(FStringErrorType::SingleRbrace), + self.token_range(), + )); } - - // The original implementation in the IPython codebase is based on regex which - // means that it's strict in the sense that it won't recognize a help end escape: - // * If there's any whitespace before the escape token (e.g. `%foo ?`) - // * If there are more than 2 question mark tokens (e.g. `%foo???`) - // which is what we're doing here as well. In that case, we'll continue with - // the prefixed escape token. - // - // Now, the whitespace and empty value check also makes sure that an empty - // command (e.g. `%?` or `? ??`, no value after/between the escape tokens) - // is not recognized as a help end escape command. So, `%?` and `? ??` are - // `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??` - // tokens. - if question_count > 2 - || value.chars().last().map_or(true, is_python_whitespace) - || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR) - { - // Not a help end escape command, so continue with the lexing. - value.reserve(question_count as usize); - for _ in 0..question_count { - value.push('?'); - } - continue; + fstring.try_end_format_spec(self.nesting); + } + self.nesting = self.nesting.saturating_sub(1); + TokenKind::Rbrace + } + ':' => { + if self + .fstrings + .current_mut() + .is_some_and(|fstring| fstring.try_start_format_spec(self.nesting)) + { + TokenKind::Colon + } else if self.cursor.eat_char('=') { + TokenKind::ColonEqual + } else { + TokenKind::Colon + } + } + ';' => TokenKind::Semi, + '<' => { + if self.cursor.eat_char('<') { + if self.cursor.eat_char('=') { + TokenKind::LeftShiftEqual + } else { + TokenKind::LeftShift + } + } else if self.cursor.eat_char('=') { + TokenKind::LessEqual + } else { + TokenKind::Less + } + } + '>' => { + if self.cursor.eat_char('>') { + if self.cursor.eat_char('=') { + TokenKind::RightShiftEqual + } else { + TokenKind::RightShift + } + } else if self.cursor.eat_char('=') { + TokenKind::GreaterEqual + } else { + TokenKind::Greater + } + } + ',' => TokenKind::Comma, + '.' => { + if self.cursor.first().is_ascii_digit() { + self.lex_decimal_number('.') + } else if self.cursor.eat_char2('.', '.') { + TokenKind::Ellipsis + } else { + TokenKind::Dot + } + } + '\n' => { + return if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + TokenKind::Newline + } else { + if let Some(fstring) = self.fstrings.current_mut() { + fstring.try_end_format_spec(self.nesting); } + TokenKind::NonLogicalNewline + } + } + '\r' => { + self.cursor.eat_char('\n'); - if escape_kind.is_help() { - // If we've recognize this as a help end escape command, then - // any question mark token / whitespaces at the start are not - // considered as part of the value. - // - // For example, `??foo?` is recognized as `IpyEscapeKind::Help` and - // `value` is `foo` instead of `??foo`. 
- value = value.trim_start_matches([' ', '?']).to_string(); - } else if escape_kind.is_magic() { - // Between `%` and `?` (at the end), the `?` takes priority - // over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help` - // and `value` is `%foo` instead of `foo`. So, we need to - // insert the magic escape token at the start. - value.insert_str(0, escape_kind.as_str()); + return if self.nesting == 0 && !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + TokenKind::Newline + } else { + if let Some(fstring) = self.fstrings.current_mut() { + fstring.try_end_format_spec(self.nesting); } + TokenKind::NonLogicalNewline + }; + } - let kind = match question_count { - 1 => IpyEscapeKind::Help, - 2 => IpyEscapeKind::Help2, - _ => unreachable!("`question_count` is always 1 or 2"), - }; + _ => { + self.state = State::Other; - self.current_value = TokenValue::IpyEscapeCommand { - kind, - value: value.into_boxed_str(), - }; + return self.push_error(LexicalError::new( + LexicalErrorType::UnrecognizedToken { tok: c }, + self.token_range(), + )); + } + }; - return TokenKind::IpyEscapeCommand; - } - '\n' | '\r' | EOF_CHAR => { - self.current_value = TokenValue::IpyEscapeCommand { - kind: escape_kind, - value: value.into_boxed_str(), - }; + self.state = State::Other; + + token + } + + /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. + fn lex_identifier(&mut self, first: char) -> TokenKind { + // Detect potential string like rb'' b'' f'' u'' r'' + let quote = match (first, self.cursor.first()) { + (_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| { + self.cursor.bump(); + quote + }), + (_, second) if is_quote(self.cursor.second()) => { + self.try_double_char_prefix([first, second]).then(|| { + self.cursor.bump(); + // SAFETY: Safe because of the `is_quote` check in this match arm's guard + self.cursor.bump().unwrap() + }) + } + _ => None, + }; + + if let Some(quote) = quote { + if self.current_flags.is_f_string() { + return self.lex_fstring_start(quote); + } + + return self.lex_string(quote); + } + + // Keep track of whether the identifier is ASCII-only or not. + // + // This is important because Python applies NFKC normalization to + // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers. + // We need to therefore do the same in our lexer, but applying NFKC normalization + // unconditionally is extremely expensive. If we know an identifier is ASCII-only, + // (by far the most common case), we can skip NFKC normalization of the identifier. 
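+        //
+        // As an example of what NFKC folding does: it maps the ligature `ﬁ`
+        // (U+FB01) to `fi`, so the identifiers `ﬁnd` and `find` name the
+        // same binding.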
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
+
+        let text = self.token_text();
+
+        if !is_ascii {
+            self.current_value = TokenValue::Name(text.nfkc().collect::<String>().into_boxed_str());
+            return TokenKind::Name;
+        }
+
+        match text {
+            "False" => TokenKind::False,
+            "None" => TokenKind::None,
+            "True" => TokenKind::True,
+            "and" => TokenKind::And,
+            "as" => TokenKind::As,
+            "assert" => TokenKind::Assert,
+            "async" => TokenKind::Async,
+            "await" => TokenKind::Await,
+            "break" => TokenKind::Break,
+            "case" => TokenKind::Case,
+            "class" => TokenKind::Class,
+            "continue" => TokenKind::Continue,
+            "def" => TokenKind::Def,
+            "del" => TokenKind::Del,
+            "elif" => TokenKind::Elif,
+            "else" => TokenKind::Else,
+            "except" => TokenKind::Except,
+            "finally" => TokenKind::Finally,
+            "for" => TokenKind::For,
+            "from" => TokenKind::From,
+            "global" => TokenKind::Global,
+            "if" => TokenKind::If,
+            "import" => TokenKind::Import,
+            "in" => TokenKind::In,
+            "is" => TokenKind::Is,
+            "lambda" => TokenKind::Lambda,
+            "match" => TokenKind::Match,
+            "nonlocal" => TokenKind::Nonlocal,
+            "not" => TokenKind::Not,
+            "or" => TokenKind::Or,
+            "pass" => TokenKind::Pass,
+            "raise" => TokenKind::Raise,
+            "return" => TokenKind::Return,
+            "try" => TokenKind::Try,
+            "type" => TokenKind::Type,
+            "while" => TokenKind::While,
+            "with" => TokenKind::With,
+            "yield" => TokenKind::Yield,
+            _ => {
+                self.current_value = TokenValue::Name(text.to_string().into_boxed_str());
+                TokenKind::Name
+            }
+        }
+    }
 
+    /// Try lexing the single character string prefix, updating the token flags accordingly.
+    /// Returns `true` if it matches.
+    fn try_single_char_prefix(&mut self, first: char) -> bool {
+        match first {
+            'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
+            'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
+            'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
+            'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
+            'R' => self.current_flags |= TokenFlags::RAW_STRING_UPPERCASE,
+            _ => return false,
+        }
+        true
+    }
+
+    /// Try lexing the double character string prefix, updating the token flags accordingly.
+    /// Returns `true` if it matches.
+    fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
+        match value {
+            ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
+                self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE;
+            }
+            ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
+                self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
+            }
+            ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
+                self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
+            }
+            ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
+                self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE;
+            }
+            _ => return false,
+        }
+        true
+    }
 
     /// Lex a f-string start token.
@@ -704,8 +870,8 @@ impl<'src> Lexer<'src> {
         };
         self.current_value = TokenValue::FStringMiddle(value.into_boxed_str());
-        self.current_flags = fstring.flags();
 
+        self.current_flags = fstring.flags();
         Some(TokenKind::FStringMiddle)
     }
 
@@ -820,485 +986,322 @@ impl<'src> Lexer<'src> {
         TokenKind::String
     }
 
-    /// Lex the next token.
- pub fn next_token(&mut self) -> TokenKind { - self.cursor.start_token(); - self.current_value = TokenValue::None; - self.current_flags = TokenFlags::empty(); - self.current_kind = self.lex_token(); - self.current_range = self.token_range(); - self.current_kind - } - - fn lex_token(&mut self) -> TokenKind { - if let Some(fstring) = self.fstrings.current() { - if !fstring.is_in_expression(self.nesting) { - if let Some(token) = self.lex_fstring_middle_or_end() { - if matches!(token, TokenKind::FStringEnd) { - self.fstrings.pop(); - } - return token; - } - } - } - // Return dedent tokens until the current indentation level matches the indentation of the next token. - else if let Some(indentation) = self.pending_indentation.take() { - match self.indentations.current().try_compare(indentation) { - Ok(Ordering::Greater) => { - self.pending_indentation = Some(indentation); - if self.indentations.dedent_one(indentation).is_err() { - return self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, - self.token_range(), - )); - } - return TokenKind::Dedent; - } - Ok(_) => {} - Err(_) => { - return self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, - self.token_range(), - )); - } - } - } - - if self.state.is_after_newline() { - if let Some(indentation) = self.eat_indentation() { - return indentation; - } - } else { - if let Err(error) = self.skip_whitespace() { - return self.push_error(error); - } - } - - // The lexer might've skipped whitespaces, so update the start offset - self.cursor.start_token(); - - if let Some(c) = self.cursor.bump() { - if c.is_ascii() { - self.consume_ascii_character(c) - } else if is_unicode_identifier_start(c) { - let identifier = self.lex_identifier(c); - self.state = State::Other; - - identifier + /// Numeric lexing. The feast can start! + fn lex_number(&mut self, first: char) -> TokenKind { + if first == '0' { + if self.cursor.eat_if(|c| matches!(c, 'x' | 'X')).is_some() { + self.lex_number_radix(Radix::Hex) + } else if self.cursor.eat_if(|c| matches!(c, 'o' | 'O')).is_some() { + self.lex_number_radix(Radix::Octal) + } else if self.cursor.eat_if(|c| matches!(c, 'b' | 'B')).is_some() { + self.lex_number_radix(Radix::Binary) } else { - self.push_error(LexicalError::new( - LexicalErrorType::UnrecognizedToken { tok: c }, - self.token_range(), - )) + self.lex_decimal_number(first) } } else { - // Reached the end of the file. Emit a trailing newline token if not at the beginning of a logical line, - // empty the dedent stack, and finally, return the EndOfFile token. 
- self.consume_end() - } - } - - fn skip_whitespace(&mut self) -> Result<(), LexicalError> { - loop { - match self.cursor.first() { - ' ' => { - self.cursor.bump(); - } - '\t' => { - self.cursor.bump(); - } - '\\' => { - self.cursor.bump(); - if self.cursor.eat_char('\r') { - self.cursor.eat_char('\n'); - } else if self.cursor.is_eof() { - return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range())); - } else if !self.cursor.eat_char('\n') { - return Err(LexicalError::new( - LexicalErrorType::LineContinuationError, - self.token_range(), - )); - } - } - // Form feed - '\x0C' => { - self.cursor.bump(); - } - _ => break, - } - } - - Ok(()) - } - - fn eat_indentation(&mut self) -> Option { - let mut indentation = Indentation::root(); - - loop { - match self.cursor.first() { - ' ' => { - self.cursor.bump(); - indentation = indentation.add_space(); - } - '\t' => { - self.cursor.bump(); - indentation = indentation.add_tab(); - } - '\\' => { - self.cursor.bump(); - if self.cursor.eat_char('\r') { - self.cursor.eat_char('\n'); - } else if self.cursor.is_eof() { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::Eof, - self.token_range(), - ))); - } else if !self.cursor.eat_char('\n') { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::LineContinuationError, - self.token_range(), - ))); - } - indentation = Indentation::root(); - } - // Form feed - '\x0C' => { - self.cursor.bump(); - indentation = Indentation::root(); - } - _ => break, - } - } - - // Handle indentation if this is a new, not all empty, logical line - if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) { - self.state = State::NonEmptyLogicalLine; - - // Set to false so that we don't handle indentation on the next call. - return self.handle_indentation(indentation); + self.lex_decimal_number(first) } - - None } - fn handle_indentation(&mut self, indentation: Indentation) -> Option { - let token = match self.indentations.current().try_compare(indentation) { - // Dedent - Ok(Ordering::Greater) => { - self.pending_indentation = Some(indentation); - - if self.indentations.dedent_one(indentation).is_err() { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, - self.token_range(), - ))); - }; - - // The lexer might've eaten some whitespaces to calculate the `indentation`. For - // example: - // - // ```py - // if first: - // if second: - // pass - // foo - // # ^ - // ``` - // - // Here, the cursor is at `^` and the `indentation` contains the whitespaces before - // the `pass` token. - self.cursor.start_token(); - - Some(TokenKind::Dedent) - } + /// Lex a hex/octal/decimal/binary number without a decimal point. + fn lex_number_radix(&mut self, radix: Radix) -> TokenKind { + #[cfg(debug_assertions)] + debug_assert!(matches!( + self.cursor.previous().to_ascii_lowercase(), + 'x' | 'o' | 'b' + )); - Ok(Ordering::Equal) => None, + // Lex the portion of the token after the base prefix (e.g., `9D5` in `0x9D5`). + let mut number = LexedText::new(self.offset(), self.source); + self.radix_run(&mut number, radix); - // Indent - Ok(Ordering::Less) => { - self.indentations.indent(indentation); - Some(TokenKind::Indent) - } - Err(_) => { - return Some(self.push_error(LexicalError::new( - LexicalErrorType::IndentationError, + // Extract the entire number, including the base prefix (e.g., `0x9D5`). 
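+        // The base prefix and any `_` separators never reach `number`: for
+        // `0x_9D_5`, `number` holds the digits `9D5` while `token` keeps the
+        // full original text.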
+ let token = &self.source[self.token_range()]; + + let value = match Int::from_str_radix(number.as_str(), radix.as_u32(), token) { + Ok(int) => int, + Err(err) => { + return self.push_error(LexicalError::new( + LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()), self.token_range(), - ))); + )); } }; - - token + self.current_value = TokenValue::Int(value); + TokenKind::Int } - fn consume_end(&mut self) -> TokenKind { - // We reached end of file. - // First of all, we need all nestings to be finished. - if self.nesting > 0 { - // Reset the nesting to avoid going into infinite loop. - self.nesting = 0; - return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range())); - } + /// Lex a normal number, that is, no octal, hex or binary number. + fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind { + #[cfg(debug_assertions)] + debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.'); + let start_is_zero = first_digit_or_dot == '0'; - // Next, insert a trailing newline, if required. - if !self.state.is_new_logical_line() { - self.state = State::AfterNewline; - TokenKind::Newline - } - // Next, flush the indentation stack to zero. - else if self.indentations.dedent().is_some() { - TokenKind::Dedent - } else { - TokenKind::EndOfFile - } - } + let mut number = LexedText::new(self.token_start(), self.source); + if first_digit_or_dot != '.' { + number.push(first_digit_or_dot); + self.radix_run(&mut number, Radix::Decimal); + }; - // Dispatch based on the given character. - fn consume_ascii_character(&mut self, c: char) -> TokenKind { - let token = match c { - c if is_ascii_identifier_start(c) => self.lex_identifier(c), - '0'..='9' => self.lex_number(c), - '#' => return self.lex_comment(), - '\'' | '"' => self.lex_string(c), - '=' => { - if self.cursor.eat_char('=') { - TokenKind::EqEqual - } else { - self.state = State::AfterEqual; - return TokenKind::Equal; - } - } - '+' => { - if self.cursor.eat_char('=') { - TokenKind::PlusEqual - } else { - TokenKind::Plus - } - } - '*' => { - if self.cursor.eat_char('=') { - TokenKind::StarEqual - } else if self.cursor.eat_char('*') { - if self.cursor.eat_char('=') { - TokenKind::DoubleStarEqual - } else { - TokenKind::DoubleStar - } - } else { - TokenKind::Star - } - } + let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') { + number.push('.'); - c @ ('%' | '!') - if self.mode == Mode::Ipython - && self.state.is_after_equal() - && self.nesting == 0 => - { - // SAFETY: Safe because `c` has been matched against one of the possible escape command token - self.lex_ipython_escape_command(IpyEscapeKind::try_from(c).unwrap()) + if self.cursor.eat_char('_') { + return self.push_error(LexicalError::new( + LexicalErrorType::OtherError("Invalid Syntax".to_string().into_boxed_str()), + TextRange::new(self.offset() - TextSize::new(1), self.offset()), + )); } - c @ ('%' | '!' | '?' | '/' | ';' | ',') - if self.mode == Mode::Ipython && self.state.is_new_logical_line() => - { - let kind = if let Ok(kind) = IpyEscapeKind::try_from([c, self.cursor.first()]) { - self.cursor.bump(); - kind - } else { - // SAFETY: Safe because `c` has been matched against one of the possible escape command token - IpyEscapeKind::try_from(c).unwrap() - }; - - self.lex_ipython_escape_command(kind) - } + self.radix_run(&mut number, Radix::Decimal); + true + } else { + // Normal number: + false + }; - '?' 
if self.mode == Mode::Ipython => TokenKind::Question, + let is_float = match self.cursor.rest().as_bytes() { + [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => { + // 'e' | 'E' + number.push(self.cursor.bump().unwrap()); - '/' => { - if self.cursor.eat_char('=') { - TokenKind::SlashEqual - } else if self.cursor.eat_char('/') { - if self.cursor.eat_char('=') { - TokenKind::DoubleSlashEqual - } else { - TokenKind::DoubleSlash - } - } else { - TokenKind::Slash - } - } - '%' => { - if self.cursor.eat_char('=') { - TokenKind::PercentEqual - } else { - TokenKind::Percent - } - } - '|' => { - if self.cursor.eat_char('=') { - TokenKind::VbarEqual - } else { - TokenKind::Vbar - } - } - '^' => { - if self.cursor.eat_char('=') { - TokenKind::CircumflexEqual - } else { - TokenKind::CircumFlex - } - } - '&' => { - if self.cursor.eat_char('=') { - TokenKind::AmperEqual - } else { - TokenKind::Amper - } - } - '-' => { - if self.cursor.eat_char('=') { - TokenKind::MinusEqual - } else if self.cursor.eat_char('>') { - TokenKind::Rarrow - } else { - TokenKind::Minus - } - } - '@' => { - if self.cursor.eat_char('=') { - TokenKind::AtEqual - } else { - TokenKind::At - } - } - '!' => { - if self.cursor.eat_char('=') { - TokenKind::NotEqual - } else { - TokenKind::Exclamation + if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) { + number.push(sign); } + + self.radix_run(&mut number, Radix::Decimal); + + true } - '~' => TokenKind::Tilde, - '(' => { - self.nesting += 1; - TokenKind::Lpar - } - ')' => { - self.nesting = self.nesting.saturating_sub(1); - TokenKind::Rpar - } - '[' => { - self.nesting += 1; - TokenKind::Lsqb - } - ']' => { - self.nesting = self.nesting.saturating_sub(1); - TokenKind::Rsqb - } - '{' => { - self.nesting += 1; - TokenKind::Lbrace + _ => is_float, + }; + + if is_float { + // Improvement: Use `Cow` instead of pushing to value text + let Ok(value) = f64::from_str(number.as_str()) else { + return self.push_error(LexicalError::new( + LexicalErrorType::OtherError( + "Invalid decimal literal".to_string().into_boxed_str(), + ), + self.token_range(), + )); + }; + + // Parse trailing 'j': + if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { + self.current_value = TokenValue::Complex { + real: 0.0, + imag: value, + }; + TokenKind::Complex + } else { + self.current_value = TokenValue::Float(value); + TokenKind::Float } - '}' => { - if let Some(fstring) = self.fstrings.current_mut() { - if fstring.nesting() == self.nesting { + } else { + // Parse trailing 'j': + if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { + let imag = f64::from_str(number.as_str()).unwrap(); + self.current_value = TokenValue::Complex { real: 0.0, imag }; + TokenKind::Complex + } else { + let value = match Int::from_str(number.as_str()) { + Ok(value) => { + if start_is_zero && value.as_u8() != Some(0) { + // Leading zeros in decimal integer literals are not permitted. 
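+                            // For example, `0`, `00`, and `0_0` all lex as
+                            // integer zero, but `01` is rejected here because
+                            // its value is non-zero.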
+ return self.push_error(LexicalError::new( + LexicalErrorType::OtherError( + "Invalid decimal integer literal" + .to_string() + .into_boxed_str(), + ), + self.token_range(), + )); + } + value + } + Err(err) => { return self.push_error(LexicalError::new( - LexicalErrorType::FStringError(FStringErrorType::SingleRbrace), + LexicalErrorType::OtherError(format!("{err:?}").into_boxed_str()), self.token_range(), - )); + )) } - fstring.try_end_format_spec(self.nesting); - } - self.nesting = self.nesting.saturating_sub(1); - TokenKind::Rbrace + }; + self.current_value = TokenValue::Int(value); + TokenKind::Int } - ':' => { - if self - .fstrings - .current_mut() - .is_some_and(|fstring| fstring.try_start_format_spec(self.nesting)) - { - TokenKind::Colon - } else if self.cursor.eat_char('=') { - TokenKind::ColonEqual - } else { - TokenKind::Colon - } + } + } + + /// Consume a sequence of numbers with the given radix, + /// the digits can be decorated with underscores + /// like this: '`1_2_3_4`' == '1234' + fn radix_run(&mut self, number: &mut LexedText, radix: Radix) { + loop { + if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { + number.push(c); } - ';' => TokenKind::Semi, - '<' => { - if self.cursor.eat_char('<') { - if self.cursor.eat_char('=') { - TokenKind::LeftShiftEqual - } else { - TokenKind::LeftShift - } - } else if self.cursor.eat_char('=') { - TokenKind::LessEqual - } else { - TokenKind::Less - } + // Number that contains `_` separators. Remove them from the parsed text. + else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { + // Skip over `_` + self.cursor.bump(); + number.skip_char(); + } else { + break; } - '>' => { - if self.cursor.eat_char('>') { - if self.cursor.eat_char('=') { - TokenKind::RightShiftEqual - } else { - TokenKind::RightShift + } + } + + /// Lex a single comment. + fn lex_comment(&mut self) -> TokenKind { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), '#'); + + let bytes = self.cursor.rest().as_bytes(); + let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); + self.cursor.skip_bytes(offset); + + TokenKind::Comment + } + + /// Lex a single IPython escape command. + fn lex_ipython_escape_command(&mut self, escape_kind: IpyEscapeKind) -> TokenKind { + let mut value = String::new(); + + loop { + match self.cursor.first() { + '\\' => { + // Only skip the line continuation if it is followed by a newline + // otherwise it is a normal backslash which is part of the magic command: + // + // Skip this backslash + // v + // !pwd \ + // && ls -a | sed 's/^/\\ /' + // ^^ + // Don't skip these backslashes + if self.cursor.second() == '\r' { + self.cursor.bump(); + self.cursor.bump(); + self.cursor.eat_char('\n'); + continue; + } else if self.cursor.second() == '\n' { + self.cursor.bump(); + self.cursor.bump(); + continue; } - } else if self.cursor.eat_char('=') { - TokenKind::GreaterEqual - } else { - TokenKind::Greater - } - } - ',' => TokenKind::Comma, - '.' => { - if self.cursor.first().is_ascii_digit() { - self.lex_decimal_number('.') - } else if self.cursor.eat_char2('.', '.') { - TokenKind::Ellipsis - } else { - TokenKind::Dot + + self.cursor.bump(); + value.push('\\'); } - } - '\n' => { - return if self.nesting == 0 && !self.state.is_new_logical_line() { - self.state = State::AfterNewline; - TokenKind::Newline - } else { - if let Some(fstring) = self.fstrings.current_mut() { - fstring.try_end_format_spec(self.nesting); + // Help end escape commands are those that end with 1 or 2 question marks. 
+                // Here, we're only looking for a subset of help end escape commands which
+                // are the ones that have the escape token at the start of the line as well.
+                // On the other hand, we're not looking for help end escape commands that
+                // are strict in the sense that the escape token is only at the end. For example,
+                //
+                // * `%foo?` is recognized as a help end escape command but not as a strict one.
+                // * `foo?` is recognized as a strict help end escape command which is not
+                //   lexed here but is identified at the parser level.
+                //
+                // Help end escape commands are implemented in the IPython codebase using regex:
+                // https://github.com/ipython/ipython/blob/292e3a23459ca965b8c1bfe2c3707044c510209a/IPython/core/inputtransformer2.py#L454-L462
+                '?' => {
+                    self.cursor.bump();
+                    let mut question_count = 1u32;
+                    while self.cursor.eat_char('?') {
+                        question_count += 1;
+                    }
+
+                    // The original implementation in the IPython codebase is based on regex which
+                    // means that it's strict in the sense that it won't recognize a help end escape:
+                    // * If there's any whitespace before the escape token (e.g. `%foo ?`)
+                    // * If there are more than 2 question mark tokens (e.g. `%foo???`)
+                    // which is the behavior we follow here as well. In that case, we'll continue with
+                    // the prefixed escape token.
+                    //
+                    // Now, the whitespace and empty value check also makes sure that an empty
+                    // command (e.g. `%?` or `? ??`, no value after/between the escape tokens)
+                    // is not recognized as a help end escape command. So, `%?` and `? ??` are
+                    // `IpyEscapeKind::Magic` and `IpyEscapeKind::Help` because of the initial `%` and `??`
+                    // tokens.
+                    if question_count > 2
+                        || value.chars().last().map_or(true, is_python_whitespace)
+                        || !matches!(self.cursor.first(), '\n' | '\r' | EOF_CHAR)
+                    {
+                        // Not a help end escape command, so continue with the lexing.
+                        value.reserve(question_count as usize);
+                        for _ in 0..question_count {
+                            value.push('?');
+                        }
+                        continue;
+                    }
+
+                    if escape_kind.is_help() {
+                        // If we've recognized this as a help end escape command, then
+                        // any question mark token / whitespaces at the start are not
+                        // considered as part of the value.
+                        //
+                        // For example, `??foo?` is recognized as `IpyEscapeKind::Help` and
+                        // `value` is `foo` instead of `??foo`.
+                        value = value.trim_start_matches([' ', '?']).to_string();
+                    } else if escape_kind.is_magic() {
+                        // Between `%` and `?` (at the end), the `?` takes priority
+                        // over the `%` so `%foo?` is recognized as `IpyEscapeKind::Help`
+                        // and `value` is `%foo` instead of `foo`. So, we need to
+                        // insert the magic escape token at the start.
+ value.insert_str(0, escape_kind.as_str()); + } - return self.push_error(LexicalError::new( - LexicalErrorType::UnrecognizedToken { tok: c }, - self.token_range(), - )); + let kind = match question_count { + 1 => IpyEscapeKind::Help, + 2 => IpyEscapeKind::Help2, + _ => unreachable!("`question_count` is always 1 or 2"), + }; + + self.current_value = TokenValue::IpyEscapeCommand { + kind, + value: value.into_boxed_str(), + }; + + return TokenKind::IpyEscapeCommand; + } + '\n' | '\r' | EOF_CHAR => { + self.current_value = TokenValue::IpyEscapeCommand { + kind: escape_kind, + value: value.into_boxed_str(), + }; + + return TokenKind::IpyEscapeCommand; + } + c => { + self.cursor.bump(); + value.push(c); + } } - }; + } + } - self.state = State::Other; + fn consume_end(&mut self) -> TokenKind { + // We reached end of file. + // First of all, we need all nestings to be finished. + if self.nesting > 0 { + // Reset the nesting to avoid going into infinite loop. + self.nesting = 0; + return self.push_error(LexicalError::new(LexicalErrorType::Eof, self.token_range())); + } - token + // Next, insert a trailing newline, if required. + if !self.state.is_new_logical_line() { + self.state = State::AfterNewline; + TokenKind::Newline + } + // Next, flush the indentation stack to zero. + else if self.indentations.dedent().is_some() { + TokenKind::Dedent + } else { + TokenKind::EndOfFile + } } #[inline] @@ -1327,13 +1330,10 @@ impl<'src> Lexer<'src> { self.token_range().start() } - /// Takes the token value corresponding to the current token out of the lexer, replacing it - /// with the default value. - /// - /// All the subsequent call to this method without moving the lexer would always return the - /// default value which is [`TokenValue::None`]. - pub(crate) fn take_value(&mut self) -> TokenValue { - std::mem::take(&mut self.current_value) + /// Helper function to push the given error and return the [`TokenKind::Unknown`] token. + fn push_error(&mut self, error: LexicalError) -> TokenKind { + self.errors.push(error); + TokenKind::Unknown } /// Creates a checkpoint to which the lexer can later return to using [`Self::rewind`].