Skip to content

Commit

Permalink
docs and better naming
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexWaygood committed Mar 15, 2024
1 parent d9a68ba commit d785be5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
27 changes: 21 additions & 6 deletions crates/ruff_python_parser/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,11 @@ impl<'source> Lexer<'source> {
}

/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
fn lex_identifier(&mut self, first: char, ascii_first_char: bool) -> Result<Tok, LexicalError> {
fn lex_identifier(
&mut self,
first: char,
first_char_is_ascii: bool,
) -> Result<Tok, LexicalError> {
// Detect potential string like rb'' b'' f'' u'' r''
match (first, self.cursor.first()) {
('f' | 'F', quote @ ('\'' | '"')) => {
Expand Down Expand Up @@ -198,7 +202,14 @@ impl<'source> Lexer<'source> {
_ => {}
}

let mut is_ascii = ascii_first_char;
// Keep track of whether the identifier is ASCII-only or not.
//
// This is important because Python applies NFKC normalization to
// identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
// We therefore need to do the same in our lexer, but applying NFKC normalization
// unconditionally is extremely expensive. If we know an identifier is ASCII-only
// (by far the most common case), we can skip NFKC normalization of the identifier.
let mut is_ascii = first_char_is_ascii;
self.cursor
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

Expand Down Expand Up @@ -1589,15 +1600,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
is_xid_start(c)
}

// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char, ascii_only_identifier: &mut bool) -> bool {
/// Checks if the character c is a valid continuation character as described
/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
///
/// This function also tracks whether the identifier as a whole is ASCII-only:
/// it sets the boolean behind the mutable reference passed in to `false`
/// whenever `c` is not an ASCII character.
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
// Arrange things such that ASCII codepoints never
// result in the slower `is_xid_continue` getting called.
if c.is_ascii() {
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
} else {
*ascii_only_identifier = false;
*identifier_is_ascii_only = false;
is_xid_continue(c)
}
}
Expand Down
3 changes: 3 additions & 0 deletions crates/ruff_python_parser/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pub enum Tok {
/// Token value for a name, commonly known as an identifier.
Name {
/// The name value.
///
/// Unicode names are NFKC-normalized by the lexer,
/// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
name: Box<str>,
},
/// Token value for an integer.
Expand Down

0 comments on commit d785be5

Please sign in to comment.