diff --git a/src/librustc_lexer/src/cursor.rs b/src/librustc_lexer/src/cursor.rs index 5831159c344d7..73d305c6d4fe2 100644 --- a/src/librustc_lexer/src/cursor.rs +++ b/src/librustc_lexer/src/cursor.rs @@ -1,5 +1,9 @@ use std::str::Chars; +/// Peekable iterator over a char sequence. +/// +/// Next characters can be peeked via `nth_char` method, +/// and position can be shifted forward via `bump` method. pub(crate) struct Cursor<'a> { initial_len: usize, chars: Chars<'a>, @@ -18,7 +22,9 @@ impl<'a> Cursor<'a> { prev: EOF_CHAR, } } + /// For debug assertions only + /// Returns the last eaten symbol (or '\0' in release builds). pub(crate) fn prev(&self) -> char { #[cfg(debug_assertions)] { @@ -30,19 +36,30 @@ impl<'a> Cursor<'a> { '\0' } } + + /// Returns nth character relative to the current cursor position. + /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// However, getting `EOF_CHAR` doesn't always mean actual end of file, + /// it should be checked with `is_eof` method. pub(crate) fn nth_char(&self, n: usize) -> char { self.chars().nth(n).unwrap_or(EOF_CHAR) } + + /// Checks if there is nothing more to consume. pub(crate) fn is_eof(&self) -> bool { self.chars.as_str().is_empty() } + + /// Returns amount of already consumed symbols. pub(crate) fn len_consumed(&self) -> usize { self.initial_len - self.chars.as_str().len() } - /// Returns an iterator over the remaining characters. + + /// Returns a `Chars` iterator over the remaining characters. fn chars(&self) -> Chars<'a> { self.chars.clone() } + /// Moves to the next character. pub(crate) fn bump(&mut self) -> Option { let c = self.chars.next()?; diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 30a5175d8cdb0..d55ef46d7506e 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -1,3 +1,16 @@ +//! Low-level Rust lexer. +//! +//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax, +//! 
for that see `libsyntax::parse::lexer`, which converts this basic token stream +//! into wide tokens used by actual parser. +//! +//! The purpose of this crate is to convert raw sources into a labeled sequence +//! of well-known token types, so building an actual Rust token stream will +//! be easier. +//! +//! Main entity of this crate is [`TokenKind`] enum which represents common +//! lexeme types. + // We want to be able to build this crate with a stable compiler, so no // `#![feature]` attributes should be added. @@ -6,78 +19,144 @@ pub mod unescape; use crate::cursor::{Cursor, EOF_CHAR}; +/// Parsed token. +/// It doesn't contain information about data that has been parsed, +/// only the type of the token and its size. pub struct Token { pub kind: TokenKind, pub len: usize, } +impl Token { + fn new(kind: TokenKind, len: usize) -> Token { + Token { kind, len } + } +} + +/// Enum representing common lexeme types. #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum TokenKind { + // Multi-char tokens: + + /// "// comment" LineComment, + /// "/* block comment */" + /// Block comments can be recursive, so the sequence like "/* /* */" + /// will not be considered terminated and will result in a parsing error. BlockComment { terminated: bool }, + /// Any whitespace characters sequence. Whitespace, + /// "ident" or "continue" + /// At this step keywords are also considered identifiers. Ident, + /// "r#ident" RawIdent, + /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details. Literal { kind: LiteralKind, suffix_start: usize }, + /// "'a" Lifetime { starts_with_number: bool }, + + // One-char tokens: + + /// ";" Semi, + /// "," Comma, + /// "." Dot, + /// "(" OpenParen, + /// ")" CloseParen, + /// "{" OpenBrace, + /// "}" CloseBrace, + /// "[" OpenBracket, + /// "]" CloseBracket, + /// "@" At, + /// "#" Pound, + /// "~" Tilde, + /// "?" Question, + /// ":" Colon, + /// "$" Dollar, + /// "=" Eq, + /// "!" 
Not, + /// "<" Lt, + /// ">" Gt, + /// "-" Minus, + /// "&" And, + /// "|" Or, + /// "+" Plus, + /// "*" Star, + /// "/" Slash, + /// "^" Caret, + /// "%" Percent, + + /// Unknown token, not expected by the lexer, e.g. "№" Unknown, } use self::TokenKind::*; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum LiteralKind { + /// "12_u8", "0o100", "0b120i99" Int { base: Base, empty_int: bool }, + /// "12.34f32", "0b100.100" Float { base: Base, empty_exponent: bool }, + /// "'a'", "'\\'", "'''", "';" Char { terminated: bool }, + /// "b'a'", "b'\\'", "b'''", "b';" Byte { terminated: bool }, + /// ""abc"", ""abc" Str { terminated: bool }, + /// "b"abc"", "b"abc" ByteStr { terminated: bool }, + /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" RawStr { n_hashes: usize, started: bool, terminated: bool }, + /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" RawByteStr { n_hashes: usize, started: bool, terminated: bool }, } use self::LiteralKind::*; +/// Base of numeric literal encoding according to its prefix. #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Base { + /// Literal starts with "0b". Binary, + /// Literal starts with "0o". Octal, + /// Literal starts with "0x". Hexadecimal, + /// Literal doesn't contain a prefix. Decimal, } -impl Token { - fn new(kind: TokenKind, len: usize) -> Token { - Token { kind, len } - } -} - +/// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", +/// but shebang isn't a part of rust syntax, so this function +/// skips the line if it starts with a shebang ("#!"). +/// Line won't be skipped if it represents a valid Rust syntax +/// (e.g. "#![deny(missing_docs)]"). 
pub fn strip_shebang(input: &str) -> Option { debug_assert!(!input.is_empty()); if !input.starts_with("#!") || input.starts_with("#![") { @@ -86,11 +165,13 @@ pub fn strip_shebang(input: &str) -> Option { Some(input.find('\n').unwrap_or(input.len())) } +/// Parses the first token from the provided input string. pub fn first_token(input: &str) -> Token { debug_assert!(!input.is_empty()); Cursor::new(input).advance_token() } +/// Creates an iterator that produces tokens from the input string. pub fn tokenize(mut input: &str) -> impl Iterator + '_ { std::iter::from_fn(move || { if input.is_empty() { @@ -102,10 +183,9 @@ pub fn tokenize(mut input: &str) -> impl Iterator + '_ { }) } -// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these -// classes. - /// True if `c` is considered a whitespace according to Rust language definition. +/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html) +/// for definitions of these classes. pub fn is_whitespace(c: char) -> bool { // This is Pattern_White_Space. // @@ -137,6 +217,8 @@ pub fn is_whitespace(c: char) -> bool { } /// True if `c` is valid as a first character of an identifier. +/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for +/// a formal definition of valid identifier name. pub fn is_id_start(c: char) -> bool { // This is XID_Start OR '_' (which formally is not a XID_Start). // We also add fast-path for ascii idents @@ -147,6 +229,8 @@ pub fn is_id_start(c: char) -> bool { } /// True if `c` is valid as a non-first character of an identifier. +/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for +/// a formal definition of valid identifier name. pub fn is_id_continue(c: char) -> bool { // This is exactly XID_Continue. // We also add fast-path for ascii idents @@ -159,15 +243,21 @@ pub fn is_id_continue(c: char) -> bool { impl Cursor<'_> { + /// Parses a token from the input string. 
fn advance_token(&mut self) -> Token { let first_char = self.bump().unwrap(); let token_kind = match first_char { + // Slash, comment or block comment. '/' => match self.nth_char(0) { '/' => self.line_comment(), '*' => self.block_comment(), _ => Slash, }, + + // Whitespace sequence. c if is_whitespace(c) => self.whitespace(), + + // Raw string literal or identifier. 'r' => match (self.nth_char(0), self.nth_char(1)) { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { @@ -181,6 +271,8 @@ impl Cursor<'_> { } _ => self.ident(), }, + + // Byte literal, byte string literal, raw byte string literal or identifier. 'b' => match (self.nth_char(0), self.nth_char(1)) { ('\'', _) => { self.bump(); @@ -214,13 +306,20 @@ impl Cursor<'_> { } _ => self.ident(), }, + + // Identifier (this should be checked after other variant that can + // start as identifier). c if is_id_start(c) => self.ident(), + + // Numeric literal. c @ '0'..='9' => { let literal_kind = self.number(c); let suffix_start = self.len_consumed(); self.eat_literal_suffix(); TokenKind::Literal { kind: literal_kind, suffix_start } } + + // One-symbol tokens. ';' => Semi, ',' => Comma, '.' => Dot, @@ -247,7 +346,11 @@ impl Cursor<'_> { '*' => Star, '^' => Caret, '%' => Percent, + + // Lifetime or character literal. '\'' => self.lifetime_or_char(), + + // String literal. '"' => { let terminated = self.double_quoted_string(); let suffix_start = self.len_consumed(); @@ -291,6 +394,9 @@ impl Cursor<'_> { self.bump(); depth -= 1; if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. break; } } @@ -335,6 +441,7 @@ impl Cursor<'_> { debug_assert!('0' <= self.prev() && self.prev() <= '9'); let mut base = Base::Decimal; if first_digit == '0' { + // Attempt to parse encoding base. 
let has_digits = match self.nth_char(0) { 'b' => { base = Base::Binary; @@ -351,17 +458,21 @@ impl Cursor<'_> { self.bump(); self.eat_hexadecimal_digits() } + // Not a base prefix. '0'..='9' | '_' | '.' | 'e' | 'E' => { self.eat_decimal_digits(); true } - // just a 0 + // Just a 0. _ => return Int { base, empty_int: false }, }; + // Base prefix was provided, but there were no digits + // after it, e.g. "0x". if !has_digits { return Int { base, empty_int: true }; } } else { + // No base prefix, parse number in the usual way. self.eat_decimal_digits(); }; @@ -400,6 +511,9 @@ impl Cursor<'_> { fn lifetime_or_char(&mut self) -> TokenKind { debug_assert!(self.prev() == '\''); let mut starts_with_number = false; + + // Check if the first symbol after '\'' is a valid identifier + // character or a number (not a digit followed by '\''). if (is_id_start(self.nth_char(0)) || self.nth_char(0).is_digit(10) && { starts_with_number = true; @@ -408,6 +522,8 @@ impl Cursor<'_> { && self.nth_char(1) != '\'' { self.bump(); + + // Skip the identifier. while is_id_continue(self.nth_char(0)) { self.bump(); } @@ -420,6 +536,8 @@ impl Cursor<'_> { Lifetime { starts_with_number } }; } + + // This is not a lifetime (checked above), parse a char literal. let terminated = self.single_quoted_string(); let suffix_start = self.len_consumed(); if terminated { @@ -431,24 +549,32 @@ impl Cursor<'_> { fn single_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '\''); - // parse `'''` as a single char literal + // Parse `'''` as a single char literal. if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' { self.bump(); } + // Parse until either quotes are terminated or error is detected. let mut first = true; loop { match self.nth_char(0) { + // Probably beginning of the comment, which we don't want to include + // to the error report. '/' if !first => break, + // Newline without following '\'' means unclosed quote, stop parsing. 
'\n' if self.nth_char(1) != '\'' => break, + // End of file, stop parsing. EOF_CHAR if self.is_eof() => break, + // Quotes are terminated, finish parsing. '\'' => { self.bump(); return true; } + // Escaped slash is considered one character, so bump twice. '\\' => { self.bump(); self.bump(); } + // Skip the character. _ => { self.bump(); } @@ -458,6 +584,8 @@ impl Cursor<'_> { false } + /// Eats double-quoted string and returns true + /// if string is terminated. fn double_quoted_string(&mut self) -> bool { debug_assert!(self.prev() == '"'); loop { @@ -476,8 +604,11 @@ } } + /// Eats the double-quoted string and returns a tuple of + /// (amount of the '#' symbols, raw string started, raw string terminated) fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) { debug_assert!(self.prev() == 'r'); + // Count opening '#' symbols. let n_hashes = { let mut acc: usize = 0; loop { @@ -489,6 +620,8 @@ } }; + // Skip the string itself and check that amount of closing '#' + // symbols is equal to the amount of opening ones. loop { match self.bump() { Some('"') => { @@ -549,6 +682,7 @@ if self.eat_decimal_digits() { Ok(()) } else { Err(()) } } + // Eats the suffix if it's an identifier. fn eat_literal_suffix(&mut self) { if !is_id_start(self.nth_char(0)) { return; diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs index c709b7526082f..dee7bc2260bf0 100644 --- a/src/librustc_lexer/src/unescape.rs +++ b/src/librustc_lexer/src/unescape.rs @@ -7,32 +7,54 @@ use std::ops::Range; #[cfg(test)] mod tests; +/// Errors that can occur during string unescaping. #[derive(Debug, PartialEq, Eq)] pub enum EscapeError { + /// Expected 1 char, but 0 were found. ZeroChars, + /// Expected 1 char, but more than 1 were found. MoreThanOneChar, + /// Escaped '\' character without continuation. LoneSlash, + /// Invalid escape character (e.g. '\z'). InvalidEscape, + /// Raw '\r' encountered. 
BareCarriageReturn, + /// Raw '\r' encountered in raw string. BareCarriageReturnInRawString, + /// Unescaped character that was expected to be escaped (e.g. raw '\t'). EscapeOnlyChar, + /// Numeric character escape is too short (e.g. '\x1'). TooShortHexEscape, + /// Invalid character in numeric escape (e.g. '\xz') InvalidCharInHexEscape, + /// Character code in numeric escape is non-ascii (e.g. '\xFF'). OutOfRangeHexEscape, + /// '\u' not followed by '{'. NoBraceInUnicodeEscape, + /// Non-hexadecimal value in '\u{..}'. InvalidCharInUnicodeEscape, + /// '\u{}' EmptyUnicodeEscape, + /// No closing brace in '\u{..}', e.g. '\u{12'. UnclosedUnicodeEscape, + /// '\u{_12}' LeadingUnderscoreUnicodeEscape, + /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}' OverlongUnicodeEscape, + /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'. LoneSurrogateUnicodeEscape, + /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'. OutOfRangeUnicodeEscape, + /// Unicode escape code in byte literal. UnicodeEscapeInByte, + /// Non-ascii character in byte literal. NonAsciiCharInByte, + /// Non-ascii character in byte string literal. NonAsciiCharInByteString, } @@ -44,15 +66,8 @@ pub fn unescape_char(literal_text: &str) -> Result { .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) } -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of escaped characters or errors. -pub fn unescape_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - unescape_str_or_byte_str(literal_text, Mode::Str, callback) -} - +/// Takes a contents of a byte literal (without quotes), and returns an +/// unescaped byte or an error. 
pub fn unescape_byte(literal_text: &str) -> Result { let mut chars = literal_text.chars(); unescape_char_or_byte(&mut chars, Mode::Byte) @@ -62,6 +77,17 @@ pub fn unescape_byte(literal_text: &str) -> Result { /// Takes a contents of a string literal (without quotes) and produces a /// sequence of escaped characters or errors. +/// Values are returned through invoking of the provided callback. +pub fn unescape_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_str_or_byte_str(literal_text, Mode::Str, callback) +} + +/// Takes a contents of a byte string literal (without quotes) and produces a +/// sequence of bytes or errors. +/// Values are returned through invoking of the provided callback. pub fn unescape_byte_str(literal_text: &str, callback: &mut F) where F: FnMut(Range, Result), @@ -71,8 +97,9 @@ where }) } -/// Takes a contents of a string literal (without quotes) and produces a +/// Takes a contents of a raw string literal (without quotes) and produces a /// sequence of characters or errors. +/// Values are returned through invoking of the provided callback. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only translate CRLF to LF and produce errors on bare CR. pub fn unescape_raw_str(literal_text: &str, callback: &mut F) @@ -82,8 +109,9 @@ where unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback) } -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. +/// Takes a contents of a raw byte string literal (without quotes) and produces a +/// sequence of bytes or errors. +/// Values are returned through invoking of the provided callback. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only translate CRLF to LF and produce errors on bare CR. pub fn unescape_raw_byte_str(literal_text: &str, callback: &mut F) @@ -95,6 +123,7 @@ where }) } +/// What kind of literal do we parse. 
#[derive(Debug, Clone, Copy)] pub enum Mode { Char, @@ -126,6 +155,8 @@ impl Mode { fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result { if first_char != '\\' { + // Previous character was not a slash, and we don't expect it to be + // an escape-only character. return match first_char { '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), @@ -133,6 +164,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result Err(EscapeError::EscapeOnlyChar), _ => { if mode.is_bytes() && !first_char.is_ascii() { + // Byte literal can't be a non-ascii character. return Err(EscapeError::NonAsciiCharInByte); } Ok(first_char) @@ -140,6 +172,8 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result, mode: Mode) -> Result '\0', 'x' => { + // Parse hexadecimal character code. + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; @@ -160,6 +196,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result, mode: Mode) -> Result { + // We've parsed '\u', now we have to parse '{..}'. + if chars.next() != Some('{') { return Err(EscapeError::NoBraceInUnicodeEscape); } + // First character must be a hexadecimal digit. let mut n_digits = 1; let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), @@ -180,6 +220,8 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, }; + // First character is valid, now parse the rest of the number + // and closing brace. 
loop { match chars.next() { None => return Err(EscapeError::UnclosedUnicodeEscape), @@ -188,6 +230,9 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result 6 { return Err(EscapeError::OverlongUnicodeEscape); } + + // Incorrect syntax has higher priority for error reporting + // than disallowed value for a literal. if mode.is_bytes() { return Err(EscapeError::UnicodeEscapeInByte); } @@ -204,6 +249,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result 6 { + // Stop updating value since we're sure that it is incorrect already. continue; } let digit = digit as u32; @@ -243,6 +289,10 @@ where let second_char = chars.clone().next(); match second_char { Some('\n') => { + // Rust language specification requires us to skip whitespaces + // if unescaped '\' character is followed by '\n'. + // For details see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). skip_ascii_whitespace(&mut chars); continue; }