From 98777b4c490386ab7889718e811d28ce7423f0af Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 14 Nov 2024 16:55:27 +1100 Subject: [PATCH 1/6] Merge `TokenTreesReader` into `StringReader`. There is a not-very-useful layering in the lexer, where `TokenTreesReader` contains a `StringReader`. This commit combines them and names the result `Lexer`, which is a more obvious name for it. The methods of `Lexer` are now split across `mod.rs` and `tokentrees.rs` which isn't ideal, but it doesn't seem worth moving a bunch of code to avoid it. --- compiler/rustc_parse/src/lexer/mod.rs | 21 +++++--- compiler/rustc_parse/src/lexer/tokentrees.rs | 51 +++++-------------- .../rustc_parse/src/lexer/unicode_chars.rs | 8 +-- 3 files changed, 31 insertions(+), 49 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 5023e83bd67c7..0ef5e9ed1d4a3 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -18,6 +18,7 @@ use rustc_span::symbol::Symbol; use rustc_span::{BytePos, Pos, Span}; use tracing::debug; +use crate::lexer::diagnostics::TokenTreeDiagInfo; use crate::lexer::unicode_chars::UNICODE_ARRAY; use crate::{errors, make_unclosed_delims_error}; @@ -56,7 +57,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>( } let cursor = Cursor::new(src); - let string_reader = StringReader { + let mut lexer = Lexer { psess, start_pos, pos: start_pos, @@ -65,9 +66,12 @@ pub(crate) fn lex_token_trees<'psess, 'src>( override_span, nbsp_is_whitespace: false, last_lifetime: None, + token: Token::dummy(), + diag_info: TokenTreeDiagInfo::default(), }; - let (stream, res, unmatched_delims) = - tokentrees::TokenTreesReader::lex_all_token_trees(string_reader); + let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false); + let unmatched_delims = lexer.diag_info.unmatched_delims; + match res { Ok(()) if unmatched_delims.is_empty() => Ok(stream), _ => { @@ -92,7 +96,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>( } } -struct StringReader<'psess, 'src> { +struct Lexer<'psess, 'src> { psess: &'psess ParseSess, /// Initial position, read-only. start_pos: BytePos, @@ -111,9 +115,14 @@ struct StringReader<'psess, 'src> { /// Track the `Span` for the leading `'` of the last lifetime. Used for /// diagnostics to detect possible typo where `"` was meant. last_lifetime: Option, + + /// The current token. + token: Token, + + diag_info: TokenTreeDiagInfo, } -impl<'psess, 'src> StringReader<'psess, 'src> { +impl<'psess, 'src> Lexer<'psess, 'src> { fn dcx(&self) -> DiagCtxtHandle<'psess> { self.psess.dcx() } @@ -124,7 +133,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> { /// Returns the next token, paired with a bool indicating if the token was /// preceded by whitespace. - fn next_token(&mut self) -> (Token, bool) { + fn next_token_from_cursor(&mut self) -> (Token, bool) { let mut preceded_by_whitespace = false; let mut swallow_next_invalid = 0; // Skip trivial (whitespace & comments) tokens diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 7b21ffacc841d..fab92aff74011 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -4,41 +4,19 @@ use rustc_ast_pretty::pprust::token_to_string; use rustc_errors::{Applicability, PErr}; use rustc_span::symbol::kw; -use super::diagnostics::{ - TokenTreeDiagInfo, report_suspicious_mismatch_block, same_indentation_level, -}; -use super::{StringReader, UnmatchedDelim}; +use super::diagnostics::{report_suspicious_mismatch_block, same_indentation_level}; +use super::{Lexer, UnmatchedDelim}; use crate::Parser; -pub(super) struct TokenTreesReader<'psess, 'src> { - string_reader: StringReader<'psess, 'src>, - /// The "next" token, which has been obtained from the `StringReader` but - /// not yet handled by the `TokenTreesReader`. - token: Token, - diag_info: TokenTreeDiagInfo, -} - -impl<'psess, 'src> TokenTreesReader<'psess, 'src> { - pub(super) fn lex_all_token_trees( - string_reader: StringReader<'psess, 'src>, - ) -> (TokenStream, Result<(), Vec>>, Vec) { - let mut tt_reader = TokenTreesReader { - string_reader, - token: Token::dummy(), - diag_info: TokenTreeDiagInfo::default(), - }; - let (_open_spacing, stream, res) = tt_reader.lex_token_trees(/* is_delimited */ false); - (stream, res, tt_reader.diag_info.unmatched_delims) - } - +impl<'psess, 'src> Lexer<'psess, 'src> { // Lex into a token stream. The `Spacing` in the result is that of the // opening delimiter. - fn lex_token_trees( + pub(super) fn lex_token_trees( &mut self, is_delimited: bool, ) -> (Spacing, TokenStream, Result<(), Vec>>) { // Move past the opening delimiter. - let (_, open_spacing) = self.bump(false); + let open_spacing = self.bump(false).1; let mut buf = Vec::new(); loop { @@ -80,7 +58,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> { fn eof_err(&mut self) -> PErr<'psess> { let msg = "this file contains an unclosed delimiter"; - let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg); + let mut err = self.dcx().struct_span_err(self.token.span, msg); let unclosed_delimiter_show_limit = 5; let len = usize::min(unclosed_delimiter_show_limit, self.diag_info.open_braces.len()); @@ -110,7 +88,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> { report_suspicious_mismatch_block( &mut err, &self.diag_info, - self.string_reader.psess.source_map(), + self.psess.source_map(), *delim, ) } @@ -136,7 +114,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> { // Expand to cover the entire delimited token tree. let delim_span = DelimSpan::from_pair(pre_span, self.token.span); - let sm = self.string_reader.psess.source_map(); + let sm = self.psess.source_map(); let close_spacing = match self.token.kind { // Correct delimiter. @@ -228,7 +206,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> { // Will glue adjacent single-char tokens together if `glue` is set. fn bump(&mut self, glue: bool) -> (Token, Spacing) { let (this_spacing, next_tok) = loop { - let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); + let (next_tok, is_next_tok_preceded_by_whitespace) = self.next_token_from_cursor(); if is_next_tok_preceded_by_whitespace { break (Spacing::Alone, next_tok); @@ -256,7 +234,7 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> { ) -> Vec> { // If there are unclosed delims, see if there are diff markers and if so, point them // out instead of complaining about the unclosed delims. - let mut parser = Parser::new(self.string_reader.psess, tts, None); + let mut parser = Parser::new(self.psess, tts, None); let mut diff_errs = vec![]; // Suggest removing a `{` we think appears in an `if`/`while` condition. // We want to suggest removing a `{` only if we think we're in an `if`/`while` condition, @@ -314,14 +292,9 @@ impl<'psess, 'src> TokenTreesReader<'psess, 'src> { // An unexpected closing delimiter (i.e., there is no matching opening delimiter). let token_str = token_to_string(&self.token); let msg = format!("unexpected closing delimiter: `{token_str}`"); - let mut err = self.string_reader.dcx().struct_span_err(self.token.span, msg); + let mut err = self.dcx().struct_span_err(self.token.span, msg); - report_suspicious_mismatch_block( - &mut err, - &self.diag_info, - self.string_reader.psess.source_map(), - delim, - ); + report_suspicious_mismatch_block(&mut err, &self.diag_info, self.psess.source_map(), delim); err.span_label(self.token.span, "unexpected closing delimiter"); err } diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index d78b3664b1ee8..42eef27803eb5 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -4,7 +4,7 @@ use rustc_span::symbol::kw; use rustc_span::{BytePos, Pos, Span}; -use super::StringReader; +use super::Lexer; use crate::errors::TokenSubstitution; use crate::token::{self, Delimiter}; @@ -338,7 +338,7 @@ const ASCII_ARRAY: &[(&str, &str, Option)] = &[ ]; pub(super) fn check_for_substitution( - reader: &StringReader<'_, '_>, + lexer: &Lexer<'_, '_>, pos: BytePos, ch: char, count: usize, @@ -351,11 +351,11 @@ pub(super) fn check_for_substitution( let Some((_, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(s, _, _)| s == ascii_str) else { let msg = format!("substitution character not found for '{ch}'"); - reader.dcx().span_bug(span, msg); + lexer.dcx().span_bug(span, msg); }; // special help suggestion for "directed" double quotes - let sugg = if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') { + let sugg = if let Some(s) = peek_delimited(&lexer.src[lexer.src_index(pos)..], '“', '”') { let span = Span::with_root_ctxt( pos, pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()), From 593cf680aa6578d48998dac05532d76dfcad07ac Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 15 Nov 2024 10:01:22 +1100 Subject: [PATCH 2/6] Split `Lexer::bump`. It has two different ways of being called. --- compiler/rustc_parse/src/lexer/tokentrees.rs | 34 ++++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index fab92aff74011..c6c9eb3b0b263 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -16,7 +16,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { is_delimited: bool, ) -> (Spacing, TokenStream, Result<(), Vec>>) { // Move past the opening delimiter. - let open_spacing = self.bump(false).1; + let open_spacing = self.bump_minimal(); let mut buf = Vec::new(); loop { @@ -49,7 +49,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { } _ => { // Get the next normal token. - let (this_tok, this_spacing) = self.bump(true); + let (this_tok, this_spacing) = self.bump(); buf.push(TokenTree::Token(this_tok, this_spacing)); } } @@ -138,7 +138,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { } // Move past the closing delimiter. - self.bump(false).1 + self.bump_minimal() } // Incorrect delimiter. token::CloseDelim(close_delim) => { @@ -181,7 +181,7 @@ impl<'psess, 'src> Lexer<'psess, 'src> { // bar(baz( // } // Incorrect delimiter but matches the earlier `{` if !self.diag_info.open_braces.iter().any(|&(b, _)| b == close_delim) { - self.bump(false).1 + self.bump_minimal() } else { // The choice of value here doesn't matter. Spacing::Alone @@ -203,14 +203,14 @@ impl<'psess, 'src> Lexer<'psess, 'src> { } // Move on to the next token, returning the current token and its spacing. - // Will glue adjacent single-char tokens together if `glue` is set. - fn bump(&mut self, glue: bool) -> (Token, Spacing) { + // Will glue adjacent single-char tokens together. + fn bump(&mut self) -> (Token, Spacing) { let (this_spacing, next_tok) = loop { let (next_tok, is_next_tok_preceded_by_whitespace) = self.next_token_from_cursor(); if is_next_tok_preceded_by_whitespace { break (Spacing::Alone, next_tok); - } else if glue && let Some(glued) = self.token.glue(&next_tok) { + } else if let Some(glued) = self.token.glue(&next_tok) { self.token = glued; } else { let this_spacing = if next_tok.is_punct() { @@ -227,6 +227,26 @@ impl<'psess, 'src> Lexer<'psess, 'src> { (this_tok, this_spacing) } + // Cut-down version of `bump` used when the token kind is known in advance. + fn bump_minimal(&mut self) -> Spacing { + let (next_tok, is_next_tok_preceded_by_whitespace) = self.next_token_from_cursor(); + + let this_spacing = if is_next_tok_preceded_by_whitespace { + Spacing::Alone + } else { + if next_tok.is_punct() { + Spacing::Joint + } else if next_tok == token::Eof { + Spacing::Alone + } else { + Spacing::JointHidden + } + }; + + self.token = next_tok; + this_spacing + } + fn unclosed_delim_err( &mut self, tts: TokenStream, From ba1a1ddc3f8c8007061c6f915448b22880da61ca Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 15 Nov 2024 10:05:49 +1100 Subject: [PATCH 3/6] Fix some formatting. Must be one of those cases where the function is too long and rustfmt bails out. --- compiler/rustc_parse/src/lexer/mod.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 0ef5e9ed1d4a3..202a2fbee22aa 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -240,7 +240,8 @@ impl<'psess, 'src> Lexer<'psess, 'src> { .push(span); token::Ident(sym, IdentIsRaw::No) } - // split up (raw) c string literals to an ident and a string literal when edition < 2021. + // split up (raw) c string literals to an ident and a string literal when edition < + // 2021. rustc_lexer::TokenKind::Literal { kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }), suffix_start: _, @@ -261,7 +262,9 @@ impl<'psess, 'src> Lexer<'psess, 'src> { let prefix_span = self.mk_sp(start, lit_start); return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace); } - rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before), + rustc_lexer::TokenKind::GuardedStrPrefix => { + self.maybe_report_guarded_str(start, str_before) + } rustc_lexer::TokenKind::Literal { kind, suffix_start } => { let suffix_start = start + BytePos(suffix_start); let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); @@ -305,13 +308,20 @@ impl<'psess, 'src> Lexer<'psess, 'src> { if prefix_span.at_least_rust_2021() { let span = self.mk_sp(start, self.pos); - let lifetime_name_without_tick = Symbol::intern(&self.str_from(ident_start)); + let lifetime_name_without_tick = + Symbol::intern(&self.str_from(ident_start)); if !lifetime_name_without_tick.can_be_raw() { - self.dcx().emit_err(errors::CannotBeRawLifetime { span, ident: lifetime_name_without_tick }); + self.dcx().emit_err( + errors::CannotBeRawLifetime { + span, + ident: lifetime_name_without_tick + } + ); } // Put the `'` back onto the lifetime name. - let mut lifetime_name = String::with_capacity(lifetime_name_without_tick.as_str().len() + 1); + let mut lifetime_name = + String::with_capacity(lifetime_name_without_tick.as_str().len() + 1); lifetime_name.push('\''); lifetime_name += lifetime_name_without_tick.as_str(); let sym = Symbol::intern(&lifetime_name); From 11c96cfd94a9b24a11d20c94686929126991f8d9 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 15 Nov 2024 11:00:46 +1100 Subject: [PATCH 4/6] Improve `strip_shebang` testing. It's currently a bit ad hoc. This commit makes it more methodical, with pairs of match/no-match tests for all the relevant cases. --- compiler/rustc_lexer/src/tests.rs | 72 +++++++++++++------------------ 1 file changed, 31 insertions(+), 41 deletions(-) diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs index 556bbf1f5182e..db7225fc2a810 100644 --- a/compiler/rustc_lexer/src/tests.rs +++ b/compiler/rustc_lexer/src/tests.rs @@ -77,61 +77,51 @@ fn test_too_many_hashes() { check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 })); } +// https://github.com/rust-lang/rust/issues/70528 #[test] fn test_valid_shebang() { - // https://github.com/rust-lang/rust/issues/70528 - let input = "#!/usr/bin/rustrun\nlet x = 5;"; - assert_eq!(strip_shebang(input), Some(18)); -} + let input = "#!/bin/bash"; + assert_eq!(strip_shebang(input), Some(input.len())); -#[test] -fn test_invalid_shebang_valid_rust_syntax() { - // https://github.com/rust-lang/rust/issues/70528 - let input = "#! [bad_attribute]"; + let input = "#![attribute]"; assert_eq!(strip_shebang(input), None); -} -#[test] -fn test_shebang_second_line() { - // Because shebangs are interpreted by the kernel, they must be on the first line - let input = "\n#!/bin/bash"; + let input = "#! /bin/bash"; + assert_eq!(strip_shebang(input), Some(input.len())); + + let input = "#! [attribute]"; assert_eq!(strip_shebang(input), None); -} -#[test] -fn test_shebang_space() { - let input = "#! /bin/bash"; + let input = "#! /* blah */ /bin/bash"; assert_eq!(strip_shebang(input), Some(input.len())); -} -#[test] -fn test_shebang_empty_shebang() { - let input = "#! \n[attribute(foo)]"; + let input = "#! /* blah */ [attribute]"; assert_eq!(strip_shebang(input), None); -} -#[test] -fn test_invalid_shebang_comment() { - let input = "#!//bin/ami/a/comment\n["; - assert_eq!(strip_shebang(input), None) -} + let input = "#! // blah\n/bin/bash"; + assert_eq!(strip_shebang(input), Some(10)); // strip up to the newline -#[test] -fn test_invalid_shebang_another_comment() { - let input = "#!/*bin/ami/a/comment*/\n[attribute"; - assert_eq!(strip_shebang(input), None) -} + let input = "#! // blah\n[attribute]"; + assert_eq!(strip_shebang(input), None); -#[test] -fn test_shebang_valid_rust_after() { - let input = "#!/*bin/ami/a/comment*/\npub fn main() {}"; - assert_eq!(strip_shebang(input), Some(23)) -} + let input = "#! /* blah\nblah\nblah */ /bin/bash"; + assert_eq!(strip_shebang(input), Some(10)); -#[test] -fn test_shebang_followed_by_attrib() { - let input = "#!/bin/rust-scripts\n#![allow_unused(true)]"; - assert_eq!(strip_shebang(input), Some(19)); + let input = "#! /* blah\nblah\nblah */ [attribute]"; + assert_eq!(strip_shebang(input), None); + + let input = "#!\n/bin/sh"; + assert_eq!(strip_shebang(input), Some(2)); + + let input = "#!\n[attribute]"; + assert_eq!(strip_shebang(input), None); + + // Because shebangs are interpreted by the kernel, they must be on the first line + let input = "\n#!/bin/bash"; + assert_eq!(strip_shebang(input), None); + + let input = "\n#![attribute]"; + assert_eq!(strip_shebang(input), None); } fn check_lexing(src: &str, expect: Expect) { From 4cd2840f003a1aa29da7e688043b954b4659b2ca Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 15 Nov 2024 14:01:53 +1100 Subject: [PATCH 5/6] Clean up `c_or_byte_string`. - Rename a misleading local `mk_kind` as `single_quoted`. - Use `fn` for all three arguments, for consistency. --- compiler/rustc_lexer/src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index bcb103957badf..b584e7afd98fa 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -566,19 +566,19 @@ impl Cursor<'_> { fn c_or_byte_string( &mut self, - mk_kind: impl FnOnce(bool) -> LiteralKind, - mk_kind_raw: impl FnOnce(Option) -> LiteralKind, + mk_kind: fn(bool) -> LiteralKind, + mk_kind_raw: fn(Option) -> LiteralKind, single_quoted: Option LiteralKind>, ) -> TokenKind { match (self.first(), self.second(), single_quoted) { - ('\'', _, Some(mk_kind)) => { + ('\'', _, Some(single_quoted)) => { self.bump(); let terminated = self.single_quoted_string(); let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } - let kind = mk_kind(terminated); + let kind = single_quoted(terminated); Literal { kind, suffix_start } } ('"', _, _) => { From 16a39bb7ca7d2af14069deef36291ca1c41b4bb0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 15 Nov 2024 14:54:15 +1100 Subject: [PATCH 6/6] Streamline `lex_token_trees` error handling. - Use iterators instead of `for` loops. - Use `if`/`else` instead of `match`. --- compiler/rustc_parse/src/lexer/mod.rs | 34 +++++++++++---------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 202a2fbee22aa..8db3b174a89fc 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -72,27 +72,21 @@ pub(crate) fn lex_token_trees<'psess, 'src>( let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false); let unmatched_delims = lexer.diag_info.unmatched_delims; - match res { - Ok(()) if unmatched_delims.is_empty() => Ok(stream), - _ => { - // Return error if there are unmatched delimiters or unclosed delimiters. - // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch - // because the delimiter mismatch is more likely to be the root cause of error - - let mut buffer = Vec::with_capacity(1); - for unmatched in unmatched_delims { - if let Some(err) = make_unclosed_delims_error(unmatched, psess) { - buffer.push(err); - } - } - if let Err(errs) = res { - // Add unclosing delimiter or diff marker errors - for err in errs { - buffer.push(err); - } - } - Err(buffer) + if res.is_ok() && unmatched_delims.is_empty() { + Ok(stream) + } else { + // Return error if there are unmatched delimiters or unclosed delimiters. + // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch + // because the delimiter mismatch is more likely to be the root cause of error + let mut buffer: Vec<_> = unmatched_delims + .into_iter() + .filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess)) + .collect(); + if let Err(errs) = res { + // Add unclosing delimiter or diff marker errors + buffer.extend(errs); } + Err(buffer) } }