Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lexer tweaks #133070

Merged
merged 6 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -566,19 +566,19 @@ impl Cursor<'_> {

fn c_or_byte_string(
&mut self,
mk_kind: impl FnOnce(bool) -> LiteralKind,
mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
mk_kind: fn(bool) -> LiteralKind,
mk_kind_raw: fn(Option<u8>) -> LiteralKind,
single_quoted: Option<fn(bool) -> LiteralKind>,
) -> TokenKind {
match (self.first(), self.second(), single_quoted) {
('\'', _, Some(mk_kind)) => {
('\'', _, Some(single_quoted)) => {
self.bump();
let terminated = self.single_quoted_string();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
let kind = mk_kind(terminated);
let kind = single_quoted(terminated);
Literal { kind, suffix_start }
}
('"', _, _) => {
Expand Down
72 changes: 31 additions & 41 deletions compiler/rustc_lexer/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,61 +77,51 @@ fn test_too_many_hashes() {
check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
}

// https://github.com/rust-lang/rust/issues/70528
#[test]
fn test_valid_shebang() {
// https://github.com/rust-lang/rust/issues/70528
let input = "#!/usr/bin/rustrun\nlet x = 5;";
assert_eq!(strip_shebang(input), Some(18));
}
let input = "#!/bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));

#[test]
fn test_invalid_shebang_valid_rust_syntax() {
// https://github.com/rust-lang/rust/issues/70528
let input = "#! [bad_attribute]";
let input = "#![attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_shebang_second_line() {
// Because shebangs are interpreted by the kernel, they must be on the first line
let input = "\n#!/bin/bash";
let input = "#! /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));

let input = "#! [attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_shebang_space() {
let input = "#! /bin/bash";
let input = "#! /* blah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));
}

#[test]
fn test_shebang_empty_shebang() {
let input = "#! \n[attribute(foo)]";
let input = "#! /* blah */ [attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_invalid_shebang_comment() {
let input = "#!//bin/ami/a/comment\n[";
assert_eq!(strip_shebang(input), None)
}
let input = "#! // blah\n/bin/bash";
assert_eq!(strip_shebang(input), Some(10)); // strip up to the newline

#[test]
fn test_invalid_shebang_another_comment() {
let input = "#!/*bin/ami/a/comment*/\n[attribute";
assert_eq!(strip_shebang(input), None)
}
let input = "#! // blah\n[attribute]";
assert_eq!(strip_shebang(input), None);

#[test]
fn test_shebang_valid_rust_after() {
let input = "#!/*bin/ami/a/comment*/\npub fn main() {}";
assert_eq!(strip_shebang(input), Some(23))
}
let input = "#! /* blah\nblah\nblah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(10));

#[test]
fn test_shebang_followed_by_attrib() {
let input = "#!/bin/rust-scripts\n#![allow_unused(true)]";
assert_eq!(strip_shebang(input), Some(19));
let input = "#! /* blah\nblah\nblah */ [attribute]";
assert_eq!(strip_shebang(input), None);

let input = "#!\n/bin/sh";
assert_eq!(strip_shebang(input), Some(2));

let input = "#!\n[attribute]";
assert_eq!(strip_shebang(input), None);

// Because shebangs are interpreted by the kernel, they must be on the first line
let input = "\n#!/bin/bash";
assert_eq!(strip_shebang(input), None);

let input = "\n#![attribute]";
assert_eq!(strip_shebang(input), None);
}

fn check_lexing(src: &str, expect: Expect) {
Expand Down
75 changes: 44 additions & 31 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use rustc_span::symbol::Symbol;
use rustc_span::{BytePos, Pos, Span};
use tracing::debug;

use crate::lexer::diagnostics::TokenTreeDiagInfo;
use crate::lexer::unicode_chars::UNICODE_ARRAY;
use crate::{errors, make_unclosed_delims_error};

Expand Down Expand Up @@ -56,7 +57,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
}

let cursor = Cursor::new(src);
let string_reader = StringReader {
let mut lexer = Lexer {
psess,
start_pos,
pos: start_pos,
Expand All @@ -65,34 +66,31 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
override_span,
nbsp_is_whitespace: false,
last_lifetime: None,
token: Token::dummy(),
diag_info: TokenTreeDiagInfo::default(),
};
let (stream, res, unmatched_delims) =
tokentrees::TokenTreesReader::lex_all_token_trees(string_reader);
match res {
Ok(()) if unmatched_delims.is_empty() => Ok(stream),
_ => {
// Return error if there are unmatched delimiters or unclosed delimiters.
// We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
// because the delimiter mismatch is more likely to be the root cause of error

let mut buffer = Vec::with_capacity(1);
for unmatched in unmatched_delims {
if let Some(err) = make_unclosed_delims_error(unmatched, psess) {
buffer.push(err);
}
}
if let Err(errs) = res {
// Add unclosing delimiter or diff marker errors
for err in errs {
buffer.push(err);
}
}
Err(buffer)
let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false);
let unmatched_delims = lexer.diag_info.unmatched_delims;

if res.is_ok() && unmatched_delims.is_empty() {
Ok(stream)
} else {
// Return error if there are unmatched delimiters or unclosed delimiters.
// We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
// because the delimiter mismatch is more likely to be the root cause of error
let mut buffer: Vec<_> = unmatched_delims
.into_iter()
.filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
.collect();
if let Err(errs) = res {
// Add unclosing delimiter or diff marker errors
buffer.extend(errs);
}
Err(buffer)
}
}

struct StringReader<'psess, 'src> {
struct Lexer<'psess, 'src> {
psess: &'psess ParseSess,
/// Initial position, read-only.
start_pos: BytePos,
Expand All @@ -111,9 +109,14 @@ struct StringReader<'psess, 'src> {
/// Track the `Span` for the leading `'` of the last lifetime. Used for
/// diagnostics to detect possible typo where `"` was meant.
last_lifetime: Option<Span>,

/// The current token.
token: Token,

diag_info: TokenTreeDiagInfo,
}

impl<'psess, 'src> StringReader<'psess, 'src> {
impl<'psess, 'src> Lexer<'psess, 'src> {
fn dcx(&self) -> DiagCtxtHandle<'psess> {
self.psess.dcx()
}
Expand All @@ -124,7 +127,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {

/// Returns the next token, paired with a bool indicating if the token was
/// preceded by whitespace.
fn next_token(&mut self) -> (Token, bool) {
fn next_token_from_cursor(&mut self) -> (Token, bool) {
let mut preceded_by_whitespace = false;
let mut swallow_next_invalid = 0;
// Skip trivial (whitespace & comments) tokens
Expand Down Expand Up @@ -231,7 +234,8 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
.push(span);
token::Ident(sym, IdentIsRaw::No)
}
// split up (raw) c string literals to an ident and a string literal when edition < 2021.
// split up (raw) c string literals to an ident and a string literal when edition <
// 2021.
rustc_lexer::TokenKind::Literal {
kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
suffix_start: _,
Expand All @@ -252,7 +256,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
rustc_lexer::TokenKind::GuardedStrPrefix => {
self.maybe_report_guarded_str(start, str_before)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
Expand Down Expand Up @@ -296,13 +302,20 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
if prefix_span.at_least_rust_2021() {
let span = self.mk_sp(start, self.pos);

let lifetime_name_without_tick = Symbol::intern(&self.str_from(ident_start));
let lifetime_name_without_tick =
Symbol::intern(&self.str_from(ident_start));
if !lifetime_name_without_tick.can_be_raw() {
self.dcx().emit_err(errors::CannotBeRawLifetime { span, ident: lifetime_name_without_tick });
self.dcx().emit_err(
errors::CannotBeRawLifetime {
span,
ident: lifetime_name_without_tick
}
);
}

// Put the `'` back onto the lifetime name.
let mut lifetime_name = String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
let mut lifetime_name =
String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
lifetime_name.push('\'');
lifetime_name += lifetime_name_without_tick.as_str();
let sym = Symbol::intern(&lifetime_name);
Expand Down
Loading
Loading