From d1f05fd1848fc68ed89d17f7937e358dacd8aed4 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 1 Aug 2024 06:44:39 +1000 Subject: [PATCH] Distinguish the two kinds of token range. When collecting tokens there are two kinds of range: - a range relative to the parser's full token stream (which we get when we are parsing); - a range relative to a single AST node's token stream (which we use within `LazyAttrTokenStreamImpl` when replacing tokens). These are currently both represented with `Range` and it's easy to mix them up -- until now I hadn't properly understood the difference. This commit introduces `ParserRange` and `NodeRange` to distinguish them. This also requires splitting `ReplaceRange` in two, giving the new types `ParserReplacement` and `NodeReplacement`. (These latter two names reduce the overloading of the word "range".) The commit also rewrites some comments to be clearer. The end result is a little more verbose, but much clearer. --- compiler/rustc_builtin_macros/src/cfg_eval.rs | 2 +- compiler/rustc_parse/src/parser/attr.rs | 6 +- .../rustc_parse/src/parser/attr_wrapper.rs | 124 ++++++++++-------- compiler/rustc_parse/src/parser/mod.rs | 70 +++++++--- 4 files changed, 124 insertions(+), 78 deletions(-) diff --git a/compiler/rustc_builtin_macros/src/cfg_eval.rs b/compiler/rustc_builtin_macros/src/cfg_eval.rs index dc1874bfecbda..4b05c144d37d7 100644 --- a/compiler/rustc_builtin_macros/src/cfg_eval.rs +++ b/compiler/rustc_builtin_macros/src/cfg_eval.rs @@ -202,7 +202,7 @@ impl CfgEval<'_> { } // Now that we have our re-parsed `AttrTokenStream`, recursively configuring - // our attribute target will correctly the tokens as well. + // our attribute target will correctly configure the tokens as well. flat_map_annotatable(self, annotatable) } } diff --git a/compiler/rustc_parse/src/parser/attr.rs b/compiler/rustc_parse/src/parser/attr.rs index 12b9414d1f760..a98382efc3eba 100644 --- a/compiler/rustc_parse/src/parser/attr.rs +++ b/compiler/rustc_parse/src/parser/attr.rs @@ -8,7 +8,7 @@ use rustc_span::{sym, BytePos, Span}; use thin_vec::ThinVec; use tracing::debug; -use super::{AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, PathStyle}; +use super::{AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, ParserRange, PathStyle}; use crate::{errors, fluent_generated as fluent, maybe_whole}; // Public for rustfmt usage @@ -307,8 +307,8 @@ impl<'a> Parser<'a> { // inner attribute, for possible later processing in a `LazyAttrTokenStream`. if let Capturing::Yes = self.capture_state.capturing { let end_pos = self.num_bump_calls; - let range = start_pos..end_pos; - self.capture_state.inner_attr_ranges.insert(attr.id, range); + let parser_range = ParserRange(start_pos..end_pos); + self.capture_state.inner_attr_parser_ranges.insert(attr.id, parser_range); } attrs.push(attr); } else { diff --git a/compiler/rustc_parse/src/parser/attr_wrapper.rs b/compiler/rustc_parse/src/parser/attr_wrapper.rs index ba20bd9daf708..abf61036c2deb 100644 --- a/compiler/rustc_parse/src/parser/attr_wrapper.rs +++ b/compiler/rustc_parse/src/parser/attr_wrapper.rs @@ -10,7 +10,10 @@ use rustc_errors::PResult; use rustc_session::parse::ParseSess; use rustc_span::{sym, Span, DUMMY_SP}; -use super::{Capturing, FlatToken, ForceCollect, Parser, ReplaceRange, TokenCursor}; +use super::{ + Capturing, FlatToken, ForceCollect, NodeRange, NodeReplacement, Parser, ParserRange, + TokenCursor, +}; /// A wrapper type to ensure that the parser handles outer attributes correctly. /// When we parse outer attributes, we need to ensure that we capture tokens @@ -28,8 +31,8 @@ use super::{Capturing, FlatToken, ForceCollect, Parser, ReplaceRange, TokenCurso #[derive(Debug, Clone)] pub struct AttrWrapper { attrs: AttrVec, - // The start of the outer attributes in the token cursor. - // This allows us to create a `ReplaceRange` for the entire attribute + // The start of the outer attributes in the parser's token stream. + // This lets us create a `NodeReplacement` for the entire attribute // target, including outer attributes. start_pos: u32, } @@ -88,7 +91,7 @@ struct LazyAttrTokenStreamImpl { cursor_snapshot: TokenCursor, num_calls: u32, break_last_token: bool, - replace_ranges: Box<[ReplaceRange]>, + node_replacements: Box<[NodeReplacement]>, } impl ToAttrTokenStream for LazyAttrTokenStreamImpl { @@ -103,21 +106,24 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl { .chain(iter::repeat_with(|| FlatToken::Token(cursor_snapshot.next()))) .take(self.num_calls as usize); - if self.replace_ranges.is_empty() { + if self.node_replacements.is_empty() { make_attr_token_stream(tokens, self.break_last_token) } else { let mut tokens: Vec<_> = tokens.collect(); - let mut replace_ranges = self.replace_ranges.to_vec(); - replace_ranges.sort_by_key(|(range, _)| range.start); + let mut node_replacements = self.node_replacements.to_vec(); + node_replacements.sort_by_key(|(range, _)| range.0.start); #[cfg(debug_assertions)] - for [(range, tokens), (next_range, next_tokens)] in replace_ranges.array_windows() { + for [(node_range, tokens), (next_node_range, next_tokens)] in + node_replacements.array_windows() + { assert!( - range.end <= next_range.start || range.end >= next_range.end, - "Replace ranges should either be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})", - range, + node_range.0.end <= next_node_range.0.start + || node_range.0.end >= next_node_range.0.end, + "Node ranges should be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})", + node_range, tokens, - next_range, + next_node_range, next_tokens, ); } @@ -135,20 +141,23 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl { // start position, we ensure that any (outer) replace range which // encloses another (inner) replace range will fully overwrite the // inner range's replacement. - for (range, target) in replace_ranges.into_iter().rev() { - assert!(!range.is_empty(), "Cannot replace an empty range: {range:?}"); + for (node_range, target) in node_replacements.into_iter().rev() { + assert!( + !node_range.0.is_empty(), + "Cannot replace an empty node range: {:?}", + node_range.0 + ); // Replace the tokens in range with zero or one `FlatToken::AttrsTarget`s, plus // enough `FlatToken::Empty`s to fill up the rest of the range. This keeps the // total length of `tokens` constant throughout the replacement process, allowing - // us to use all of the `ReplaceRanges` entries without adjusting indices. + // us to do all replacements without adjusting indices. let target_len = target.is_some() as usize; tokens.splice( - (range.start as usize)..(range.end as usize), - target - .into_iter() - .map(|target| FlatToken::AttrsTarget(target)) - .chain(iter::repeat(FlatToken::Empty).take(range.len() - target_len)), + (node_range.0.start as usize)..(node_range.0.end as usize), + target.into_iter().map(|target| FlatToken::AttrsTarget(target)).chain( + iter::repeat(FlatToken::Empty).take(node_range.0.len() - target_len), + ), ); } make_attr_token_stream(tokens.into_iter(), self.break_last_token) @@ -215,7 +224,7 @@ impl<'a> Parser<'a> { let cursor_snapshot = self.token_cursor.clone(); let start_pos = self.num_bump_calls; let has_outer_attrs = !attrs.attrs.is_empty(); - let replace_ranges_start = self.capture_state.replace_ranges.len(); + let parser_replacements_start = self.capture_state.parser_replacements.len(); // We set and restore `Capturing::Yes` on either side of the call to // `f`, so we can distinguish the outermost call to @@ -270,7 +279,7 @@ impl<'a> Parser<'a> { return Ok(ret); } - let replace_ranges_end = self.capture_state.replace_ranges.len(); + let parser_replacements_end = self.capture_state.parser_replacements.len(); assert!( !(self.break_last_token && capture_trailing), @@ -287,15 +296,16 @@ impl<'a> Parser<'a> { let num_calls = end_pos - start_pos; - // Take the captured ranges for any inner attributes that we parsed in - // `Parser::parse_inner_attributes`, and pair them in a `ReplaceRange` - // with `None`, which means the relevant tokens will be removed. (More - // details below.) - let mut inner_attr_replace_ranges = Vec::new(); + // Take the captured `ParserRange`s for any inner attributes that we parsed in + // `Parser::parse_inner_attributes`, and pair them in a `ParserReplacement` with `None`, + // which means the relevant tokens will be removed. (More details below.) + let mut inner_attr_parser_replacements = Vec::new(); for attr in ret.attrs() { if attr.style == ast::AttrStyle::Inner { - if let Some(attr_range) = self.capture_state.inner_attr_ranges.remove(&attr.id) { - inner_attr_replace_ranges.push((attr_range, None)); + if let Some(inner_attr_parser_range) = + self.capture_state.inner_attr_parser_ranges.remove(&attr.id) + { + inner_attr_parser_replacements.push((inner_attr_parser_range, None)); } else { self.dcx().span_delayed_bug(attr.span, "Missing token range for attribute"); } @@ -304,37 +314,41 @@ impl<'a> Parser<'a> { // This is hot enough for `deep-vector` that checking the conditions for an empty iterator // is measurably faster than actually executing the iterator. - let replace_ranges: Box<[ReplaceRange]> = - if replace_ranges_start == replace_ranges_end && inner_attr_replace_ranges.is_empty() { - Box::new([]) - } else { - // Grab any replace ranges that occur *inside* the current AST node. We will - // perform the actual replacement only when we convert the `LazyAttrTokenStream` to - // an `AttrTokenStream`. - self.capture_state.replace_ranges[replace_ranges_start..replace_ranges_end] - .iter() - .cloned() - .chain(inner_attr_replace_ranges.iter().cloned()) - .map(|(range, data)| ((range.start - start_pos)..(range.end - start_pos), data)) - .collect() - }; + let node_replacements: Box<[_]> = if parser_replacements_start == parser_replacements_end + && inner_attr_parser_replacements.is_empty() + { + Box::new([]) + } else { + // Grab any replace ranges that occur *inside* the current AST node. Convert them + // from `ParserRange` form to `NodeRange` form. We will perform the actual + // replacement only when we convert the `LazyAttrTokenStream` to an + // `AttrTokenStream`. + self.capture_state.parser_replacements + [parser_replacements_start..parser_replacements_end] + .iter() + .cloned() + .chain(inner_attr_parser_replacements.iter().cloned()) + .map(|(parser_range, data)| (NodeRange::new(parser_range, start_pos), data)) + .collect() + }; // What is the status here when parsing the example code at the top of this method? // // When parsing `g`: // - `start_pos..end_pos` is `12..33` (`fn g { ... }`, excluding the outer attr). - // - `inner_attr_replace_ranges` has one entry (`5..15`, when counting from `fn`), to + // - `inner_attr_parser_replacements` has one entry (`ParserRange(17..27)`), to // delete the inner attr's tokens. - // - This entry is put into the lazy tokens for `g`, i.e. deleting the inner attr from - // those tokens (if they get evaluated). + // - This entry is converted to `NodeRange(5..15)` (relative to the `fn`) and put into + // the lazy tokens for `g`, i.e. deleting the inner attr from those tokens (if they get + // evaluated). // - Those lazy tokens are also put into an `AttrsTarget` that is appended to `self`'s // replace ranges at the bottom of this function, for processing when parsing `m`. - // - `replace_ranges_start..replace_ranges_end` is empty. + // - `parser_replacements_start..parser_replacements_end` is empty. // // When parsing `m`: // - `start_pos..end_pos` is `0..34` (`mod m`, excluding the `#[cfg_eval]` attribute). - // - `inner_attr_replace_ranges` is empty. - // - `replace_range_start..replace_ranges_end` has one entry. + // - `inner_attr_parser_replacements` is empty. + // - `parser_replacements_start..parser_replacements_end` has one entry. // - One `AttrsTarget` (added below when parsing `g`) to replace all of `g` (`3..33`, // including its outer attribute), with: // - `attrs`: includes the outer and the inner attr. @@ -345,7 +359,7 @@ impl<'a> Parser<'a> { num_calls, cursor_snapshot, break_last_token: self.break_last_token, - replace_ranges, + node_replacements, }); // If we support tokens and don't already have them, store the newly captured tokens. @@ -366,7 +380,7 @@ impl<'a> Parser<'a> { // What is the status here when parsing the example code at the top of this method? // // When parsing `g`, we add one entry: - // - The `start_pos..end_pos` (`3..33`) entry has a new `AttrsTarget` with: + // - The pushed entry (`ParserRange(3..33)`) has a new `AttrsTarget` with: // - `attrs`: includes the outer and the inner attr. // - `tokens`: lazy tokens for `g` (with its inner attr deleted). // @@ -377,12 +391,14 @@ impl<'a> Parser<'a> { // cfg-expand this AST node. let start_pos = if has_outer_attrs { attrs.start_pos } else { start_pos }; let target = AttrsTarget { attrs: ret.attrs().iter().cloned().collect(), tokens }; - self.capture_state.replace_ranges.push((start_pos..end_pos, Some(target))); + self.capture_state + .parser_replacements + .push((ParserRange(start_pos..end_pos), Some(target))); } else if matches!(self.capture_state.capturing, Capturing::No) { // Only clear the ranges once we've finished capturing entirely, i.e. we've finished // the outermost call to this method. - self.capture_state.replace_ranges.clear(); - self.capture_state.inner_attr_ranges.clear(); + self.capture_state.parser_replacements.clear(); + self.capture_state.inner_attr_parser_ranges.clear(); } Ok(ret) } diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs index 26ee5bfdee42c..722fb41cd81ec 100644 --- a/compiler/rustc_parse/src/parser/mod.rs +++ b/compiler/rustc_parse/src/parser/mod.rs @@ -192,24 +192,54 @@ struct ClosureSpans { body: Span, } -/// Indicates a range of tokens that should be replaced by -/// the tokens in the provided `AttrsTarget`. This is used in two -/// places during token collection: +/// A token range within a `Parser`'s full token stream. +#[derive(Clone, Debug)] +struct ParserRange(Range); + +/// A token range within an individual AST node's (lazy) token stream, i.e. +/// relative to that node's first token. Distinct from `ParserRange` so the two +/// kinds of range can't be mixed up. +#[derive(Clone, Debug)] +struct NodeRange(Range); + +/// Indicates a range of tokens that should be replaced by an `AttrsTarget` +/// (replacement) or be replaced by nothing (deletion). This is used in two +/// places during token collection. +/// +/// 1. Replacement. During the parsing of an AST node that may have a +/// `#[derive]` attribute, when we parse a nested AST node that has `#[cfg]` +/// or `#[cfg_attr]`, we replace the entire inner AST node with +/// `FlatToken::AttrsTarget`. This lets us perform eager cfg-expansion on an +/// `AttrTokenStream`. /// -/// 1. During the parsing of an AST node that may have a `#[derive]` -/// attribute, we parse a nested AST node that has `#[cfg]` or `#[cfg_attr]` -/// In this case, we use a `ReplaceRange` to replace the entire inner AST node -/// with `FlatToken::AttrsTarget`, allowing us to perform eager cfg-expansion -/// on an `AttrTokenStream`. +/// 2. Deletion. We delete inner attributes from all collected token streams, +/// and instead track them through the `attrs` field on the AST node. This +/// lets us manipulate them similarly to outer attributes. When we create a +/// `TokenStream`, the inner attributes are inserted into the proper place +/// in the token stream. /// -/// 2. When we parse an inner attribute while collecting tokens. We -/// remove inner attributes from the token stream entirely, and -/// instead track them through the `attrs` field on the AST node. -/// This allows us to easily manipulate them (for example, removing -/// the first macro inner attribute to invoke a proc-macro). -/// When create a `TokenStream`, the inner attributes get inserted -/// into the proper place in the token stream. -type ReplaceRange = (Range, Option); +/// Each replacement starts off in `ParserReplacement` form but is converted to +/// `NodeReplacement` form when it is attached to a single AST node, via +/// `LazyAttrTokenStreamImpl`. +type ParserReplacement = (ParserRange, Option); + +/// See the comment on `ParserReplacement`. +type NodeReplacement = (NodeRange, Option); + +impl NodeRange { + // Converts a range within a parser's tokens to a range within a + // node's tokens beginning at `start_pos`. + // + // For example, imagine a parser with 50 tokens in its token stream, a + // function that spans `ParserRange(20..40)` and an inner attribute within + // that function that spans `ParserRange(30..35)`. We would find the inner + // attribute's range within the function's tokens by subtracting 20, which + // is the position of the function's start token. This gives + // `NodeRange(10..15)`. + fn new(ParserRange(parser_range): ParserRange, start_pos: u32) -> NodeRange { + NodeRange((parser_range.start - start_pos)..(parser_range.end - start_pos)) + } +} /// Controls how we capture tokens. Capturing can be expensive, /// so we try to avoid performing capturing in cases where @@ -226,8 +256,8 @@ enum Capturing { #[derive(Clone, Debug)] struct CaptureState { capturing: Capturing, - replace_ranges: Vec, - inner_attr_ranges: FxHashMap>, + parser_replacements: Vec, + inner_attr_parser_ranges: FxHashMap, } /// Iterator over a `TokenStream` that produces `Token`s. It's a bit odd that @@ -417,8 +447,8 @@ impl<'a> Parser<'a> { subparser_name, capture_state: CaptureState { capturing: Capturing::No, - replace_ranges: Vec::new(), - inner_attr_ranges: Default::default(), + parser_replacements: Vec::new(), + inner_attr_parser_ranges: Default::default(), }, current_closure: None, recovery: Recovery::Allowed,