Skip to content

Commit

Permalink
Distinguish the two kinds of token range.
Browse files Browse the repository at this point in the history
When collecting tokens there are two kinds of range:
- a range relative to the parser's full token stream (which we get when
  we are parsing);
- a range relative to a single AST node's token stream (which we use
  within `LazyAttrTokenStreamImpl` when replacing tokens).

These are currently both represented with `Range<u32>` and it's easy to
mix them up -- until now I hadn't properly understood the difference.

This commit introduces `ParserRange` and `NodeRange` to distinguish
them. This also requires splitting `ReplaceRange` in two, giving the new
types `ParserReplacement` and `NodeReplacement`. (These latter two names
reduce the overloading of the word "range".)

The commit also rewrites some comments to be clearer.

The end result is a little more verbose, but much clearer.
  • Loading branch information
nnethercote committed Aug 1, 2024
1 parent 9d77d17 commit d1f05fd
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 78 deletions.
2 changes: 1 addition & 1 deletion compiler/rustc_builtin_macros/src/cfg_eval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ impl CfgEval<'_> {
}

// Now that we have our re-parsed `AttrTokenStream`, recursively configuring
// our attribute target will correctly the tokens as well.
// our attribute target will correctly configure the tokens as well.
flat_map_annotatable(self, annotatable)
}
}
Expand Down
6 changes: 3 additions & 3 deletions compiler/rustc_parse/src/parser/attr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use rustc_span::{sym, BytePos, Span};
use thin_vec::ThinVec;
use tracing::debug;

use super::{AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, PathStyle};
use super::{AttrWrapper, Capturing, FnParseMode, ForceCollect, Parser, ParserRange, PathStyle};
use crate::{errors, fluent_generated as fluent, maybe_whole};

// Public for rustfmt usage
Expand Down Expand Up @@ -307,8 +307,8 @@ impl<'a> Parser<'a> {
// inner attribute, for possible later processing in a `LazyAttrTokenStream`.
if let Capturing::Yes = self.capture_state.capturing {
let end_pos = self.num_bump_calls;
let range = start_pos..end_pos;
self.capture_state.inner_attr_ranges.insert(attr.id, range);
let parser_range = ParserRange(start_pos..end_pos);
self.capture_state.inner_attr_parser_ranges.insert(attr.id, parser_range);
}
attrs.push(attr);
} else {
Expand Down
124 changes: 70 additions & 54 deletions compiler/rustc_parse/src/parser/attr_wrapper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ use rustc_errors::PResult;
use rustc_session::parse::ParseSess;
use rustc_span::{sym, Span, DUMMY_SP};

use super::{Capturing, FlatToken, ForceCollect, Parser, ReplaceRange, TokenCursor};
use super::{
Capturing, FlatToken, ForceCollect, NodeRange, NodeReplacement, Parser, ParserRange,
TokenCursor,
};

/// A wrapper type to ensure that the parser handles outer attributes correctly.
/// When we parse outer attributes, we need to ensure that we capture tokens
Expand All @@ -28,8 +31,8 @@ use super::{Capturing, FlatToken, ForceCollect, Parser, ReplaceRange, TokenCurso
#[derive(Debug, Clone)]
pub struct AttrWrapper {
attrs: AttrVec,
// The start of the outer attributes in the token cursor.
// This allows us to create a `ReplaceRange` for the entire attribute
// The start of the outer attributes in the parser's token stream.
// This lets us create a `NodeReplacement` for the entire attribute
// target, including outer attributes.
start_pos: u32,
}
Expand Down Expand Up @@ -88,7 +91,7 @@ struct LazyAttrTokenStreamImpl {
cursor_snapshot: TokenCursor,
num_calls: u32,
break_last_token: bool,
replace_ranges: Box<[ReplaceRange]>,
node_replacements: Box<[NodeReplacement]>,
}

impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
Expand All @@ -103,21 +106,24 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
.chain(iter::repeat_with(|| FlatToken::Token(cursor_snapshot.next())))
.take(self.num_calls as usize);

if self.replace_ranges.is_empty() {
if self.node_replacements.is_empty() {
make_attr_token_stream(tokens, self.break_last_token)
} else {
let mut tokens: Vec<_> = tokens.collect();
let mut replace_ranges = self.replace_ranges.to_vec();
replace_ranges.sort_by_key(|(range, _)| range.start);
let mut node_replacements = self.node_replacements.to_vec();
node_replacements.sort_by_key(|(range, _)| range.0.start);

#[cfg(debug_assertions)]
for [(range, tokens), (next_range, next_tokens)] in replace_ranges.array_windows() {
for [(node_range, tokens), (next_node_range, next_tokens)] in
node_replacements.array_windows()
{
assert!(
range.end <= next_range.start || range.end >= next_range.end,
"Replace ranges should either be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})",
range,
node_range.0.end <= next_node_range.0.start
|| node_range.0.end >= next_node_range.0.end,
"Node ranges should be disjoint or nested: ({:?}, {:?}) ({:?}, {:?})",
node_range,
tokens,
next_range,
next_node_range,
next_tokens,
);
}
Expand All @@ -135,20 +141,23 @@ impl ToAttrTokenStream for LazyAttrTokenStreamImpl {
// start position, we ensure that any (outer) replace range which
// encloses another (inner) replace range will fully overwrite the
// inner range's replacement.
for (range, target) in replace_ranges.into_iter().rev() {
assert!(!range.is_empty(), "Cannot replace an empty range: {range:?}");
for (node_range, target) in node_replacements.into_iter().rev() {
assert!(
!node_range.0.is_empty(),
"Cannot replace an empty node range: {:?}",
node_range.0
);

// Replace the tokens in range with zero or one `FlatToken::AttrsTarget`s, plus
// enough `FlatToken::Empty`s to fill up the rest of the range. This keeps the
// total length of `tokens` constant throughout the replacement process, allowing
// us to use all of the `ReplaceRanges` entries without adjusting indices.
// us to do all replacements without adjusting indices.
let target_len = target.is_some() as usize;
tokens.splice(
(range.start as usize)..(range.end as usize),
target
.into_iter()
.map(|target| FlatToken::AttrsTarget(target))
.chain(iter::repeat(FlatToken::Empty).take(range.len() - target_len)),
(node_range.0.start as usize)..(node_range.0.end as usize),
target.into_iter().map(|target| FlatToken::AttrsTarget(target)).chain(
iter::repeat(FlatToken::Empty).take(node_range.0.len() - target_len),
),
);
}
make_attr_token_stream(tokens.into_iter(), self.break_last_token)
Expand Down Expand Up @@ -215,7 +224,7 @@ impl<'a> Parser<'a> {
let cursor_snapshot = self.token_cursor.clone();
let start_pos = self.num_bump_calls;
let has_outer_attrs = !attrs.attrs.is_empty();
let replace_ranges_start = self.capture_state.replace_ranges.len();
let parser_replacements_start = self.capture_state.parser_replacements.len();

// We set and restore `Capturing::Yes` on either side of the call to
// `f`, so we can distinguish the outermost call to
Expand Down Expand Up @@ -270,7 +279,7 @@ impl<'a> Parser<'a> {
return Ok(ret);
}

let replace_ranges_end = self.capture_state.replace_ranges.len();
let parser_replacements_end = self.capture_state.parser_replacements.len();

assert!(
!(self.break_last_token && capture_trailing),
Expand All @@ -287,15 +296,16 @@ impl<'a> Parser<'a> {

let num_calls = end_pos - start_pos;

// Take the captured ranges for any inner attributes that we parsed in
// `Parser::parse_inner_attributes`, and pair them in a `ReplaceRange`
// with `None`, which means the relevant tokens will be removed. (More
// details below.)
let mut inner_attr_replace_ranges = Vec::new();
// Take the captured `ParserRange`s for any inner attributes that we parsed in
// `Parser::parse_inner_attributes`, and pair them in a `ParserReplacement` with `None`,
// which means the relevant tokens will be removed. (More details below.)
let mut inner_attr_parser_replacements = Vec::new();
for attr in ret.attrs() {
if attr.style == ast::AttrStyle::Inner {
if let Some(attr_range) = self.capture_state.inner_attr_ranges.remove(&attr.id) {
inner_attr_replace_ranges.push((attr_range, None));
if let Some(inner_attr_parser_range) =
self.capture_state.inner_attr_parser_ranges.remove(&attr.id)
{
inner_attr_parser_replacements.push((inner_attr_parser_range, None));
} else {
self.dcx().span_delayed_bug(attr.span, "Missing token range for attribute");
}
Expand All @@ -304,37 +314,41 @@ impl<'a> Parser<'a> {

// This is hot enough for `deep-vector` that checking the conditions for an empty iterator
// is measurably faster than actually executing the iterator.
let replace_ranges: Box<[ReplaceRange]> =
if replace_ranges_start == replace_ranges_end && inner_attr_replace_ranges.is_empty() {
Box::new([])
} else {
// Grab any replace ranges that occur *inside* the current AST node. We will
// perform the actual replacement only when we convert the `LazyAttrTokenStream` to
// an `AttrTokenStream`.
self.capture_state.replace_ranges[replace_ranges_start..replace_ranges_end]
.iter()
.cloned()
.chain(inner_attr_replace_ranges.iter().cloned())
.map(|(range, data)| ((range.start - start_pos)..(range.end - start_pos), data))
.collect()
};
let node_replacements: Box<[_]> = if parser_replacements_start == parser_replacements_end
&& inner_attr_parser_replacements.is_empty()
{
Box::new([])
} else {
// Grab any replace ranges that occur *inside* the current AST node. Convert them
// from `ParserRange` form to `NodeRange` form. We will perform the actual
// replacement only when we convert the `LazyAttrTokenStream` to an
// `AttrTokenStream`.
self.capture_state.parser_replacements
[parser_replacements_start..parser_replacements_end]
.iter()
.cloned()
.chain(inner_attr_parser_replacements.iter().cloned())
.map(|(parser_range, data)| (NodeRange::new(parser_range, start_pos), data))
.collect()
};

// What is the status here when parsing the example code at the top of this method?
//
// When parsing `g`:
// - `start_pos..end_pos` is `12..33` (`fn g { ... }`, excluding the outer attr).
// - `inner_attr_replace_ranges` has one entry (`5..15`, when counting from `fn`), to
// - `inner_attr_parser_replacements` has one entry (`ParserRange(17..27)`), to
// delete the inner attr's tokens.
// - This entry is put into the lazy tokens for `g`, i.e. deleting the inner attr from
// those tokens (if they get evaluated).
// - This entry is converted to `NodeRange(5..15)` (relative to the `fn`) and put into
// the lazy tokens for `g`, i.e. deleting the inner attr from those tokens (if they get
// evaluated).
// - Those lazy tokens are also put into an `AttrsTarget` that is appended to `self`'s
// replace ranges at the bottom of this function, for processing when parsing `m`.
// - `replace_ranges_start..replace_ranges_end` is empty.
// - `parser_replacements_start..parser_replacements_end` is empty.
//
// When parsing `m`:
// - `start_pos..end_pos` is `0..34` (`mod m`, excluding the `#[cfg_eval]` attribute).
// - `inner_attr_replace_ranges` is empty.
// - `replace_range_start..replace_ranges_end` has one entry.
// - `inner_attr_parser_replacements` is empty.
// - `parser_replacements_start..parser_replacements_end` has one entry.
// - One `AttrsTarget` (added below when parsing `g`) to replace all of `g` (`3..33`,
// including its outer attribute), with:
// - `attrs`: includes the outer and the inner attr.
Expand All @@ -345,7 +359,7 @@ impl<'a> Parser<'a> {
num_calls,
cursor_snapshot,
break_last_token: self.break_last_token,
replace_ranges,
node_replacements,
});

// If we support tokens and don't already have them, store the newly captured tokens.
Expand All @@ -366,7 +380,7 @@ impl<'a> Parser<'a> {
// What is the status here when parsing the example code at the top of this method?
//
// When parsing `g`, we add one entry:
// - The `start_pos..end_pos` (`3..33`) entry has a new `AttrsTarget` with:
// - The pushed entry (`ParserRange(3..33)`) has a new `AttrsTarget` with:
// - `attrs`: includes the outer and the inner attr.
// - `tokens`: lazy tokens for `g` (with its inner attr deleted).
//
Expand All @@ -377,12 +391,14 @@ impl<'a> Parser<'a> {
// cfg-expand this AST node.
let start_pos = if has_outer_attrs { attrs.start_pos } else { start_pos };
let target = AttrsTarget { attrs: ret.attrs().iter().cloned().collect(), tokens };
self.capture_state.replace_ranges.push((start_pos..end_pos, Some(target)));
self.capture_state
.parser_replacements
.push((ParserRange(start_pos..end_pos), Some(target)));
} else if matches!(self.capture_state.capturing, Capturing::No) {
// Only clear the ranges once we've finished capturing entirely, i.e. we've finished
// the outermost call to this method.
self.capture_state.replace_ranges.clear();
self.capture_state.inner_attr_ranges.clear();
self.capture_state.parser_replacements.clear();
self.capture_state.inner_attr_parser_ranges.clear();
}
Ok(ret)
}
Expand Down
70 changes: 50 additions & 20 deletions compiler/rustc_parse/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,24 +192,54 @@ struct ClosureSpans {
body: Span,
}

/// Indicates a range of tokens that should be replaced by
/// the tokens in the provided `AttrsTarget`. This is used in two
/// places during token collection:
/// A token range within a `Parser`'s full token stream.
#[derive(Clone, Debug)]
struct ParserRange(Range<u32>);

/// A token range within an individual AST node's (lazy) token stream, i.e.
/// relative to that node's first token. Distinct from `ParserRange` so the two
/// kinds of range can't be mixed up.
#[derive(Clone, Debug)]
struct NodeRange(Range<u32>);

/// Indicates a range of tokens that should be replaced by an `AttrsTarget`
/// (replacement) or be replaced by nothing (deletion). This is used in two
/// places during token collection.
///
/// 1. Replacement. During the parsing of an AST node that may have a
/// `#[derive]` attribute, when we parse a nested AST node that has `#[cfg]`
/// or `#[cfg_attr]`, we replace the entire inner AST node with
/// `FlatToken::AttrsTarget`. This lets us perform eager cfg-expansion on an
/// `AttrTokenStream`.
///
/// 1. During the parsing of an AST node that may have a `#[derive]`
/// attribute, we parse a nested AST node that has `#[cfg]` or `#[cfg_attr]`
/// In this case, we use a `ReplaceRange` to replace the entire inner AST node
/// with `FlatToken::AttrsTarget`, allowing us to perform eager cfg-expansion
/// on an `AttrTokenStream`.
/// 2. Deletion. We delete inner attributes from all collected token streams,
/// and instead track them through the `attrs` field on the AST node. This
/// lets us manipulate them similarly to outer attributes. When we create a
/// `TokenStream`, the inner attributes are inserted into the proper place
/// in the token stream.
///
/// 2. When we parse an inner attribute while collecting tokens. We
/// remove inner attributes from the token stream entirely, and
/// instead track them through the `attrs` field on the AST node.
/// This allows us to easily manipulate them (for example, removing
/// the first macro inner attribute to invoke a proc-macro).
/// When create a `TokenStream`, the inner attributes get inserted
/// into the proper place in the token stream.
type ReplaceRange = (Range<u32>, Option<AttrsTarget>);
/// Each replacement starts off in `ParserReplacement` form but is converted to
/// `NodeReplacement` form when it is attached to a single AST node, via
/// `LazyAttrTokenStreamImpl`.
type ParserReplacement = (ParserRange, Option<AttrsTarget>);

/// See the comment on `ParserReplacement`.
type NodeReplacement = (NodeRange, Option<AttrsTarget>);

impl NodeRange {
// Converts a range within a parser's tokens to a range within a
// node's tokens beginning at `start_pos`.
//
// For example, imagine a parser with 50 tokens in its token stream, a
// function that spans `ParserRange(20..40)` and an inner attribute within
// that function that spans `ParserRange(30..35)`. We would find the inner
// attribute's range within the function's tokens by subtracting 20, which
// is the position of the function's start token. This gives
// `NodeRange(10..15)`.
fn new(ParserRange(parser_range): ParserRange, start_pos: u32) -> NodeRange {
NodeRange((parser_range.start - start_pos)..(parser_range.end - start_pos))
}
}

/// Controls how we capture tokens. Capturing can be expensive,
/// so we try to avoid performing capturing in cases where
Expand All @@ -226,8 +256,8 @@ enum Capturing {
#[derive(Clone, Debug)]
struct CaptureState {
capturing: Capturing,
replace_ranges: Vec<ReplaceRange>,
inner_attr_ranges: FxHashMap<AttrId, Range<u32>>,
parser_replacements: Vec<ParserReplacement>,
inner_attr_parser_ranges: FxHashMap<AttrId, ParserRange>,
}

/// Iterator over a `TokenStream` that produces `Token`s. It's a bit odd that
Expand Down Expand Up @@ -417,8 +447,8 @@ impl<'a> Parser<'a> {
subparser_name,
capture_state: CaptureState {
capturing: Capturing::No,
replace_ranges: Vec::new(),
inner_attr_ranges: Default::default(),
parser_replacements: Vec::new(),
inner_attr_parser_ranges: Default::default(),
},
current_closure: None,
recovery: Recovery::Allowed,
Expand Down

0 comments on commit d1f05fd

Please sign in to comment.