From 4465b2fbf3487e8f45e2eee4da187776574febec Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 3 May 2018 17:14:04 +1000 Subject: [PATCH 1/6] Inline `char_at()` and `record_width`. Because `bump()` is hot. --- src/libsyntax/str.rs | 1 + src/libsyntax_pos/lib.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/src/libsyntax/str.rs b/src/libsyntax/str.rs index d0f47629b10e5..281861918fd8e 100644 --- a/src/libsyntax/str.rs +++ b/src/libsyntax/str.rs @@ -8,6 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +#[inline] pub fn char_at(s: &str, byte: usize) -> char { s[byte..].chars().next().unwrap() } diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index d30d3d78ca540..2bab958dceda8 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -1047,6 +1047,7 @@ impl FileMap { self.multibyte_chars.borrow_mut().push(mbc); } + #[inline] pub fn record_width(&self, pos: BytePos, ch: char) { let width = match ch { '\t' => From b1aae607c56d26333589dc45daa20859950bf6e1 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 4 May 2018 06:38:15 +1000 Subject: [PATCH 2/6] Tweak naming and ordering in `StringReader::bump()`. This patch removes the "old"/"new" names in favour of "foo"/"next_foo", which matches the field names. It also moves the setting of `self.{ch,pos,next_pos}` in the common case to the end, so that the meaning of "foo"/"next_foo" is consistent until the end. --- src/libsyntax/parse/lexer/mod.rs | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 22a0261d8c6b1..a24af857af142 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -442,36 +442,35 @@ impl<'a> StringReader<'a> { /// Advance the StringReader by one character. If a newline is /// discovered, add it to the FileMap's list of line start offsets. pub fn bump(&mut self) { - let new_pos = self.next_pos; - let new_byte_offset = self.byte_offset(new_pos).to_usize(); + let next_byte_offset = self.byte_offset(self.next_pos).to_usize(); let end = self.terminator.map_or(self.source_text.len(), |t| { self.byte_offset(t).to_usize() }); - if new_byte_offset < end { - let old_ch_is_newline = self.ch.unwrap() == '\n'; - let new_ch = char_at(&self.source_text, new_byte_offset); - let new_ch_len = new_ch.len_utf8(); - - self.ch = Some(new_ch); - self.pos = new_pos; - self.next_pos = new_pos + Pos::from_usize(new_ch_len); - if old_ch_is_newline { + if next_byte_offset < end { + let next_ch = char_at(&self.source_text, next_byte_offset); + let next_ch_len = next_ch.len_utf8(); + + if self.ch.unwrap() == '\n' { if self.save_new_lines_and_multibyte { - self.filemap.next_line(self.pos); + self.filemap.next_line(self.next_pos); } self.col = CharPos(0); } else { self.col = self.col + CharPos(1); } - if new_ch_len > 1 { + if next_ch_len > 1 { if self.save_new_lines_and_multibyte { - self.filemap.record_multibyte_char(self.pos, new_ch_len); + self.filemap.record_multibyte_char(self.next_pos, next_ch_len); } } - self.filemap.record_width(self.pos, new_ch); + self.filemap.record_width(self.next_pos, next_ch); + + self.ch = Some(next_ch); + self.pos = self.next_pos; + self.next_pos = self.next_pos + Pos::from_usize(next_ch_len); } else { self.ch = None; - self.pos = new_pos; + self.pos = self.next_pos; } } From 7a090fbe02b6b20f0dce8acf07a0328375f91d68 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 9 May 2018 12:49:39 +1000 Subject: [PATCH 3/6] Rename some stuff in `StringReader`. - `source_text` becomes `src`, matching `FileMap::src`. - `byte_offset()` becomes `src_index()`, which makes it clearer that it's an index into `src`. (Likewise for variables containing `byte_offset` in their name.) This function also now returns a `usize` instead of a `BytePos`, because every callsite immediately converted the `BytePos` to a `usize`. --- src/libsyntax/parse/lexer/mod.rs | 53 +++++++++++++++----------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index a24af857af142..e2030ff62c579 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -61,7 +61,7 @@ pub struct StringReader<'a> { pub fatal_errs: Vec>, // cache a direct reference to the source text, so that we don't have to // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time. - source_text: Lrc, + src: Lrc, /// Stack of open delimiters and their spans. Used for error message. token: token::Token, span: Span, @@ -176,7 +176,7 @@ impl<'a> StringReader<'a> { filemap.name)); } - let source_text = (*filemap.src.as_ref().unwrap()).clone(); + let src = (*filemap.src.as_ref().unwrap()).clone(); StringReader { sess, @@ -190,7 +190,7 @@ impl<'a> StringReader<'a> { // dummy values; not read peek_tok: token::Eof, peek_span: syntax_pos::DUMMY_SP, - source_text, + src, fatal_errs: Vec::new(), token: token::Eof, span: syntax_pos::DUMMY_SP, @@ -326,9 +326,7 @@ impl<'a> StringReader<'a> { /// offending string to the error message fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError { m.push_str(": "); - let from = self.byte_offset(from_pos).to_usize(); - let to = self.byte_offset(to_pos).to_usize(); - m.push_str(&self.source_text[from..to]); + m.push_str(&self.src[self.src_index(from_pos)..self.src_index(to_pos)]); self.fatal_span_(from_pos, to_pos, &m[..]) } @@ -354,8 +352,9 @@ impl<'a> StringReader<'a> { Ok(()) } - fn byte_offset(&self, pos: BytePos) -> BytePos { - (pos - self.filemap.start_pos) + #[inline] + fn src_index(&self, pos: BytePos) -> usize { + (pos - self.filemap.start_pos).to_usize() } /// Calls `f` with a string slice of the source text spanning from `start` @@ -386,7 +385,7 @@ impl<'a> StringReader<'a> { fn with_str_from_to(&self, start: BytePos, end: BytePos, f: F) -> T where F: FnOnce(&str) -> T { - f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()]) + f(&self.src[self.src_index(start)..self.src_index(end)]) } /// Converts CRLF to LF in the given string, raising an error on bare CR. @@ -438,16 +437,13 @@ impl<'a> StringReader<'a> { } } - /// Advance the StringReader by one character. If a newline is /// discovered, add it to the FileMap's list of line start offsets. pub fn bump(&mut self) { - let next_byte_offset = self.byte_offset(self.next_pos).to_usize(); - let end = self.terminator.map_or(self.source_text.len(), |t| { - self.byte_offset(t).to_usize() - }); - if next_byte_offset < end { - let next_ch = char_at(&self.source_text, next_byte_offset); + let next_src_index = self.src_index(self.next_pos); + let end_src_index = self.terminator.map_or(self.src.len(), |t| self.src_index(t)); + if next_src_index < end_src_index { + let next_ch = char_at(&self.src, next_src_index); let next_ch_len = next_ch.len_utf8(); if self.ch.unwrap() == '\n' { @@ -475,9 +471,9 @@ impl<'a> StringReader<'a> { } pub fn nextch(&self) -> Option { - let offset = self.byte_offset(self.next_pos).to_usize(); - if offset < self.source_text.len() { - Some(char_at(&self.source_text, offset)) + let next_src_index = self.src_index(self.next_pos); + if next_src_index < self.src.len() { + Some(char_at(&self.src, next_src_index)) } else { None } @@ -488,14 +484,14 @@ impl<'a> StringReader<'a> { } pub fn nextnextch(&self) -> Option { - let offset = self.byte_offset(self.next_pos).to_usize(); - let s = &self.source_text[..]; - if offset >= s.len() { + let next_src_index = self.src_index(self.next_pos); + let s = &self.src[..]; + if next_src_index >= s.len() { return None; } - let next = offset + char_at(s, offset).len_utf8(); - if next < s.len() { - Some(char_at(s, next)) + let next_next_src_index = next_src_index + char_at(s, next_src_index).len_utf8(); + if next_next_src_index < s.len() { + Some(char_at(s, next_next_src_index)) } else { None } @@ -1358,8 +1354,8 @@ impl<'a> StringReader<'a> { loop { self.bump(); if self.ch_is('\'') { - let start = self.byte_offset(start).to_usize(); - let end = self.byte_offset(self.pos).to_usize(); + let start = self.src_index(start); + let end = self.src_index(self.pos); self.bump(); let span = self.mk_sp(start_with_quote, self.pos); self.sess.span_diagnostic @@ -1368,8 +1364,7 @@ impl<'a> StringReader<'a> { .span_suggestion(span, "if you meant to write a `str` literal, \ use double quotes", - format!("\"{}\"", - &self.source_text[start..end])) + format!("\"{}\"", &self.src[start..end])) .emit(); return Ok(token::Literal(token::Str_(Symbol::intern("??")), None)) } From 548067e00f64861915d2374082adc7796c73ceb8 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 9 May 2018 12:55:13 +1000 Subject: [PATCH 4/6] Remove `StringReader::terminator`. It's silly for a hot function like `bump()` to have such an expensive bounds check. This patch replaces terminator with `end_src_index`. Note that the `self.terminator` check in `is_eof()` wasn't necessary because of the way `StringReader` is initialized. --- src/libsyntax/parse/lexer/mod.rs | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index e2030ff62c579..f80eaf0a9b446 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -49,8 +49,8 @@ pub struct StringReader<'a> { /// The current character (which has been read from self.pos) pub ch: Option, pub filemap: Lrc, - /// If Some, stop reading the source at this position (inclusive). - pub terminator: Option, + /// Stop reading src at this index. + pub end_src_index: usize, /// Whether to record new-lines and multibyte chars in filemap. /// This is only necessary the first time a filemap is lexed. /// If part of a filemap is being re-lexed, this should be set to false. @@ -113,14 +113,7 @@ impl<'a> StringReader<'a> { self.unwrap_or_abort(res) } fn is_eof(&self) -> bool { - if self.ch.is_none() { - return true; - } - - match self.terminator { - Some(t) => self.next_pos > t, - None => false, - } + self.ch.is_none() } /// Return the next token. EFFECT: advances the string_reader. pub fn try_next_token(&mut self) -> Result { @@ -185,7 +178,7 @@ impl<'a> StringReader<'a> { col: CharPos(0), ch: Some('\n'), filemap, - terminator: None, + end_src_index: src.len(), save_new_lines_and_multibyte: true, // dummy values; not read peek_tok: token::Eof, @@ -222,7 +215,7 @@ impl<'a> StringReader<'a> { // Seek the lexer to the right byte range. sr.save_new_lines_and_multibyte = false; sr.next_pos = span.lo(); - sr.terminator = Some(span.hi()); + sr.end_src_index = sr.src_index(span.hi()); sr.bump(); @@ -441,8 +434,7 @@ impl<'a> StringReader<'a> { /// discovered, add it to the FileMap's list of line start offsets. pub fn bump(&mut self) { let next_src_index = self.src_index(self.next_pos); - let end_src_index = self.terminator.map_or(self.src.len(), |t| self.src_index(t)); - if next_src_index < end_src_index { + if next_src_index < self.end_src_index { let next_ch = char_at(&self.src, next_src_index); let next_ch_len = next_ch.len_utf8(); @@ -472,7 +464,7 @@ impl<'a> StringReader<'a> { pub fn nextch(&self) -> Option { let next_src_index = self.src_index(self.next_pos); - if next_src_index < self.src.len() { + if next_src_index < self.end_src_index { Some(char_at(&self.src, next_src_index)) } else { None @@ -485,13 +477,12 @@ impl<'a> StringReader<'a> { pub fn nextnextch(&self) -> Option { let next_src_index = self.src_index(self.next_pos); - let s = &self.src[..]; - if next_src_index >= s.len() { + if next_src_index >= self.end_src_index { return None; } - let next_next_src_index = next_src_index + char_at(s, next_src_index).len_utf8(); - if next_next_src_index < s.len() { - Some(char_at(s, next_next_src_index)) + let next_next_src_index = next_src_index + char_at(&self.src, next_src_index).len_utf8(); + if next_next_src_index < self.end_src_index { + Some(char_at(&self.src, next_next_src_index)) } else { None } From 444b770f4cd8d817e7b7fec683ea301620034d13 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 9 May 2018 13:20:55 +1000 Subject: [PATCH 5/6] Make `nextnextch()` more closely resemble `nextch()`. --- src/libsyntax/parse/lexer/mod.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index f80eaf0a9b446..3968b7f113b95 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -477,15 +477,14 @@ impl<'a> StringReader<'a> { pub fn nextnextch(&self) -> Option { let next_src_index = self.src_index(self.next_pos); - if next_src_index >= self.end_src_index { - return None; - } - let next_next_src_index = next_src_index + char_at(&self.src, next_src_index).len_utf8(); - if next_next_src_index < self.end_src_index { - Some(char_at(&self.src, next_next_src_index)) - } else { - None + if next_src_index < self.end_src_index { + let next_next_src_index = + next_src_index + char_at(&self.src, next_src_index).len_utf8(); + if next_next_src_index < self.end_src_index { + return Some(char_at(&self.src, next_next_src_index)); + } } + None } pub fn nextnextch_is(&self, c: char) -> bool { From e913d692117e57ea2908461865aa08037f0bd2b9 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 8 May 2018 19:58:54 +1000 Subject: [PATCH 6/6] Remove `StringReader::col`. It only has a single use, within code handling indented block comments. We can replace that with the new `FileMap::col_pos()`, which computes the col position (BytePos instead of CharPos) based on the record of the last newline char (which we already record). This is actually an improvement, because `trim_whitespace_prefix_and_push_line()` was using `col`, which is a `CharPos`, as a slice index, which is a byte/char confusion. --- src/libsyntax/parse/lexer/comments.rs | 14 +++++++++++++- src/libsyntax/parse/lexer/mod.rs | 6 ------ src/libsyntax_pos/lib.rs | 9 +++++++++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs index 63aa5d28ce8dc..672b0b9bbd16d 100644 --- a/src/libsyntax/parse/lexer/comments.rs +++ b/src/libsyntax/parse/lexer/comments.rs @@ -238,7 +238,19 @@ fn read_block_comment(rdr: &mut StringReader, debug!(">>> block comment"); let p = rdr.pos; let mut lines: Vec = Vec::new(); - let col = rdr.col; + + // Count the number of chars since the start of the line by rescanning. + let mut src_index = rdr.src_index(rdr.filemap.line_begin_pos()); + let end_src_index = rdr.src_index(rdr.pos); + assert!(src_index <= end_src_index); + let mut n = 0; + while src_index < end_src_index { + let c = char_at(&rdr.src, src_index); + src_index += c.len_utf8(); + n += 1; + } + let col = CharPos(n); + rdr.bump(); rdr.bump(); diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 3968b7f113b95..566e0c213b119 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -44,8 +44,6 @@ pub struct StringReader<'a> { pub next_pos: BytePos, /// The absolute offset within the codemap of the current character pub pos: BytePos, - /// The column of the next character to read - pub col: CharPos, /// The current character (which has been read from self.pos) pub ch: Option, pub filemap: Lrc, @@ -175,7 +173,6 @@ impl<'a> StringReader<'a> { sess, next_pos: filemap.start_pos, pos: filemap.start_pos, - col: CharPos(0), ch: Some('\n'), filemap, end_src_index: src.len(), @@ -442,9 +439,6 @@ impl<'a> StringReader<'a> { if self.save_new_lines_and_multibyte { self.filemap.next_line(self.next_pos); } - self.col = CharPos(0); - } else { - self.col = self.col + CharPos(1); } if next_ch_len > 1 { if self.save_new_lines_and_multibyte { diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index 2bab958dceda8..26ab5d0a34ba5 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -971,6 +971,15 @@ impl FileMap { lines.push(pos); } + /// Return the BytePos of the beginning of the current line. + pub fn line_begin_pos(&self) -> BytePos { + let lines = self.lines.borrow(); + match lines.last() { + Some(&line_pos) => line_pos, + None => self.start_pos, + } + } + /// Add externally loaded source. /// If the hash of the input doesn't match or no input is supplied via None, /// it is interpreted as an error and the corresponding enum variant is set.