diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index c02abe6b89f97..afef307a0ed37 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -352,7 +352,6 @@ impl Cursor<'_> { loop { match self.nth_char(0) { '\n' => break, - '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, _ => { self.bump(); @@ -525,7 +524,6 @@ impl Cursor<'_> { match self.nth_char(0) { '/' if !first => break, '\n' if self.nth_char(1) != '\'' => break, - '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, '\'' => { self.bump(); diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs index d8e00d4c7c5ea..c709b7526082f 100644 --- a/src/librustc_lexer/src/unescape.rs +++ b/src/librustc_lexer/src/unescape.rs @@ -128,11 +128,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result Err(EscapeError::EscapeOnlyChar), - '\r' => Err(if chars.clone().next() == Some('\n') { - EscapeError::EscapeOnlyChar - } else { - EscapeError::BareCarriageReturn - }), + '\r' => Err(EscapeError::BareCarriageReturn), '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), _ => { @@ -244,27 +240,15 @@ where let unescaped_char = match first_char { '\\' => { - let (second_char, third_char) = { - let mut chars = chars.clone(); - (chars.next(), chars.next()) - }; - match (second_char, third_char) { - (Some('\n'), _) | (Some('\r'), Some('\n')) => { + let second_char = chars.clone().next(); + match second_char { + Some('\n') => { skip_ascii_whitespace(&mut chars); continue; } _ => scan_escape(first_char, &mut chars, mode), } } - '\r' => { - let second_char = chars.clone().next(); - if second_char == Some('\n') { - chars.next(); - Ok('\n') - } else { - scan_escape(first_char, &mut chars, mode) - } - } '\n' => Ok('\n'), '\t' => Ok('\t'), _ => scan_escape(first_char, &mut chars, mode), @@ -298,15 +282,11 @@ where while let Some(curr) = chars.next() { let start = initial_len - chars.as_str().len() - curr.len_utf8(); - let result = match (curr, chars.clone().next()) { - ('\r', Some('\n')) => { - chars.next(); - Ok('\n') - }, - ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString), - (c, _) if mode.is_bytes() && !c.is_ascii() => + let result = match curr { + '\r' => Err(EscapeError::BareCarriageReturnInRawString), + c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), - (c, _) => Ok(c), + c => Ok(c), }; let end = initial_len - chars.as_str().len(); diff --git a/src/librustc_lexer/src/unescape/tests.rs b/src/librustc_lexer/src/unescape/tests.rs index 496527eb265b0..e7b1ff6479d88 100644 --- a/src/librustc_lexer/src/unescape/tests.rs +++ b/src/librustc_lexer/src/unescape/tests.rs @@ -11,7 +11,6 @@ fn test_unescape_char_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); - check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -31,6 +30,7 @@ fn test_unescape_char_bad() { check(r"\v", EscapeError::InvalidEscape); check(r"\💩", EscapeError::InvalidEscape); check(r"\●", EscapeError::InvalidEscape); + check("\\\r", EscapeError::InvalidEscape); check(r"\x", EscapeError::TooShortHexEscape); check(r"\x0", EscapeError::TooShortHexEscape); @@ -116,10 +116,9 @@ fn test_unescape_str_good() { check("foo", "foo"); check("", ""); - check(" \t\n\r\n", " \t\n\n"); + check(" \t\n", " \t\n"); check("hello \\\n world", "hello world"); - check("hello \\\r\n world", "hello world"); check("thread's", "thread's") } @@ -134,7 +133,6 @@ fn test_unescape_byte_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); - check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -238,10 +236,9 @@ fn test_unescape_byte_str_good() { check("foo", b"foo"); check("", b""); - check(" \t\n\r\n", b" \t\n\n"); + check(" \t\n", b" \t\n"); check("hello \\\n world", b"hello world"); - check("hello \\\r\n world", b"hello world"); check("thread's", b"thread's") } @@ -253,7 +250,6 @@ fn test_unescape_raw_str() { assert_eq!(unescaped, expected); } - check("\r\n", &[(0..2, Ok('\n'))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); } @@ -266,7 +262,6 @@ fn test_unescape_raw_byte_str() { assert_eq!(unescaped, expected); } - check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); check( diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 17629d392cd3d..bdf468a52bb39 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -8,9 +8,7 @@ use syntax_pos::{BytePos, Pos, Span}; use rustc_lexer::Base; use rustc_lexer::unescape; -use std::borrow::Cow; use std::char; -use std::iter; use std::convert::TryInto; use rustc_data_structures::sync::Lrc; use log::debug; @@ -181,18 +179,7 @@ impl<'a> StringReader<'a> { let string = self.str_from(start); // comments with only more "/"s are not doc comments let tok = if is_doc_comment(string) { - let mut idx = 0; - loop { - idx = match string[idx..].find('\r') { - None => break, - Some(it) => idx + it + 1 - }; - if string[idx..].chars().next() != Some('\n') { - self.err_span_(start + BytePos(idx as u32 - 1), - start + BytePos(idx as u32), - "bare CR not allowed in doc-comment"); - } - } + self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment"); token::DocComment(Symbol::intern(string)) } else { token::Comment @@ -217,15 +204,10 @@ impl<'a> StringReader<'a> { } let tok = if is_doc_comment { - let has_cr = string.contains('\r'); - let string = if has_cr { - self.translate_crlf(start, - string, - "bare CR not allowed in block doc-comment") - } else { - string.into() - }; - token::DocComment(Symbol::intern(&string[..])) + self.forbid_bare_cr(start, + string, + "bare CR not allowed in block doc-comment"); + token::DocComment(Symbol::intern(string)) } else { token::Comment }; @@ -516,49 +498,16 @@ impl<'a> StringReader<'a> { &self.src[self.src_index(start)..self.src_index(end)] } - /// Converts CRLF to LF in the given string, raising an error on bare CR. - fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> { - let mut chars = s.char_indices().peekable(); - while let Some((i, ch)) = chars.next() { - if ch == '\r' { - if let Some((lf_idx, '\n')) = chars.peek() { - return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into(); - } - let pos = start + BytePos(i as u32); - let end_pos = start + BytePos((i + ch.len_utf8()) as u32); - self.err_span_(pos, end_pos, errmsg); - } - } - return s.into(); - - fn translate_crlf_(rdr: &StringReader<'_>, - start: BytePos, - s: &str, - mut j: usize, - mut chars: iter::Peekable>, - errmsg: &str) - -> String { - let mut buf = String::with_capacity(s.len()); - // Skip first CR - buf.push_str(&s[.. j - 1]); - while let Some((i, ch)) = chars.next() { - if ch == '\r' { - if j < i { - buf.push_str(&s[j..i]); - } - let next = i + ch.len_utf8(); - j = next; - if chars.peek().map(|(_, ch)| *ch) != Some('\n') { - let pos = start + BytePos(i as u32); - let end_pos = start + BytePos(next as u32); - rdr.err_span_(pos, end_pos, errmsg); - } - } - } - if j < s.len() { - buf.push_str(&s[j..]); - } - buf + fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) { + let mut idx = 0; + loop { + idx = match s[idx..].find('\r') { + None => break, + Some(it) => idx + it + 1 + }; + self.err_span_(start + BytePos(idx as u32 - 1), + start + BytePos(idx as u32), + errmsg); } } diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index aa36fe27d8ec4..a17cd7625fb19 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -1043,6 +1043,7 @@ impl SourceFile { mut src: String, start_pos: BytePos) -> Result { remove_bom(&mut src); + normalize_newlines(&mut src); let src_hash = { let mut hasher: StableHasher = StableHasher::new(); @@ -1210,6 +1211,61 @@ fn remove_bom(src: &mut String) { } } + +/// Replaces `\r\n` with `\n` in-place in `src`. +/// +/// Returns error if there's a lone `\r` in the string +fn normalize_newlines(src: &mut String) { + if !src.as_bytes().contains(&b'\r') { + return; + } + + // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding. + // While we *can* call `as_mut_vec` and do surgery on the live string + // directly, let's rather steal the contents of `src`. This makes the code + // safe even if a panic occurs. + + let mut buf = std::mem::replace(src, String::new()).into_bytes(); + let mut gap_len = 0; + let mut tail = buf.as_mut_slice(); + loop { + let idx = match find_crlf(&tail[gap_len..]) { + None => tail.len(), + Some(idx) => idx + gap_len, + }; + tail.copy_within(gap_len..idx, 0); + tail = &mut tail[idx - gap_len..]; + if tail.len() == gap_len { + break; + } + gap_len += 1; + } + + // Account for removed `\r`. + // After `set_len`, `buf` is guaranteed to contain utf-8 again. + let new_len = buf.len() - gap_len; + unsafe { + buf.set_len(new_len); + *src = String::from_utf8_unchecked(buf); + } + + fn find_crlf(src: &[u8]) -> Option { + let mut search_idx = 0; + while let Some(idx) = find_cr(&src[search_idx..]) { + if src[search_idx..].get(idx + 1) != Some(&b'\n') { + search_idx += idx + 1; + continue; + } + return Some(search_idx + idx); + } + None + } + + fn find_cr(src: &[u8]) -> Option { + src.iter().position(|&b| b == b'\r') + } +} + // _____________________________________________________________________________ // Pos, BytePos, CharPos // diff --git a/src/libsyntax_pos/tests.rs b/src/libsyntax_pos/tests.rs index 78c4e18e6aee0..6bd6016020a27 100644 --- a/src/libsyntax_pos/tests.rs +++ b/src/libsyntax_pos/tests.rs @@ -16,3 +16,23 @@ fn test_lookup_line() { assert_eq!(lookup_line(lines, BytePos(28)), 2); assert_eq!(lookup_line(lines, BytePos(29)), 2); } + +#[test] +fn test_normalize_newlines() { + fn check(before: &str, after: &str) { + let mut actual = before.to_string(); + normalize_newlines(&mut actual); + assert_eq!(actual.as_str(), after); + } + check("", ""); + check("\n", "\n"); + check("\r", "\r"); + check("\r\r", "\r\r"); + check("\r\n", "\n"); + check("hello world", "hello world"); + check("hello\nworld", "hello\nworld"); + check("hello\r\nworld", "hello\nworld"); + check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n"); + check("\r\r\n", "\r\n"); + check("hello\rworld", "hello\rworld"); +}