diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 30a5175d8cdb0..08608cfe98164 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -268,6 +268,7 @@ impl Cursor<'_> { loop { match self.nth_char(0) { '\n' => break, + '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, _ => { self.bump(); @@ -440,6 +441,7 @@ impl Cursor<'_> { match self.nth_char(0) { '/' if !first => break, '\n' if self.nth_char(1) != '\'' => break, + '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, '\'' => { self.bump(); diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs index c709b7526082f..d8e00d4c7c5ea 100644 --- a/src/librustc_lexer/src/unescape.rs +++ b/src/librustc_lexer/src/unescape.rs @@ -128,7 +128,11 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), + '\r' => Err(if chars.clone().next() == Some('\n') { + EscapeError::EscapeOnlyChar + } else { + EscapeError::BareCarriageReturn + }), '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), _ => { @@ -240,15 +244,27 @@ where let unescaped_char = match first_char { '\\' => { - let second_char = chars.clone().next(); - match second_char { - Some('\n') => { + let (second_char, third_char) = { + let mut chars = chars.clone(); + (chars.next(), chars.next()) + }; + match (second_char, third_char) { + (Some('\n'), _) | (Some('\r'), Some('\n')) => { skip_ascii_whitespace(&mut chars); continue; } _ => scan_escape(first_char, &mut chars, mode), } } + '\r' => { + let second_char = chars.clone().next(); + if second_char == Some('\n') { + chars.next(); + Ok('\n') + } else { + scan_escape(first_char, &mut chars, mode) + } + } '\n' => Ok('\n'), '\t' => Ok('\t'), _ => scan_escape(first_char, &mut chars, mode), @@ -282,11 +298,15 @@ where while let Some(curr) = chars.next() { let start = initial_len - chars.as_str().len() - curr.len_utf8(); - let result = match curr { - '\r' => Err(EscapeError::BareCarriageReturnInRawString), - c if mode.is_bytes() && !c.is_ascii() => + let result = match (curr, chars.clone().next()) { + ('\r', Some('\n')) => { + chars.next(); + Ok('\n') + }, + ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString), + (c, _) if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), - c => Ok(c), + (c, _) => Ok(c), }; let end = initial_len - chars.as_str().len(); diff --git a/src/librustc_lexer/src/unescape/tests.rs b/src/librustc_lexer/src/unescape/tests.rs index e7b1ff6479d88..496527eb265b0 100644 --- a/src/librustc_lexer/src/unescape/tests.rs +++ b/src/librustc_lexer/src/unescape/tests.rs @@ -11,6 +11,7 @@ fn test_unescape_char_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -30,7 +31,6 @@ fn test_unescape_char_bad() { check(r"\v", EscapeError::InvalidEscape); check(r"\💩", EscapeError::InvalidEscape); check(r"\●", EscapeError::InvalidEscape); - check("\\\r", EscapeError::InvalidEscape); check(r"\x", EscapeError::TooShortHexEscape); check(r"\x0", EscapeError::TooShortHexEscape); @@ -116,9 +116,10 @@ fn test_unescape_str_good() { check("foo", "foo"); check("", ""); - check(" \t\n", " \t\n"); + check(" \t\n\r\n", " \t\n\n"); check("hello \\\n world", "hello world"); + check("hello \\\r\n world", "hello world"); check("thread's", "thread's") } @@ -133,6 +134,7 @@ fn test_unescape_byte_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -236,9 +238,10 @@ fn test_unescape_byte_str_good() { check("foo", b"foo"); check("", b""); - check(" \t\n", b" \t\n"); + check(" \t\n\r\n", b" \t\n\n"); check("hello \\\n world", b"hello world"); + check("hello \\\r\n world", b"hello world"); check("thread's", b"thread's") } @@ -250,6 +253,7 @@ fn test_unescape_raw_str() { assert_eq!(unescaped, expected); } + check("\r\n", &[(0..2, Ok('\n'))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); } @@ -262,6 +266,7 @@ fn test_unescape_raw_byte_str() { assert_eq!(unescaped, expected); } + check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); check( diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 66add869359d8..d16889a91e406 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -8,7 +8,9 @@ use syntax_pos::{BytePos, Pos, Span}; use rustc_lexer::Base; use rustc_lexer::unescape; +use std::borrow::Cow; use std::char; +use std::iter; use std::convert::TryInto; use rustc_data_structures::sync::Lrc; use log::debug; @@ -179,7 +181,18 @@ impl<'a> StringReader<'a> { let string = self.str_from(start); // comments with only more "/"s are not doc comments let tok = if is_doc_comment(string) { - self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment"); + let mut idx = 0; + loop { + idx = match string[idx..].find('\r') { + None => break, + Some(it) => idx + it + 1 + }; + if string[idx..].chars().next() != Some('\n') { + self.err_span_(start + BytePos(idx as u32 - 1), + start + BytePos(idx as u32), + "bare CR not allowed in doc-comment"); + } + } token::DocComment(Symbol::intern(string)) } else { token::Comment @@ -204,10 +217,15 @@ impl<'a> StringReader<'a> { } let tok = if is_doc_comment { - self.forbid_bare_cr(start, - string, - "bare CR not allowed in block doc-comment"); - token::DocComment(Symbol::intern(string)) + let has_cr = string.contains('\r'); + let string = if has_cr { + self.translate_crlf(start, + string, + "bare CR not allowed in block doc-comment") + } else { + string.into() + }; + token::DocComment(Symbol::intern(&string[..])) } else { token::Comment }; @@ -473,16 +491,49 @@ impl<'a> StringReader<'a> { &self.src[self.src_index(start)..self.src_index(end)] } - fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) { - let mut idx = 0; - loop { - idx = match s[idx..].find('\r') { - None => break, - Some(it) => idx + it + 1 - }; - self.err_span_(start + BytePos(idx as u32 - 1), - start + BytePos(idx as u32), - errmsg); + /// Converts CRLF to LF in the given string, raising an error on bare CR. + fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> { + let mut chars = s.char_indices().peekable(); + while let Some((i, ch)) = chars.next() { + if ch == '\r' { + if let Some((lf_idx, '\n')) = chars.peek() { + return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into(); + } + let pos = start + BytePos(i as u32); + let end_pos = start + BytePos((i + ch.len_utf8()) as u32); + self.err_span_(pos, end_pos, errmsg); + } + } + return s.into(); + + fn translate_crlf_(rdr: &StringReader<'_>, + start: BytePos, + s: &str, + mut j: usize, + mut chars: iter::Peekable>, + errmsg: &str) + -> String { + let mut buf = String::with_capacity(s.len()); + // Skip first CR + buf.push_str(&s[.. j - 1]); + while let Some((i, ch)) = chars.next() { + if ch == '\r' { + if j < i { + buf.push_str(&s[j..i]); + } + let next = i + ch.len_utf8(); + j = next; + if chars.peek().map(|(_, ch)| *ch) != Some('\n') { + let pos = start + BytePos(i as u32); + let end_pos = start + BytePos(next as u32); + rdr.err_span_(pos, end_pos, errmsg); + } + } + } + if j < s.len() { + buf.push_str(&s[j..]); + } + buf } } diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index ca177eb4a3616..e48637498f0e7 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -1064,7 +1064,6 @@ impl SourceFile { mut src: String, start_pos: BytePos) -> Result { remove_bom(&mut src); - normalize_newlines(&mut src); let src_hash = { let mut hasher: StableHasher = StableHasher::new(); @@ -1232,61 +1231,6 @@ fn remove_bom(src: &mut String) { } } - -/// Replaces `\r\n` with `\n` in-place in `src`. -/// -/// Returns error if there's a lone `\r` in the string -fn normalize_newlines(src: &mut String) { - if !src.as_bytes().contains(&b'\r') { - return; - } - - // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding. - // While we *can* call `as_mut_vec` and do surgery on the live string - // directly, let's rather steal the contents of `src`. This makes the code - // safe even if a panic occurs. - - let mut buf = std::mem::replace(src, String::new()).into_bytes(); - let mut gap_len = 0; - let mut tail = buf.as_mut_slice(); - loop { - let idx = match find_crlf(&tail[gap_len..]) { - None => tail.len(), - Some(idx) => idx + gap_len, - }; - tail.copy_within(gap_len..idx, 0); - tail = &mut tail[idx - gap_len..]; - if tail.len() == gap_len { - break; - } - gap_len += 1; - } - - // Account for removed `\r`. - // After `set_len`, `buf` is guaranteed to contain utf-8 again. - let new_len = buf.len() - gap_len; - unsafe { - buf.set_len(new_len); - *src = String::from_utf8_unchecked(buf); - } - - fn find_crlf(src: &[u8]) -> Option { - let mut search_idx = 0; - while let Some(idx) = find_cr(&src[search_idx..]) { - if src[search_idx..].get(idx + 1) != Some(&b'\n') { - search_idx += idx + 1; - continue; - } - return Some(search_idx + idx); - } - None - } - - fn find_cr(src: &[u8]) -> Option { - src.iter().position(|&b| b == b'\r') - } -} - // _____________________________________________________________________________ // Pos, BytePos, CharPos // diff --git a/src/libsyntax_pos/tests.rs b/src/libsyntax_pos/tests.rs index 6bd6016020a27..78c4e18e6aee0 100644 --- a/src/libsyntax_pos/tests.rs +++ b/src/libsyntax_pos/tests.rs @@ -16,23 +16,3 @@ fn test_lookup_line() { assert_eq!(lookup_line(lines, BytePos(28)), 2); assert_eq!(lookup_line(lines, BytePos(29)), 2); } - -#[test] -fn test_normalize_newlines() { - fn check(before: &str, after: &str) { - let mut actual = before.to_string(); - normalize_newlines(&mut actual); - assert_eq!(actual.as_str(), after); - } - check("", ""); - check("\n", "\n"); - check("\r", "\r"); - check("\r\r", "\r\r"); - check("\r\n", "\n"); - check("hello world", "hello world"); - check("hello\nworld", "hello\nworld"); - check("hello\r\nworld", "hello\nworld"); - check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n"); - check("\r\r\n", "\r\n"); - check("hello\rworld", "hello\rworld"); -}