From 5a21f766f02ec6d3a6f91772f5f0f2dcef902b80 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Thu, 10 Oct 2019 18:13:33 +0300 Subject: [PATCH] Revert "Auto merge of #62948 - matklad:failable-file-loading, r=petrochenkov" This reverts commit ef1ecbefb8719e408150738664d443a73f757ffd, reversing changes made to fc8765d6d8623b2b5b4ca1d526ed1d7beb3fce18. That changed unfortunately broke rustfix on windows: https://github.com/rust-lang-nursery/rustfix/issues/176 Specifically, what ef1ecbefb8719e408150738664d443a73f757ffd did was to enforce normalization of \r\n to \n at file loading time, similarly to how we deal with Byte Order Mark. Normalization changes raw offsets in files, which are exposed via `--error-format=json`, and used by rusfix. The proper solution here (which also handles the latent case with BOM) is https://github.com/rust-lang/rust/pull/65074 However, since it's somewhat involved, and we are time sensitive, we prefer to revert the original change on beta. --- src/librustc_lexer/src/lib.rs | 2 + src/librustc_lexer/src/unescape.rs | 36 ++++++++--- src/librustc_lexer/src/unescape/tests.rs | 11 +++- src/libsyntax/parse/lexer/mod.rs | 81 +++++++++++++++++++----- src/libsyntax_pos/lib.rs | 56 ---------------- src/libsyntax_pos/tests.rs | 20 ------ 6 files changed, 104 insertions(+), 102 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 30a5175d8cdb0..08608cfe98164 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -268,6 +268,7 @@ impl Cursor<'_> { loop { match self.nth_char(0) { '\n' => break, + '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, _ => { self.bump(); @@ -440,6 +441,7 @@ impl Cursor<'_> { match self.nth_char(0) { '/' if !first => break, '\n' if self.nth_char(1) != '\'' => break, + '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, '\'' => { self.bump(); diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs index c709b7526082f..d8e00d4c7c5ea 100644 --- a/src/librustc_lexer/src/unescape.rs +++ b/src/librustc_lexer/src/unescape.rs @@ -128,7 +128,11 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result Err(EscapeError::EscapeOnlyChar), - '\r' => Err(EscapeError::BareCarriageReturn), + '\r' => Err(if chars.clone().next() == Some('\n') { + EscapeError::EscapeOnlyChar + } else { + EscapeError::BareCarriageReturn + }), '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), _ => { @@ -240,15 +244,27 @@ where let unescaped_char = match first_char { '\\' => { - let second_char = chars.clone().next(); - match second_char { - Some('\n') => { + let (second_char, third_char) = { + let mut chars = chars.clone(); + (chars.next(), chars.next()) + }; + match (second_char, third_char) { + (Some('\n'), _) | (Some('\r'), Some('\n')) => { skip_ascii_whitespace(&mut chars); continue; } _ => scan_escape(first_char, &mut chars, mode), } } + '\r' => { + let second_char = chars.clone().next(); + if second_char == Some('\n') { + chars.next(); + Ok('\n') + } else { + scan_escape(first_char, &mut chars, mode) + } + } '\n' => Ok('\n'), '\t' => Ok('\t'), _ => scan_escape(first_char, &mut chars, mode), @@ -282,11 +298,15 @@ where while let Some(curr) = chars.next() { let start = initial_len - chars.as_str().len() - curr.len_utf8(); - let result = match curr { - '\r' => Err(EscapeError::BareCarriageReturnInRawString), - c if mode.is_bytes() && !c.is_ascii() => + let result = match (curr, chars.clone().next()) { + ('\r', Some('\n')) => { + chars.next(); + Ok('\n') + }, + ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString), + (c, _) if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), - c => Ok(c), + (c, _) => Ok(c), }; let end = initial_len - chars.as_str().len(); diff --git a/src/librustc_lexer/src/unescape/tests.rs b/src/librustc_lexer/src/unescape/tests.rs index e7b1ff6479d88..496527eb265b0 100644 --- a/src/librustc_lexer/src/unescape/tests.rs +++ b/src/librustc_lexer/src/unescape/tests.rs @@ -11,6 +11,7 @@ fn test_unescape_char_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -30,7 +31,6 @@ fn test_unescape_char_bad() { check(r"\v", EscapeError::InvalidEscape); check(r"\💩", EscapeError::InvalidEscape); check(r"\●", EscapeError::InvalidEscape); - check("\\\r", EscapeError::InvalidEscape); check(r"\x", EscapeError::TooShortHexEscape); check(r"\x0", EscapeError::TooShortHexEscape); @@ -116,9 +116,10 @@ fn test_unescape_str_good() { check("foo", "foo"); check("", ""); - check(" \t\n", " \t\n"); + check(" \t\n\r\n", " \t\n\n"); check("hello \\\n world", "hello world"); + check("hello \\\r\n world", "hello world"); check("thread's", "thread's") } @@ -133,6 +134,7 @@ fn test_unescape_byte_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -236,9 +238,10 @@ fn test_unescape_byte_str_good() { check("foo", b"foo"); check("", b""); - check(" \t\n", b" \t\n"); + check(" \t\n\r\n", b" \t\n\n"); check("hello \\\n world", b"hello world"); + check("hello \\\r\n world", b"hello world"); check("thread's", b"thread's") } @@ -250,6 +253,7 @@ fn test_unescape_raw_str() { assert_eq!(unescaped, expected); } + check("\r\n", &[(0..2, Ok('\n'))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); } @@ -262,6 +266,7 @@ fn test_unescape_raw_byte_str() { assert_eq!(unescaped, expected); } + check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); check( diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 66add869359d8..d16889a91e406 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -8,7 +8,9 @@ use syntax_pos::{BytePos, Pos, Span}; use rustc_lexer::Base; use rustc_lexer::unescape; +use std::borrow::Cow; use std::char; +use std::iter; use std::convert::TryInto; use rustc_data_structures::sync::Lrc; use log::debug; @@ -179,7 +181,18 @@ impl<'a> StringReader<'a> { let string = self.str_from(start); // comments with only more "/"s are not doc comments let tok = if is_doc_comment(string) { - self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment"); + let mut idx = 0; + loop { + idx = match string[idx..].find('\r') { + None => break, + Some(it) => idx + it + 1 + }; + if string[idx..].chars().next() != Some('\n') { + self.err_span_(start + BytePos(idx as u32 - 1), + start + BytePos(idx as u32), + "bare CR not allowed in doc-comment"); + } + } token::DocComment(Symbol::intern(string)) } else { token::Comment @@ -204,10 +217,15 @@ impl<'a> StringReader<'a> { } let tok = if is_doc_comment { - self.forbid_bare_cr(start, - string, - "bare CR not allowed in block doc-comment"); - token::DocComment(Symbol::intern(string)) + let has_cr = string.contains('\r'); + let string = if has_cr { + self.translate_crlf(start, + string, + "bare CR not allowed in block doc-comment") + } else { + string.into() + }; + token::DocComment(Symbol::intern(&string[..])) } else { token::Comment }; @@ -473,16 +491,49 @@ impl<'a> StringReader<'a> { &self.src[self.src_index(start)..self.src_index(end)] } - fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) { - let mut idx = 0; - loop { - idx = match s[idx..].find('\r') { - None => break, - Some(it) => idx + it + 1 - }; - self.err_span_(start + BytePos(idx as u32 - 1), - start + BytePos(idx as u32), - errmsg); + /// Converts CRLF to LF in the given string, raising an error on bare CR. + fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> { + let mut chars = s.char_indices().peekable(); + while let Some((i, ch)) = chars.next() { + if ch == '\r' { + if let Some((lf_idx, '\n')) = chars.peek() { + return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into(); + } + let pos = start + BytePos(i as u32); + let end_pos = start + BytePos((i + ch.len_utf8()) as u32); + self.err_span_(pos, end_pos, errmsg); + } + } + return s.into(); + + fn translate_crlf_(rdr: &StringReader<'_>, + start: BytePos, + s: &str, + mut j: usize, + mut chars: iter::Peekable>, + errmsg: &str) + -> String { + let mut buf = String::with_capacity(s.len()); + // Skip first CR + buf.push_str(&s[.. j - 1]); + while let Some((i, ch)) = chars.next() { + if ch == '\r' { + if j < i { + buf.push_str(&s[j..i]); + } + let next = i + ch.len_utf8(); + j = next; + if chars.peek().map(|(_, ch)| *ch) != Some('\n') { + let pos = start + BytePos(i as u32); + let end_pos = start + BytePos(next as u32); + rdr.err_span_(pos, end_pos, errmsg); + } + } + } + if j < s.len() { + buf.push_str(&s[j..]); + } + buf } } diff --git a/src/libsyntax_pos/lib.rs b/src/libsyntax_pos/lib.rs index ca177eb4a3616..e48637498f0e7 100644 --- a/src/libsyntax_pos/lib.rs +++ b/src/libsyntax_pos/lib.rs @@ -1064,7 +1064,6 @@ impl SourceFile { mut src: String, start_pos: BytePos) -> Result { remove_bom(&mut src); - normalize_newlines(&mut src); let src_hash = { let mut hasher: StableHasher = StableHasher::new(); @@ -1232,61 +1231,6 @@ fn remove_bom(src: &mut String) { } } - -/// Replaces `\r\n` with `\n` in-place in `src`. -/// -/// Returns error if there's a lone `\r` in the string -fn normalize_newlines(src: &mut String) { - if !src.as_bytes().contains(&b'\r') { - return; - } - - // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding. - // While we *can* call `as_mut_vec` and do surgery on the live string - // directly, let's rather steal the contents of `src`. This makes the code - // safe even if a panic occurs. - - let mut buf = std::mem::replace(src, String::new()).into_bytes(); - let mut gap_len = 0; - let mut tail = buf.as_mut_slice(); - loop { - let idx = match find_crlf(&tail[gap_len..]) { - None => tail.len(), - Some(idx) => idx + gap_len, - }; - tail.copy_within(gap_len..idx, 0); - tail = &mut tail[idx - gap_len..]; - if tail.len() == gap_len { - break; - } - gap_len += 1; - } - - // Account for removed `\r`. - // After `set_len`, `buf` is guaranteed to contain utf-8 again. - let new_len = buf.len() - gap_len; - unsafe { - buf.set_len(new_len); - *src = String::from_utf8_unchecked(buf); - } - - fn find_crlf(src: &[u8]) -> Option { - let mut search_idx = 0; - while let Some(idx) = find_cr(&src[search_idx..]) { - if src[search_idx..].get(idx + 1) != Some(&b'\n') { - search_idx += idx + 1; - continue; - } - return Some(search_idx + idx); - } - None - } - - fn find_cr(src: &[u8]) -> Option { - src.iter().position(|&b| b == b'\r') - } -} - // _____________________________________________________________________________ // Pos, BytePos, CharPos // diff --git a/src/libsyntax_pos/tests.rs b/src/libsyntax_pos/tests.rs index 6bd6016020a27..78c4e18e6aee0 100644 --- a/src/libsyntax_pos/tests.rs +++ b/src/libsyntax_pos/tests.rs @@ -16,23 +16,3 @@ fn test_lookup_line() { assert_eq!(lookup_line(lines, BytePos(28)), 2); assert_eq!(lookup_line(lines, BytePos(29)), 2); } - -#[test] -fn test_normalize_newlines() { - fn check(before: &str, after: &str) { - let mut actual = before.to_string(); - normalize_newlines(&mut actual); - assert_eq!(actual.as_str(), after); - } - check("", ""); - check("\n", "\n"); - check("\r", "\r"); - check("\r\r", "\r\r"); - check("\r\n", "\n"); - check("hello world", "hello world"); - check("hello\nworld", "hello\nworld"); - check("hello\r\nworld", "hello\nworld"); - check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n"); - check("\r\r\n", "\r\n"); - check("hello\rworld", "hello\rworld"); -}