Skip to content

Commit

Permalink
Revert "Auto merge of rust-lang#62948 - matklad:failable-file-loading…
Browse files Browse the repository at this point in the history
…, r=petrochenkov"

This reverts commit ef1ecbe, reversing
changes made to fc8765d.

That changed unfortunately broke rustfix on windows:

rust-lang/rustfix#176

Specifically, what ef1ecbe did was to
enforce normalization of \r\n to \n at file loading time, similarly to
how we deal with Byte Order Mark. Normalization changes raw offsets in
files, which are exposed via `--error-format=json`, and used by rusfix.

The proper solution here (which also handles the latent case with BOM) is

rust-lang#65074

However, since it's somewhat involved, and we are time sensitive, we
prefer to revert the original change on beta.
  • Loading branch information
matklad committed Oct 10, 2019
1 parent 9689670 commit 5a21f76
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 102 deletions.
2 changes: 2 additions & 0 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ impl Cursor<'_> {
loop {
match self.nth_char(0) {
'\n' => break,
'\r' if self.nth_char(1) == '\n' => break,
EOF_CHAR if self.is_eof() => break,
_ => {
self.bump();
Expand Down Expand Up @@ -440,6 +441,7 @@ impl Cursor<'_> {
match self.nth_char(0) {
'/' if !first => break,
'\n' if self.nth_char(1) != '\'' => break,
'\r' if self.nth_char(1) == '\n' => break,
EOF_CHAR if self.is_eof() => break,
'\'' => {
self.bump();
Expand Down
36 changes: 28 additions & 8 deletions src/librustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,11 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
if first_char != '\\' {
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
'\r' => Err(if chars.clone().next() == Some('\n') {
EscapeError::EscapeOnlyChar
} else {
EscapeError::BareCarriageReturn
}),
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
_ => {
Expand Down Expand Up @@ -240,15 +244,27 @@ where

let unescaped_char = match first_char {
'\\' => {
let second_char = chars.clone().next();
match second_char {
Some('\n') => {
let (second_char, third_char) = {
let mut chars = chars.clone();
(chars.next(), chars.next())
};
match (second_char, third_char) {
(Some('\n'), _) | (Some('\r'), Some('\n')) => {
skip_ascii_whitespace(&mut chars);
continue;
}
_ => scan_escape(first_char, &mut chars, mode),
}
}
'\r' => {
let second_char = chars.clone().next();
if second_char == Some('\n') {
chars.next();
Ok('\n')
} else {
scan_escape(first_char, &mut chars, mode)
}
}
'\n' => Ok('\n'),
'\t' => Ok('\t'),
_ => scan_escape(first_char, &mut chars, mode),
Expand Down Expand Up @@ -282,11 +298,15 @@ where
while let Some(curr) = chars.next() {
let start = initial_len - chars.as_str().len() - curr.len_utf8();

let result = match curr {
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
c if mode.is_bytes() && !c.is_ascii() =>
let result = match (curr, chars.clone().next()) {
('\r', Some('\n')) => {
chars.next();
Ok('\n')
},
('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
(c, _) if mode.is_bytes() && !c.is_ascii() =>
Err(EscapeError::NonAsciiCharInByteString),
c => Ok(c),
(c, _) => Ok(c),
};
let end = initial_len - chars.as_str().len();

Expand Down
11 changes: 8 additions & 3 deletions src/librustc_lexer/src/unescape/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ fn test_unescape_char_bad() {
check(r"\", EscapeError::LoneSlash);

check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
Expand All @@ -30,7 +31,6 @@ fn test_unescape_char_bad() {
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check("\\\r", EscapeError::InvalidEscape);

check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
Expand Down Expand Up @@ -116,9 +116,10 @@ fn test_unescape_str_good() {

check("foo", "foo");
check("", "");
check(" \t\n", " \t\n");
check(" \t\n\r\n", " \t\n\n");

check("hello \\\n world", "hello world");
check("hello \\\r\n world", "hello world");
check("thread's", "thread's")
}

Expand All @@ -133,6 +134,7 @@ fn test_unescape_byte_bad() {
check(r"\", EscapeError::LoneSlash);

check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
Expand Down Expand Up @@ -236,9 +238,10 @@ fn test_unescape_byte_str_good() {

check("foo", b"foo");
check("", b"");
check(" \t\n", b" \t\n");
check(" \t\n\r\n", b" \t\n\n");

check("hello \\\n world", b"hello world");
check("hello \\\r\n world", b"hello world");
check("thread's", b"thread's")
}

Expand All @@ -250,6 +253,7 @@ fn test_unescape_raw_str() {
assert_eq!(unescaped, expected);
}

check("\r\n", &[(0..2, Ok('\n'))]);
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]);
}
Expand All @@ -262,6 +266,7 @@ fn test_unescape_raw_byte_str() {
assert_eq!(unescaped, expected);
}

check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]);
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
check(
Expand Down
81 changes: 66 additions & 15 deletions src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ use syntax_pos::{BytePos, Pos, Span};
use rustc_lexer::Base;
use rustc_lexer::unescape;

use std::borrow::Cow;
use std::char;
use std::iter;
use std::convert::TryInto;
use rustc_data_structures::sync::Lrc;
use log::debug;
Expand Down Expand Up @@ -179,7 +181,18 @@ impl<'a> StringReader<'a> {
let string = self.str_from(start);
// comments with only more "/"s are not doc comments
let tok = if is_doc_comment(string) {
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
let mut idx = 0;
loop {
idx = match string[idx..].find('\r') {
None => break,
Some(it) => idx + it + 1
};
if string[idx..].chars().next() != Some('\n') {
self.err_span_(start + BytePos(idx as u32 - 1),
start + BytePos(idx as u32),
"bare CR not allowed in doc-comment");
}
}
token::DocComment(Symbol::intern(string))
} else {
token::Comment
Expand All @@ -204,10 +217,15 @@ impl<'a> StringReader<'a> {
}

let tok = if is_doc_comment {
self.forbid_bare_cr(start,
string,
"bare CR not allowed in block doc-comment");
token::DocComment(Symbol::intern(string))
let has_cr = string.contains('\r');
let string = if has_cr {
self.translate_crlf(start,
string,
"bare CR not allowed in block doc-comment")
} else {
string.into()
};
token::DocComment(Symbol::intern(&string[..]))
} else {
token::Comment
};
Expand Down Expand Up @@ -473,16 +491,49 @@ impl<'a> StringReader<'a> {
&self.src[self.src_index(start)..self.src_index(end)]
}

fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
let mut idx = 0;
loop {
idx = match s[idx..].find('\r') {
None => break,
Some(it) => idx + it + 1
};
self.err_span_(start + BytePos(idx as u32 - 1),
start + BytePos(idx as u32),
errmsg);
/// Converts CRLF to LF in the given string, raising an error on bare CR.
fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
let mut chars = s.char_indices().peekable();
while let Some((i, ch)) = chars.next() {
if ch == '\r' {
if let Some((lf_idx, '\n')) = chars.peek() {
return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
}
let pos = start + BytePos(i as u32);
let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
self.err_span_(pos, end_pos, errmsg);
}
}
return s.into();

fn translate_crlf_(rdr: &StringReader<'_>,
start: BytePos,
s: &str,
mut j: usize,
mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
errmsg: &str)
-> String {
let mut buf = String::with_capacity(s.len());
// Skip first CR
buf.push_str(&s[.. j - 1]);
while let Some((i, ch)) = chars.next() {
if ch == '\r' {
if j < i {
buf.push_str(&s[j..i]);
}
let next = i + ch.len_utf8();
j = next;
if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
let pos = start + BytePos(i as u32);
let end_pos = start + BytePos(next as u32);
rdr.err_span_(pos, end_pos, errmsg);
}
}
}
if j < s.len() {
buf.push_str(&s[j..]);
}
buf
}
}

Expand Down
56 changes: 0 additions & 56 deletions src/libsyntax_pos/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1064,7 +1064,6 @@ impl SourceFile {
mut src: String,
start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
remove_bom(&mut src);
normalize_newlines(&mut src);

let src_hash = {
let mut hasher: StableHasher<u128> = StableHasher::new();
Expand Down Expand Up @@ -1232,61 +1231,6 @@ fn remove_bom(src: &mut String) {
}
}


/// Replaces `\r\n` with `\n` in-place in `src`.
///
/// Returns error if there's a lone `\r` in the string
fn normalize_newlines(src: &mut String) {
if !src.as_bytes().contains(&b'\r') {
return;
}

// We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
// While we *can* call `as_mut_vec` and do surgery on the live string
// directly, let's rather steal the contents of `src`. This makes the code
// safe even if a panic occurs.

let mut buf = std::mem::replace(src, String::new()).into_bytes();
let mut gap_len = 0;
let mut tail = buf.as_mut_slice();
loop {
let idx = match find_crlf(&tail[gap_len..]) {
None => tail.len(),
Some(idx) => idx + gap_len,
};
tail.copy_within(gap_len..idx, 0);
tail = &mut tail[idx - gap_len..];
if tail.len() == gap_len {
break;
}
gap_len += 1;
}

// Account for removed `\r`.
// After `set_len`, `buf` is guaranteed to contain utf-8 again.
let new_len = buf.len() - gap_len;
unsafe {
buf.set_len(new_len);
*src = String::from_utf8_unchecked(buf);
}

fn find_crlf(src: &[u8]) -> Option<usize> {
let mut search_idx = 0;
while let Some(idx) = find_cr(&src[search_idx..]) {
if src[search_idx..].get(idx + 1) != Some(&b'\n') {
search_idx += idx + 1;
continue;
}
return Some(search_idx + idx);
}
None
}

fn find_cr(src: &[u8]) -> Option<usize> {
src.iter().position(|&b| b == b'\r')
}
}

// _____________________________________________________________________________
// Pos, BytePos, CharPos
//
Expand Down
20 changes: 0 additions & 20 deletions src/libsyntax_pos/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,3 @@ fn test_lookup_line() {
assert_eq!(lookup_line(lines, BytePos(28)), 2);
assert_eq!(lookup_line(lines, BytePos(29)), 2);
}

#[test]
fn test_normalize_newlines() {
fn check(before: &str, after: &str) {
let mut actual = before.to_string();
normalize_newlines(&mut actual);
assert_eq!(actual.as_str(), after);
}
check("", "");
check("\n", "\n");
check("\r", "\r");
check("\r\r", "\r\r");
check("\r\n", "\n");
check("hello world", "hello world");
check("hello\nworld", "hello\nworld");
check("hello\r\nworld", "hello\nworld");
check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
check("\r\r\n", "\r\n");
check("hello\rworld", "hello\rworld");
}

0 comments on commit 5a21f76

Please sign in to comment.