From 911398b96cc4825798c0887ec6ebce775ff5d2d1 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Wed, 14 Aug 2019 16:38:40 +0300 Subject: [PATCH] remove special handling of \r\n from the lexer --- src/librustc_lexer/src/lib.rs | 2 - src/librustc_lexer/src/unescape.rs | 36 +++-------- src/librustc_lexer/src/unescape/tests.rs | 11 +--- src/libsyntax/parse/lexer/mod.rs | 81 +++++------------------- 4 files changed, 26 insertions(+), 104 deletions(-) diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index c02abe6b89f97..afef307a0ed37 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -352,7 +352,6 @@ impl Cursor<'_> { loop { match self.nth_char(0) { '\n' => break, - '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, _ => { self.bump(); @@ -525,7 +524,6 @@ impl Cursor<'_> { match self.nth_char(0) { '/' if !first => break, '\n' if self.nth_char(1) != '\'' => break, - '\r' if self.nth_char(1) == '\n' => break, EOF_CHAR if self.is_eof() => break, '\'' => { self.bump(); diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs index d8e00d4c7c5ea..c709b7526082f 100644 --- a/src/librustc_lexer/src/unescape.rs +++ b/src/librustc_lexer/src/unescape.rs @@ -128,11 +128,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result Err(EscapeError::EscapeOnlyChar), - '\r' => Err(if chars.clone().next() == Some('\n') { - EscapeError::EscapeOnlyChar - } else { - EscapeError::BareCarriageReturn - }), + '\r' => Err(EscapeError::BareCarriageReturn), '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), _ => { @@ -244,27 +240,15 @@ where let unescaped_char = match first_char { '\\' => { - let (second_char, third_char) = { - let mut chars = chars.clone(); - (chars.next(), chars.next()) - }; - match (second_char, third_char) { - (Some('\n'), _) | (Some('\r'), Some('\n')) => { + let second_char = chars.clone().next(); + match second_char { + Some('\n') => { skip_ascii_whitespace(&mut chars); continue; } _ => scan_escape(first_char, &mut chars, mode), } } - '\r' => { - let second_char = chars.clone().next(); - if second_char == Some('\n') { - chars.next(); - Ok('\n') - } else { - scan_escape(first_char, &mut chars, mode) - } - } '\n' => Ok('\n'), '\t' => Ok('\t'), _ => scan_escape(first_char, &mut chars, mode), @@ -298,15 +282,11 @@ where while let Some(curr) = chars.next() { let start = initial_len - chars.as_str().len() - curr.len_utf8(); - let result = match (curr, chars.clone().next()) { - ('\r', Some('\n')) => { - chars.next(); - Ok('\n') - }, - ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString), - (c, _) if mode.is_bytes() && !c.is_ascii() => + let result = match curr { + '\r' => Err(EscapeError::BareCarriageReturnInRawString), + c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), - (c, _) => Ok(c), + c => Ok(c), }; let end = initial_len - chars.as_str().len(); diff --git a/src/librustc_lexer/src/unescape/tests.rs b/src/librustc_lexer/src/unescape/tests.rs index 496527eb265b0..e7b1ff6479d88 100644 --- a/src/librustc_lexer/src/unescape/tests.rs +++ b/src/librustc_lexer/src/unescape/tests.rs @@ -11,7 +11,6 @@ fn test_unescape_char_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); - check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -31,6 +30,7 @@ fn test_unescape_char_bad() { check(r"\v", EscapeError::InvalidEscape); check(r"\💩", EscapeError::InvalidEscape); check(r"\●", EscapeError::InvalidEscape); + check("\\\r", EscapeError::InvalidEscape); check(r"\x", EscapeError::TooShortHexEscape); check(r"\x0", EscapeError::TooShortHexEscape); @@ -116,10 +116,9 @@ fn test_unescape_str_good() { check("foo", "foo"); check("", ""); - check(" \t\n\r\n", " \t\n\n"); + check(" \t\n", " \t\n"); check("hello \\\n world", "hello world"); - check("hello \\\r\n world", "hello world"); check("thread's", "thread's") } @@ -134,7 +133,6 @@ fn test_unescape_byte_bad() { check(r"\", EscapeError::LoneSlash); check("\n", EscapeError::EscapeOnlyChar); - check("\r\n", EscapeError::EscapeOnlyChar); check("\t", EscapeError::EscapeOnlyChar); check("'", EscapeError::EscapeOnlyChar); check("\r", EscapeError::BareCarriageReturn); @@ -238,10 +236,9 @@ fn test_unescape_byte_str_good() { check("foo", b"foo"); check("", b""); - check(" \t\n\r\n", b" \t\n\n"); + check(" \t\n", b" \t\n"); check("hello \\\n world", b"hello world"); - check("hello \\\r\n world", b"hello world"); check("thread's", b"thread's") } @@ -253,7 +250,6 @@ fn test_unescape_raw_str() { assert_eq!(unescaped, expected); } - check("\r\n", &[(0..2, Ok('\n'))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); } @@ -266,7 +262,6 @@ fn test_unescape_raw_byte_str() { assert_eq!(unescaped, expected); } - check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]); check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); check( diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index e86d4c7fde683..e811bf7a5817d 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -8,9 +8,7 @@ use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION}; use rustc_lexer::Base; use rustc_lexer::unescape; -use std::borrow::Cow; use std::char; -use std::iter; use std::convert::TryInto; use rustc_data_structures::sync::Lrc; use log::debug; @@ -181,18 +179,7 @@ impl<'a> StringReader<'a> { let string = self.str_from(start); // comments with only more "/"s are not doc comments let tok = if is_doc_comment(string) { - let mut idx = 0; - loop { - idx = match string[idx..].find('\r') { - None => break, - Some(it) => idx + it + 1 - }; - if string[idx..].chars().next() != Some('\n') { - self.err_span_(start + BytePos(idx as u32 - 1), - start + BytePos(idx as u32), - "bare CR not allowed in doc-comment"); - } - } + self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment"); token::DocComment(Symbol::intern(string)) } else { token::Comment @@ -217,15 +204,10 @@ impl<'a> StringReader<'a> { } let tok = if is_doc_comment { - let has_cr = string.contains('\r'); - let string = if has_cr { - self.translate_crlf(start, - string, - "bare CR not allowed in block doc-comment") - } else { - string.into() - }; - token::DocComment(Symbol::intern(&string[..])) + self.forbid_bare_cr(start, + string, + "bare CR not allowed in block doc-comment"); + token::DocComment(Symbol::intern(string)) } else { token::Comment }; @@ -516,49 +498,16 @@ impl<'a> StringReader<'a> { &self.src[self.src_index(start)..self.src_index(end)] } - /// Converts CRLF to LF in the given string, raising an error on bare CR. - fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> { - let mut chars = s.char_indices().peekable(); - while let Some((i, ch)) = chars.next() { - if ch == '\r' { - if let Some((lf_idx, '\n')) = chars.peek() { - return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into(); - } - let pos = start + BytePos(i as u32); - let end_pos = start + BytePos((i + ch.len_utf8()) as u32); - self.err_span_(pos, end_pos, errmsg); - } - } - return s.into(); - - fn translate_crlf_(rdr: &StringReader<'_>, - start: BytePos, - s: &str, - mut j: usize, - mut chars: iter::Peekable>, - errmsg: &str) - -> String { - let mut buf = String::with_capacity(s.len()); - // Skip first CR - buf.push_str(&s[.. j - 1]); - while let Some((i, ch)) = chars.next() { - if ch == '\r' { - if j < i { - buf.push_str(&s[j..i]); - } - let next = i + ch.len_utf8(); - j = next; - if chars.peek().map(|(_, ch)| *ch) != Some('\n') { - let pos = start + BytePos(i as u32); - let end_pos = start + BytePos(next as u32); - rdr.err_span_(pos, end_pos, errmsg); - } - } - } - if j < s.len() { - buf.push_str(&s[j..]); - } - buf + fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) { + let mut idx = 0; + loop { + idx = match s[idx..].find('\r') { + None => break, + Some(it) => idx + it + 1 + }; + self.err_span_(start + BytePos(idx as u32 - 1), + start + BytePos(idx as u32), + errmsg); } }