Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce unescape module #60261

Merged
merged 2 commits into from
May 6, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/librustc_errors/diagnostic_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ impl<'a> DiagnosticBuilder<'a> {
) -> &mut Self);
forward!(pub fn warn(&mut self, msg: &str) -> &mut Self);
forward!(pub fn span_warn<S: Into<MultiSpan>>(&mut self, sp: S, msg: &str) -> &mut Self);
forward!(pub fn help(&mut self , msg: &str) -> &mut Self);
forward!(pub fn help(&mut self, msg: &str) -> &mut Self);
forward!(pub fn span_help<S: Into<MultiSpan>>(&mut self,
sp: S,
msg: &str,
Expand Down
585 changes: 142 additions & 443 deletions src/libsyntax/parse/lexer/mod.rs

Large diffs are not rendered by default.

262 changes: 40 additions & 222 deletions src/libsyntax/parse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ use log::debug;

use rustc_data_structures::fx::FxHashSet;
use std::borrow::Cow;
use std::iter;
use std::path::{Path, PathBuf};
use std::str;

Expand All @@ -33,6 +32,11 @@ pub mod attr;

pub mod classify;

pub(crate) mod unescape;
use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError};

pub(crate) mod unescape_error_reporting;

/// Info about a parsing session.
pub struct ParseSess {
pub span_diagnostic: Handler,
Expand Down Expand Up @@ -306,133 +310,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
Parser::new(sess, stream, None, true, false)
}

/// Parses a string representing a character literal into its final form.
/// Rather than just accepting/rejecting a given literal, unescapes it as
/// well. Can take any slice prefixed by a character escape. Returns the
/// character and the number of characters consumed.
fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
use std::char;

// Handle non-escaped chars first.
if lit.as_bytes()[0] != b'\\' {
// If the first byte isn't '\\' it might part of a multi-byte char, so
// get the char with chars().
let c = lit.chars().next().unwrap();
return (c, 1);
}

// Handle escaped chars.
petrochenkov marked this conversation as resolved.
Show resolved Hide resolved
match lit.as_bytes()[1] as char {
'"' => ('"', 2),
'n' => ('\n', 2),
'r' => ('\r', 2),
't' => ('\t', 2),
'\\' => ('\\', 2),
'\'' => ('\'', 2),
'0' => ('\0', 2),
'x' => {
let v = u32::from_str_radix(&lit[2..4], 16).unwrap();
let c = char::from_u32(v).unwrap();
(c, 4)
}
'u' => {
assert_eq!(lit.as_bytes()[2], b'{');
let idx = lit.find('}').unwrap();

// All digits and '_' are ascii, so treat each byte as a char.
let mut v: u32 = 0;
for c in lit[3..idx].bytes() {
let c = char::from(c);
if c != '_' {
let x = c.to_digit(16).unwrap();
v = v.checked_mul(16).unwrap().checked_add(x).unwrap();
}
}
let c = char::from_u32(v).unwrap_or_else(|| {
if let Some((span, diag)) = diag {
let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
if v > 0x10FFFF {
diag.help("unicode escape must be at most 10FFFF").emit();
} else {
diag.help("unicode escape must not be a surrogate").emit();
}
}
'\u{FFFD}'
});
(c, (idx + 1) as isize)
}
_ => panic!("lexer should have rejected a bad character escape {}", lit)
}
}

/// Parses a string representing a string literal into its final form. Does unescaping.
fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
debug!("str_lit: given {}", lit.escape_default());
let mut res = String::with_capacity(lit.len());

let error = |i| format!("lexer should have rejected {} at {}", lit, i);

/// Eat everything up to a non-whitespace.
fn eat<'a>(it: &mut iter::Peekable<str::CharIndices<'a>>) {
loop {
match it.peek().map(|x| x.1) {
Some(' ') | Some('\n') | Some('\r') | Some('\t') => {
it.next();
},
_ => { break; }
}
}
}

let mut chars = lit.char_indices().peekable();
while let Some((i, c)) = chars.next() {
match c {
'\\' => {
let ch = chars.peek().unwrap_or_else(|| {
panic!("{}", error(i))
}).1;

if ch == '\n' {
eat(&mut chars);
} else if ch == '\r' {
chars.next();
let ch = chars.peek().unwrap_or_else(|| {
panic!("{}", error(i))
}).1;

if ch != '\n' {
panic!("lexer accepted bare CR");
}
eat(&mut chars);
} else {
// otherwise, a normal escape
let (c, n) = char_lit(&lit[i..], diag);
for _ in 0..n - 1 { // we don't need to move past the first \
chars.next();
}
res.push(c);
}
},
'\r' => {
let ch = chars.peek().unwrap_or_else(|| {
panic!("{}", error(i))
}).1;

if ch != '\n' {
panic!("lexer accepted bare CR");
}
chars.next();
res.push('\n');
}
c => res.push(c),
}
}

res.shrink_to_fit(); // probably not going to do anything, unless there was an escape.
debug!("parse_str_lit: returning {}", res);
res
}

/// Parses a string representing a raw string literal into its final form. The
/// only operation this does is convert embedded CRLF into a single LF.
fn raw_str_lit(lit: &str) -> String {
Expand Down Expand Up @@ -475,9 +352,23 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
use ast::LitKind;

match lit {
token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),
token::Err(i) => (true, Some(LitKind::Err(i))),
token::Byte(i) => {
let lit_kind = match unescape_byte(&i.as_str()) {
Ok(c) => LitKind::Byte(c),
Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i),
Err(_) => LitKind::Byte(0),
matklad marked this conversation as resolved.
Show resolved Hide resolved
};
(true, Some(lit_kind))
},
token::Char(i) => {
let lit_kind = match unescape_char(&i.as_str()) {
Ok(c) => LitKind::Char(c),
Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i),
Err(_) => LitKind::Char('\u{FFFD}'),
};
(true, Some(lit_kind))
},
token::Err(i) => (true, Some(LitKind::Err(i))),

// There are some valid suffixes for integer and float literals,
// so all the handling is done internally.
Expand All @@ -491,7 +382,14 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
// string in the Token.
let s = &sym.as_str();
if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') {
sym = Symbol::intern(&str_lit(s, diag));
let mut buf = String::with_capacity(s.len());
unescape_str(s, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(_) => buf.push('\u{FFFD}'),
}
});
sym = Symbol::intern(&buf)
}
(true, Some(LitKind::Str(sym, ast::StrStyle::Cooked)))
}
Expand All @@ -504,7 +402,16 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
(true, Some(LitKind::Str(sym, ast::StrStyle::Raw(n))))
}
token::ByteStr(i) => {
(true, Some(LitKind::ByteStr(byte_str_lit(&i.as_str()))))
let s = &i.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_byte_str(s, &mut |_, unescaped_byte| {
match unescaped_byte {
Ok(c) => buf.push(c),
Err(_) => buf.push(0),
}
});
buf.shrink_to_fit();
(true, Some(LitKind::ByteStr(Lrc::new(buf))))
}
token::ByteStrRaw(i, _) => {
(true, Some(LitKind::ByteStr(Lrc::new(i.to_string().into_bytes()))))
Expand Down Expand Up @@ -559,95 +466,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
filtered_float_lit(Symbol::intern(s), suffix, diag)
}

/// Parses a string representing a byte literal into its final form. Similar to `char_lit`.
fn byte_lit(lit: &str) -> (u8, usize) {
let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i);

if lit.len() == 1 {
(lit.as_bytes()[0], 1)
} else {
assert_eq!(lit.as_bytes()[0], b'\\', "{}", err(0));
let b = match lit.as_bytes()[1] {
b'"' => b'"',
b'n' => b'\n',
b'r' => b'\r',
b't' => b'\t',
b'\\' => b'\\',
b'\'' => b'\'',
b'0' => b'\0',
_ => {
match u64::from_str_radix(&lit[2..4], 16).ok() {
Some(c) =>
if c > 0xFF {
panic!(err(2))
} else {
return (c as u8, 4)
},
None => panic!(err(3))
}
}
};
(b, 2)
}
}

fn byte_str_lit(lit: &str) -> Lrc<Vec<u8>> {
let mut res = Vec::with_capacity(lit.len());

let error = |i| panic!("lexer should have rejected {} at {}", lit, i);

/// Eat everything up to a non-whitespace.
fn eat<I: Iterator<Item=(usize, u8)>>(it: &mut iter::Peekable<I>) {
loop {
match it.peek().map(|x| x.1) {
Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => {
it.next();
},
_ => { break; }
}
}
}

// byte string literals *must* be ASCII, but the escapes don't have to be
let mut chars = lit.bytes().enumerate().peekable();
loop {
match chars.next() {
Some((i, b'\\')) => {
match chars.peek().unwrap_or_else(|| error(i)).1 {
b'\n' => eat(&mut chars),
b'\r' => {
chars.next();
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
panic!("lexer accepted bare CR");
}
eat(&mut chars);
}
_ => {
// otherwise, a normal escape
let (c, n) = byte_lit(&lit[i..]);
// we don't need to move past the first \
for _ in 0..n - 1 {
chars.next();
}
res.push(c);
}
}
},
Some((i, b'\r')) => {
if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' {
panic!("lexer accepted bare CR");
}
chars.next();
res.push(b'\n');
}
Some((_, c)) => res.push(c),
None => break,
}
}

Lrc::new(res)
}

fn integer_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
-> Option<ast::LitKind> {
// s can only be ascii, byte indexing is fine
Expand Down
Loading