Skip to content

Commit

Permalink
Fix Unicode character escape sequence parsing (#959)
Browse files Browse the repository at this point in the history
Co-authored-by: tofpie <tofpie@users.noreply.github.com>
  • Loading branch information
tofpie and tofpie authored Dec 11, 2020
1 parent 98e11d7 commit 6d866f8
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 12 deletions.
4 changes: 0 additions & 4 deletions boa/src/syntax/lexer/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,10 @@ impl<R> Tokenizer<R> for StringLiteral {
b'u' => {
// Support \u{X..X} (Unicode Codepoint)
if cursor.next_is(b'{')? {
cursor.next_byte()?.expect("{ character vanished"); // Consume the '{'.

// TODO: use bytes for a bit better performance (using stack)
let mut code_point_buf = Vec::with_capacity(6);
cursor.take_until(b'}', &mut code_point_buf)?;

cursor.next_byte()?.expect("} character vanished"); // Consume the '}'.

let code_point_str = unsafe {
str::from_utf8_unchecked(code_point_buf.as_slice())
};
Expand Down
47 changes: 39 additions & 8 deletions boa/src/syntax/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ fn check_positions() {

#[test]
fn check_positions_codepoint() {
let s = r#"console.log("hello world\u{{2764}}"); // Test"#;
let s = r#"console.log("hello world\u{2764}"); // Test"#;
// --------123456789
let mut lexer = Lexer::new(s.as_bytes());

Expand All @@ -281,19 +281,19 @@ fn check_positions_codepoint() {
// String token starts on column 13
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 13), (1, 36))
span((1, 13), (1, 34))
);

// Close parenthesis token starts on column 36
// Close parenthesis token starts on column 34
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 36), (1, 37))
span((1, 34), (1, 35))
);

// Semi Colon token starts on column 37
// Semi Colon token starts on column 35
assert_eq!(
lexer.next().unwrap().unwrap().span(),
span((1, 37), (1, 38))
span((1, 35), (1, 36))
);
}

Expand Down Expand Up @@ -702,10 +702,10 @@ fn codepoint_with_no_braces() {
fn illegal_code_point_following_numeric_literal() {
// Checks as per https://tc39.es/ecma262/#sec-literals-numeric-literals that a NumericLiteral cannot
// be immediately followed by an IdentifierStart, where the IdentifierStart is written as a
// Unicode code point escape sequence (`\u{...}`).
let mut lexer = Lexer::new(&br#"17.4\u{{2764}}"#[..]);
let mut lexer = Lexer::new(&br#"17.4\u{2764}"#[..]);
assert!(
lexer.next().is_err(),
"IdentifierStart \\u{{2764}} following NumericLiteral not rejected as expected"
"IdentifierStart \\u{2764} following NumericLiteral not rejected as expected"
);
}

Expand All @@ -723,6 +723,37 @@ fn non_english_str() {
expect_tokens(&mut lexer, &expected);
}

/// Exercises `\u{...}` code point escapes when literal braces are also present.
///
/// First checks that a string literal containing plain `{` / `}` characters
/// around several code point escapes lexes into the expected single token.
/// Then checks that escapes opened with a doubled brace (`\u{{`) are reported
/// as a syntax error positioned at line 1, column 1.
#[test]
fn unicode_escape_with_braces() {
    // Well-formed case: literal braces surrounding three code point escapes.
    let mut well_formed = Lexer::new(&br#"'{\u{20ac}\u{a0}\u{a0}}'"#[..]);
    let expected_tokens = [TokenKind::StringLiteral("{\u{20ac}\u{a0}\u{a0}}".into())];
    expect_tokens(&mut well_formed, &expected_tokens);

    // Malformed cases: the escape starts with `{{`, with and without a
    // matching doubled closing brace.
    let malformed_inputs: [&[u8]; 2] = [&br#"\u{{a0}"#[..], &br#"\u{{a0}}"#[..]];

    for &input in &malformed_inputs {
        let failure = Lexer::new(input)
            .next()
            .expect_err("Malformed Unicode character sequence expected");

        match failure {
            Error::Syntax(_, pos) => assert_eq!(pos, Position::new(1, 1)),
            _ => panic!("invalid error type"),
        }
    }
}

mod carriage_return {
use super::*;

Expand Down

0 comments on commit 6d866f8

Please sign in to comment.