From 08f232fe99ceec594c66e822e04d387ccfd3d6c0 Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Sat, 22 May 2021 02:05:44 -0700 Subject: [PATCH] Fix unicode escape in identifiers (#1102) --- boa/src/syntax/ast/keyword.rs | 36 ++++++++ boa/src/syntax/lexer/cursor.rs | 2 + boa/src/syntax/lexer/identifier.rs | 129 +++++++++++++++++++++-------- boa/src/syntax/lexer/mod.rs | 7 +- boa/src/syntax/lexer/tests.rs | 5 +- 5 files changed, 140 insertions(+), 39 deletions(-) diff --git a/boa/src/syntax/ast/keyword.rs b/boa/src/syntax/ast/keyword.rs index 0441b085cf7..597a355ad0c 100644 --- a/boa/src/syntax/ast/keyword.rs +++ b/boa/src/syntax/ast/keyword.rs @@ -199,6 +199,16 @@ pub enum Keyword { /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Classes/extends Extends, + /// The `false` keyword. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean + False, + /// The `finally` keyword. /// /// More information: @@ -301,6 +311,16 @@ pub enum Keyword { /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/new New, + /// The `null` keyword. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#prod-NullLiteral + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/null + Null, + /// The `of` keyword. /// /// More information: @@ -369,6 +389,16 @@ pub enum Keyword { /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Functions/Arrow_functions Throw, + /// The `true` keyword + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#prod-BooleanLiteral + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Boolean + True, + /// The `try` keyword. /// /// More information: @@ -479,6 +509,7 @@ impl Keyword { Self::Enum => "enum", Self::Extends => "extends", Self::Export => "export", + Self::False => "false", Self::Finally => "finally", Self::For => "for", Self::Function => "function", @@ -488,12 +519,14 @@ impl Keyword { Self::Import => "import", Self::Let => "let", Self::New => "new", + Self::Null => "null", Self::Of => "of", Self::Return => "return", Self::Super => "super", Self::Switch => "switch", Self::This => "this", Self::Throw => "throw", + Self::True => "true", Self::Try => "try", Self::TypeOf => "typeof", Self::Var => "var", @@ -552,6 +585,7 @@ impl FromStr for Keyword { "enum" => Ok(Self::Enum), "extends" => Ok(Self::Extends), "export" => Ok(Self::Export), + "false" => Ok(Self::False), "finally" => Ok(Self::Finally), "for" => Ok(Self::For), "function" => Ok(Self::Function), @@ -561,12 +595,14 @@ impl FromStr for Keyword { "import" => Ok(Self::Import), "let" => Ok(Self::Let), "new" => Ok(Self::New), + "null" => Ok(Self::Null), "of" => Ok(Self::Of), "return" => Ok(Self::Return), "super" => Ok(Self::Super), "switch" => Ok(Self::Switch), "this" => Ok(Self::This), "throw" => Ok(Self::Throw), + "true" => Ok(Self::True), "try" => Ok(Self::Try), "typeof" => Ok(Self::TypeOf), "var" => Ok(Self::Var), diff --git a/boa/src/syntax/lexer/cursor.rs b/boa/src/syntax/lexer/cursor.rs index d512bbd4abf..b23ae791315 100644 --- a/boa/src/syntax/lexer/cursor.rs +++ b/boa/src/syntax/lexer/cursor.rs @@ -130,6 +130,7 @@ where /// predicate on the ascii char /// /// The buffer is not incremented. + #[allow(dead_code)] #[inline] pub(super) fn next_is_char_pred(&mut self, pred: &F) -> io::Result where @@ -191,6 +192,7 @@ where /// It also stops when there is no next character. /// /// Note that all characters up until the stop character are added to the buffer, including the character right before. + #[allow(dead_code)] pub(super) fn take_while_char_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> where F: Fn(u32) -> bool, diff --git a/boa/src/syntax/lexer/identifier.rs b/boa/src/syntax/lexer/identifier.rs index 937bafa5a67..5fef7bf7d13 100644 --- a/boa/src/syntax/lexer/identifier.rs +++ b/boa/src/syntax/lexer/identifier.rs @@ -5,7 +5,7 @@ use crate::{ profiler::BoaProfiler, syntax::{ ast::{Keyword, Position, Span}, - lexer::{Token, TokenKind}, + lexer::{StringLiteral, Token, TokenKind}, }, }; use boa_unicode::UnicodeProperties; @@ -86,43 +86,100 @@ impl Tokenizer for Identifier { { let _timer = BoaProfiler::global().start_event("Identifier", "Lexing"); - let mut init_buf = [0u8; 4]; - let mut buf = Vec::new(); - self.init.encode_utf8(&mut init_buf); - buf.extend(init_buf.iter().take(self.init.len_utf8())); - - cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?; - - let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) }; - let tk = match token_str { - "true" => TokenKind::BooleanLiteral(true), - "false" => TokenKind::BooleanLiteral(false), - "null" => TokenKind::NullLiteral, - slice => { - if let Ok(keyword) = slice.parse() { - if cursor.strict_mode() && keyword == Keyword::With { - return Err(Error::Syntax( - "using 'with' statement not allowed in strict mode".into(), - start_pos, - )); - } - TokenKind::Keyword(keyword) - } else { - if cursor.strict_mode() && STRICT_FORBIDDEN_IDENTIFIERS.contains(&slice) { - return Err(Error::Syntax( - format!( - "using future reserved keyword '{}' not allowed in strict mode", - slice - ) - .into(), - start_pos, - )); - } - TokenKind::identifier(slice) - } + let (identifier_name, contains_escaped_chars) = + Self::take_identifier_name(cursor, start_pos, self.init)?; + + let token_kind = if let Ok(keyword) = identifier_name.parse() { + if contains_escaped_chars { + return Err(Error::Syntax( + "unicode escaped characters are not allowed in keyword".into(), + start_pos, + )); + } + + if cursor.strict_mode() && keyword == Keyword::With { + return Err(Error::Syntax( + "using 'with' statement not allowed in strict mode".into(), + start_pos, + )); + } + + match keyword { + Keyword::True => TokenKind::BooleanLiteral(true), + Keyword::False => TokenKind::BooleanLiteral(false), + Keyword::Null => TokenKind::NullLiteral, + _ => TokenKind::Keyword(keyword), } + } else { + if cursor.strict_mode() + && STRICT_FORBIDDEN_IDENTIFIERS.contains(&identifier_name.as_str()) + { + return Err(Error::Syntax( + format!( + "using future reserved keyword '{}' not allowed in strict mode", + identifier_name + ) + .into(), + start_pos, + )); + } + TokenKind::identifier(identifier_name.into_boxed_str()) }; - Ok(Token::new(tk, Span::new(start_pos, cursor.pos()))) + Ok(Token::new(token_kind, Span::new(start_pos, cursor.pos()))) + } +} + +impl Identifier { + #[inline] + fn take_identifier_name( + cursor: &mut Cursor, + start_pos: Position, + init: char, + ) -> Result<(String, bool), Error> + where + R: Read, + { + let mut contains_escaped_chars = false; + let mut identifier_name = if init == '\\' && cursor.next_is(b'u')? { + let ch = StringLiteral::take_unicode_escape_sequence(cursor, start_pos)?; + + if Self::is_identifier_start(ch) { + contains_escaped_chars = true; + String::from(char::try_from(ch).unwrap()) + } else { + return Err(Error::Syntax("invalid identifier start".into(), start_pos)); + } + } else { + // The caller guarantees that `init` is a valid identifier start + String::from(init) + }; + + loop { + let ch = match cursor.peek_char()? { + Some(0x005C /* \ */) if cursor.peek_n(2)? >> 8 == 0x0075 /* u */ => { + let pos = cursor.pos(); + let _ = cursor.next_byte(); + let _ = cursor.next_byte(); + let ch = StringLiteral::take_unicode_escape_sequence(cursor, pos)?; + + if Self::is_identifier_part(ch) { + contains_escaped_chars = true; + ch + } else { + return Err(Error::Syntax("invalid identifier part".into(), pos)); + } + } + Some(ch) if Self::is_identifier_part(ch) => { + let _ = cursor.next_char()?; + ch + }, + _ => break, + }; + + identifier_name.push(char::try_from(ch).unwrap()); + } + + Ok((identifier_name, contains_escaped_chars)) } } diff --git a/boa/src/syntax/lexer/mod.rs b/boa/src/syntax/lexer/mod.rs index 504da9dca68..8c13d209508 100644 --- a/boa/src/syntax/lexer/mod.rs +++ b/boa/src/syntax/lexer/mod.rs @@ -246,12 +246,15 @@ impl Lexer { '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => { Operator::new(next_ch as u8).lex(&mut self.cursor, start) } - _ if c.is_digit(10) => { - NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start) + '\\' if self.cursor.peek()? == Some(b'u') => { + Identifier::new(c).lex(&mut self.cursor, start) } _ if Identifier::is_identifier_start(c as u32) => { Identifier::new(c).lex(&mut self.cursor, start) } + _ if c.is_digit(10) => { + NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start) + } _ => { let details = format!( "unexpected '{}' at line {}, column {}", diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index eb6c4f71a2a..6ae0cdffb23 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -73,7 +73,7 @@ fn check_multi_line_comment() { #[test] fn check_identifier() { - let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}"; + let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D} \\u0078 \\u0078\\u0078 \\u{0078}x\\u{0078}"; let mut lexer = Lexer::new(s.as_bytes()); let expected = [ @@ -86,6 +86,9 @@ fn check_identifier() { TokenKind::identifier("Ѐ"), TokenKind::identifier("ЀЀ"), TokenKind::identifier("x\u{200C}\u{200D}"), + TokenKind::identifier("x"), + TokenKind::identifier("xx"), + TokenKind::identifier("xxx"), ]; expect_tokens(&mut lexer, &expected);