From cc473855f15ecd46a0391b6239e33b0b062880af Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Thu, 3 Dec 2020 07:46:20 -0800 Subject: [PATCH] Improve lexer by make cursor iterate over bytes (#915) --- boa/src/syntax/lexer/comment.rs | 10 +- boa/src/syntax/lexer/cursor.rs | 416 +++++++++++++++++++++-------- boa/src/syntax/lexer/identifier.rs | 18 +- boa/src/syntax/lexer/mod.rs | 179 +++++++------ boa/src/syntax/lexer/number.rs | 137 +++++----- boa/src/syntax/lexer/operator.rs | 62 ++--- boa/src/syntax/lexer/regex.rs | 67 +++-- boa/src/syntax/lexer/spread.rs | 4 +- boa/src/syntax/lexer/string.rs | 46 ++-- boa/src/syntax/lexer/template.rs | 24 +- boa/src/syntax/lexer/tests.rs | 101 +++++-- 11 files changed, 702 insertions(+), 362 deletions(-) diff --git a/boa/src/syntax/lexer/comment.rs b/boa/src/syntax/lexer/comment.rs index cedd084ca88..9f9c482d6a3 100644 --- a/boa/src/syntax/lexer/comment.rs +++ b/boa/src/syntax/lexer/comment.rs @@ -31,11 +31,11 @@ impl Tokenizer for SingleLineComment { // Skip either to the end of the line or to the end of the input while let Some(ch) = cursor.peek()? { - if ch == '\n' { + if ch == b'\n' { break; } else { // Consume char. - cursor.next_char()?.expect("Comment character vansihed"); + cursor.next_byte()?.expect("Comment character vansihed"); } } Ok(Token::new( @@ -66,10 +66,10 @@ impl Tokenizer for MultiLineComment { let mut new_line = false; loop { - if let Some(ch) = cursor.next_char()? { - if ch == '*' && cursor.next_is('/')? { + if let Some(ch) = cursor.next_byte()? { + if ch == b'*' && cursor.next_is(b'/')? { break; - } else if ch == '\n' { + } else if ch == b'\n' { new_line = true; } } else { diff --git a/boa/src/syntax/lexer/cursor.rs b/boa/src/syntax/lexer/cursor.rs index 539d7330cab..79dc6999695 100644 --- a/boa/src/syntax/lexer/cursor.rs +++ b/boa/src/syntax/lexer/cursor.rs @@ -1,5 +1,4 @@ //! Module implementing the lexer cursor. This is used for managing the input byte stream. 
- use crate::{profiler::BoaProfiler, syntax::ast::Position}; use std::io::{self, Bytes, Error, ErrorKind, Read}; @@ -57,22 +56,38 @@ where } } - /// Peeks the next character. + /// Peeks the next byte. #[inline] - pub(super) fn peek(&mut self) -> Result, Error> { + pub(super) fn peek(&mut self) -> Result, Error> { let _timer = BoaProfiler::global().start_event("cursor::peek()", "Lexing"); + self.iter.peek_byte() + } + + /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). + #[inline] + pub(super) fn peek_n(&mut self, n: u8) -> Result { + let _timer = BoaProfiler::global().start_event("cursor::peek_n()", "Lexing"); + + self.iter.peek_n_bytes(n) + } + + /// Peeks the next UTF-8 character in u32 code point. + #[inline] + pub(super) fn peek_char(&mut self) -> Result, Error> { + let _timer = BoaProfiler::global().start_event("cursor::peek_char()", "Lexing"); + self.iter.peek_char() } - /// Compares the character passed in to the next character, if they match true is returned and the buffer is incremented + /// Compares the byte passed in to the next byte, if they match true is returned and the buffer is incremented #[inline] - pub(super) fn next_is(&mut self, peek: char) -> io::Result { + pub(super) fn next_is(&mut self, byte: u8) -> io::Result { let _timer = BoaProfiler::global().start_event("cursor::next_is()", "Lexing"); Ok(match self.peek()? { - Some(next) if next == peek => { - let _ = self.iter.next_char(); + Some(next) if next == byte => { + let _ = self.next_byte()?; true } _ => false, @@ -80,34 +95,57 @@ where } /// Applies the predicate to the next character and returns the result. - /// Returns false if there is no next character. + /// Returns false if the next character is not a valid ascii or there is no next character. + /// Otherwise returns the result from the predicate on the ascii in char /// /// The buffer is not incremented. 
#[inline] - pub(super) fn next_is_pred(&mut self, pred: &F) -> io::Result + pub(super) fn next_is_ascii_pred(&mut self, pred: &F) -> io::Result where F: Fn(char) -> bool, { - let _timer = BoaProfiler::global().start_event("cursor::next_is_pred()", "Lexing"); + let _timer = BoaProfiler::global().start_event("cursor::next_is_ascii_pred()", "Lexing"); + + Ok(match self.peek()? { + Some(byte) => match byte { + 0..=0x7F => pred(char::from(byte)), + _ => false, + }, + None => false, + }) + } + + /// Applies the predicate to the next UTF-8 character and returns the result. + /// Returns false if there is no next character, otherwise returns the result from the + /// predicate on the ascii char + /// + /// The buffer is not incremented. + #[inline] + pub(super) fn next_is_char_pred(&mut self, pred: &F) -> io::Result + where + F: Fn(u32) -> bool, + { + let _timer = BoaProfiler::global().start_event("cursor::next_is_char_pred()", "Lexing"); - Ok(if let Some(peek) = self.peek()? { + Ok(if let Some(peek) = self.peek_char()? { pred(peek) } else { false }) } - /// Fills the buffer with all characters until the stop character is found. + /// Fills the buffer with all bytes until the stop byte is found. + /// Returns error when reaching the end of the buffer. /// - /// Note: It will not add the stop character to the buffer. - pub(super) fn take_until(&mut self, stop: char, buf: &mut String) -> io::Result<()> { + /// Note that all bytes up until the stop byte are added to the buffer, including the byte right before. + pub(super) fn take_until(&mut self, stop: u8, buf: &mut Vec) -> io::Result<()> { let _timer = BoaProfiler::global().start_event("cursor::take_until()", "Lexing"); loop { if self.next_is(stop)? { return Ok(()); - } else if let Some(ch) = self.next_char()? { - buf.push(ch); + } else if let Some(byte) = self.next_byte()? 
{ + buf.push(byte); } else { return Err(io::Error::new( ErrorKind::UnexpectedEof, @@ -117,21 +155,45 @@ where } } - /// Fills the buffer with characters until the first character (x) for which the predicate (pred) is false - /// (or the next character is none). + /// Fills the buffer with characters until the first ascii character for which the predicate (pred) is false. + /// It also stops when the next character is not an ascii or there is no next character. /// - /// Note that all characters up until x are added to the buffer including the character right before. - pub(super) fn take_while_pred(&mut self, buf: &mut String, pred: &F) -> io::Result<()> + /// Note that all characters up until the stop character are added to the buffer, including the character right before. + pub(super) fn take_while_ascii_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> where F: Fn(char) -> bool, { - let _timer = BoaProfiler::global().start_event("cursor::take_while_pred()", "Lexing"); + let _timer = BoaProfiler::global().start_event("cursor::take_while_ascii_pred()", "Lexing"); + + loop { + if !self.next_is_ascii_pred(pred)? { + return Ok(()); + } else if let Some(byte) = self.next_byte()? { + buf.push(byte); + } else { + // next_is_pred will return false if the next value is None so the None case should already be handled. + unreachable!(); + } + } + } + + /// Fills the buffer with characters until the first character for which the predicate (pred) is false. + /// It also stops when there is no next character. + /// + /// Note that all characters up until the stop character are added to the buffer, including the character right before. + pub(super) fn take_while_char_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> + where + F: Fn(u32) -> bool, + { + let _timer = BoaProfiler::global().start_event("cursor::take_while_char_pred()", "Lexing"); loop { - if !self.next_is_pred(pred)? { + if !self.next_is_char_pred(pred)? 
{ return Ok(()); - } else if let Some(ch) = self.next_char()? { - buf.push(ch); + } else if let Some(ch) = self.peek_char()? { + for _ in 0..utf8_len(ch) { + buf.push(self.next_byte()?.unwrap()); + } } else { // next_is_pred will return false if the next value is None so the None case should already be handled. unreachable!(); @@ -139,7 +201,7 @@ where } } - /// It will fill the buffer with checked ASCII bytes. + /// It will fill the buffer with bytes. /// /// This expects for the buffer to be fully filled. If it's not, it will fail with an /// `UnexpectedEof` I/O error. @@ -150,28 +212,63 @@ where self.iter.fill_bytes(buf) } + /// Retrieves the next byte. + #[inline] + pub(crate) fn next_byte(&mut self) -> Result, Error> { + let _timer = BoaProfiler::global().start_event("cursor::next_byte()", "Lexing"); + + let byte = self.iter.next_byte()?; + + match byte { + Some(b'\r') => { + // Try to take a newline if it's next, for windows "\r\n" newlines + // Otherwise, treat as a Mac OS9 bare '\r' newline + if self.peek()? == Some(b'\n') { + let _ = self.iter.next_byte(); + } + self.next_line(); + } + Some(b'\n') => self.next_line(), + Some(0xE2) => { + // Try to match '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) + let next_bytes = self.peek_n(2)?; + if next_bytes == 0xA8_80 || next_bytes == 0xA9_80 { + self.next_line(); + } else { + // 0xE2 is a utf8 first byte + self.next_column(); + } + } + Some(b) if utf8_is_first_byte(b) => self.next_column(), + _ => {} + } + + Ok(byte) + } + /// Retrieves the next UTF-8 character. 
#[inline] - pub(crate) fn next_char(&mut self) -> Result, Error> { + pub(crate) fn next_char(&mut self) -> Result, Error> { let _timer = BoaProfiler::global().start_event("cursor::next_char()", "Lexing"); - let chr = self.iter.next_char()?; + let ch = self.iter.next_char()?; - match chr { - Some('\r') => { + match ch { + Some(0xD) => { // Try to take a newline if it's next, for windows "\r\n" newlines // Otherwise, treat as a Mac OS9 bare '\r' newline - if self.peek()? == Some('\n') { - let _ = self.iter.next_char(); + if self.peek()? == Some(0xA) { + let _ = self.iter.next_byte(); } self.next_line(); } - Some('\n') | Some('\u{2028}') | Some('\u{2029}') => self.next_line(), + // '\n' | '\u{2028}' | '\u{2029}' + Some(0xA) | Some(0x2028) | Some(0x2029) => self.next_line(), Some(_) => self.next_column(), - None => {} + _ => {} } - Ok(chr) + Ok(ch) } } @@ -179,7 +276,9 @@ where #[derive(Debug)] struct InnerIter { iter: Bytes, - peeked_char: Option>, + num_peeked_bytes: u8, + peeked_bytes: u32, + peeked_char: Option>, } impl InnerIter { @@ -188,6 +287,8 @@ impl InnerIter { fn new(iter: Bytes) -> Self { Self { iter, + num_peeked_bytes: 0, + peeked_bytes: 0, peeked_char: None, } } @@ -197,14 +298,14 @@ impl InnerIter where R: Read, { - /// It will fill the buffer with checked ASCII bytes. + /// It will fill the buffer with checked ascii bytes. /// /// This expects for the buffer to be fully filled. If it's not, it will fail with an /// `UnexpectedEof` I/O error. #[inline] fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> { for byte in buf.iter_mut() { - *byte = self.next_ascii()?.ok_or_else(|| { + *byte = self.next_byte()?.ok_or_else(|| { io::Error::new( io::ErrorKind::UnexpectedEof, "unexpected EOF when filling buffer", @@ -214,90 +315,197 @@ where Ok(()) } - /// Peeks the next UTF-8 checked character. + /// Increments the iter by n bytes. 
#[inline] - pub(super) fn peek_char(&mut self) -> Result, Error> { - if let Some(v) = self.peeked_char { - Ok(v) + fn increment(&mut self, n: u32) -> Result<(), Error> { + for _ in 0..n { + if None == self.next_byte()? { + break; + } + } + Ok(()) + } + + /// Peeks the next byte. + #[inline] + pub(super) fn peek_byte(&mut self) -> Result, Error> { + if self.num_peeked_bytes > 0 { + let byte = self.peeked_bytes as u8; + Ok(Some(byte)) } else { - let chr = self.next_char()?; - self.peeked_char = Some(chr); - Ok(chr) + match self.iter.next().transpose()? { + Some(byte) => { + self.num_peeked_bytes = 1; + self.peeked_bytes = byte as u32; + Ok(Some(byte)) + } + None => Ok(None), + } } } - /// Retrieves the next UTF-8 checked character. - fn next_char(&mut self) -> io::Result> { - if let Some(v) = self.peeked_char.take() { - return Ok(v); + /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). + #[inline] + pub(super) fn peek_n_bytes(&mut self, n: u8) -> Result { + while self.num_peeked_bytes < n && self.num_peeked_bytes < 4 { + match self.iter.next().transpose()? { + Some(byte) => { + self.peeked_bytes |= (byte as u32) << (self.num_peeked_bytes * 8); + self.num_peeked_bytes += 1; + } + None => break, + }; } - let first_byte = match self.iter.next().transpose()? { - Some(b) => b, - None => return Ok(None), - }; + match n { + 0 => Ok(0), + 1 => Ok(self.peeked_bytes & 0xFF), + 2 => Ok(self.peeked_bytes & 0xFFFF), + 3 => Ok(self.peeked_bytes & 0xFFFFFF), + _ => Ok(self.peeked_bytes), + } + } - let chr: char = if first_byte < 0x80 { - // 0b0xxx_xxxx - first_byte.into() + /// Peeks the next unchecked character in u32 code point. 
+ #[inline] + pub(super) fn peek_char(&mut self) -> Result, Error> { + if let Some(ch) = self.peeked_char { + Ok(ch) } else { - let mut buf = [first_byte, 0u8, 0u8, 0u8]; - let num_bytes = if first_byte < 0xE0 { - // 0b110x_xxxx - 2 - } else if first_byte < 0xF0 { - // 0b1110_xxxx - 3 - } else { - // 0b1111_0xxx - 4 + // Decode UTF-8 + let x = match self.peek_byte()? { + Some(b) if b < 128 => { + self.peeked_char = Some(Some(b as u32)); + return Ok(Some(b as u32)); + } + Some(b) => b, + None => { + self.peeked_char = None; + return Ok(None); + } }; - for b in buf.iter_mut().take(num_bytes).skip(1) { - let next = match self.iter.next() { - Some(Ok(b)) => b, - Some(Err(e)) => return Err(e), - None => { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "stream did not contain valid UTF-8", - )) - } - }; - - *b = next; + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + let y = (self.peek_n_bytes(2)? >> 8) as u8; + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = (self.peek_n_bytes(3)? >> 16) as u8; + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = (self.peek_n_bytes(4)? >> 24) as u8; + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + }; + + self.peeked_char = Some(Some(ch)); + Ok(Some(ch)) + } + } + + /// Retrieves the next byte + #[inline] + fn next_byte(&mut self) -> io::Result> { + self.peeked_char = None; + if self.num_peeked_bytes > 0 { + let byte = (self.peeked_bytes & 0xFF) as u8; + self.num_peeked_bytes -= 1; + self.peeked_bytes >>= 8; + Ok(Some(byte)) + } else { + self.iter.next().transpose() + } + } + + /// Retrieves the next unchecked char in u32 code point. 
+ #[inline] + fn next_char(&mut self) -> io::Result> { + if let Some(ch) = self.peeked_char.take() { + if let Some(c) = ch { + self.increment(utf8_len(c))?; } + return Ok(ch); + } - if let Ok(s) = std::str::from_utf8(&buf) { - if let Some(chr) = s.chars().next() { - chr - } else { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "stream did not contain valid UTF-8", - )); - } - } else { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "stream did not contain valid UTF-8", - )); + // Decode UTF-8 + let x = match self.next_byte()? { + Some(b) if b < 128 => return Ok(Some(b as u32)), + Some(b) => b, + None => return Ok(None), + }; + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + let y = unwrap_or_0(self.next_byte()?); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = unwrap_or_0(self.next_byte()?); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = unwrap_or_0(self.next_byte()?); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } }; - Ok(Some(chr)) + Ok(Some(ch)) } +} - /// Retrieves the next ASCII checked character. - #[inline] - fn next_ascii(&mut self) -> io::Result> { - match self.next_char() { - Ok(Some(chr)) if chr.is_ascii() => Ok(Some(chr as u8)), - Ok(None) => Ok(None), - _ => Err(io::Error::new( - io::ErrorKind::InvalidData, - "non-ASCII byte found", - )), - } +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111; + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. 
+#[inline]
+fn utf8_first_byte(byte: u8, width: u32) -> u32 {
+    (byte & (0x7F >> width)) as u32
+}
+
+/// Returns the value of `ch` updated with continuation byte `byte`.
+#[inline]
+fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
+    (ch << 6) | (byte & CONT_MASK) as u32
+}
+
+/// Checks whether the byte is a UTF-8 first byte (i.e., ascii byte or starts with the
+/// bits `11`).
+#[inline]
+fn utf8_is_first_byte(byte: u8) -> bool {
+    byte <= 0x7F || (byte >> 6) == 0b11
+}
+
+#[inline]
+fn unwrap_or_0(opt: Option<u8>) -> u8 {
+    match opt {
+        Some(byte) => byte,
+        None => 0,
+    }
+}
+
+#[inline]
+fn utf8_len(ch: u32) -> u32 {
+    if ch <= 0x7F {
+        1
+    } else if ch <= 0x7FF {
+        2
+    } else if ch <= 0xFFFF {
+        3
+    } else {
+        4
+    }
+}
 }
diff --git a/boa/src/syntax/lexer/identifier.rs b/boa/src/syntax/lexer/identifier.rs
index 19dd6dc608d..b8d35eecee6 100644
--- a/boa/src/syntax/lexer/identifier.rs
+++ b/boa/src/syntax/lexer/identifier.rs
@@ -8,7 +8,9 @@ use crate::{
         lexer::{Token, TokenKind},
     },
 };
+use core::convert::TryFrom;
 use std::io::Read;
+use std::str;
 
 const STRICT_FORBIDDEN_IDENTIFIERS: [&str; 11] = [
     "eval",
@@ -51,13 +53,21 @@ impl Tokenizer for Identifier {
     {
         let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");
 
-        let mut buf = self.init.to_string();
+        let mut init_buf = [0u8; 4];
+        let mut buf = Vec::new();
+        self.init.encode_utf8(&mut init_buf);
+        buf.extend(init_buf.iter().take(self.init.len_utf8()));
 
-        cursor.take_while_pred(&mut buf, &|c: char| {
-            c.is_alphabetic() || c.is_digit(10) || c == '_'
+        cursor.take_while_char_pred(&mut buf, &|c: u32| {
+            if let Ok(c) = char::try_from(c) {
+                c.is_alphabetic() || c.is_digit(10) || c == '_'
+            } else {
+                false
+            }
         })?;
 
-        let tk = match buf.as_str() {
+        let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
+        let tk = match token_str {
             "true" => TokenKind::BooleanLiteral(true),
             "false" => TokenKind::BooleanLiteral(false),
             "null" => TokenKind::NullLiteral,
diff --git 
a/boa/src/syntax/lexer/mod.rs b/boa/src/syntax/lexer/mod.rs index f5f356b496e..a779453f65c 100644 --- a/boa/src/syntax/lexer/mod.rs +++ b/boa/src/syntax/lexer/mod.rs @@ -42,6 +42,7 @@ use self::{ }; use crate::syntax::ast::{Punctuator, Span}; pub use crate::{profiler::BoaProfiler, syntax::ast::Position}; +use core::convert::TryFrom; pub use error::Error; use std::io::Read; pub use token::{Token, TokenKind}; @@ -69,12 +70,12 @@ impl Lexer { /// * ECMAScript standard uses `\{Space_Separator}` + `\u{0009}`, `\u{000B}`, `\u{000C}`, `\u{FEFF}` /// /// [More information](https://tc39.es/ecma262/#table-32) - fn is_whitespace(ch: char) -> bool { + fn is_whitespace(ch: u32) -> bool { matches!( ch, - '\u{0020}' | '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{00A0}' | '\u{FEFF}' | + 0x0020 | 0x0009 | 0x000B | 0x000C | 0x00A0 | 0xFEFF | // Unicode Space_Seperator category (minus \u{0020} and \u{00A0} which are allready stated above) - '\u{1680}' | '\u{2000}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}' + 0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000 ) } @@ -127,12 +128,12 @@ impl Lexer { if let Some(c) = self.cursor.peek()? { match c { - '/' => { - self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/' + b'/' => { + self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/' SingleLineComment.lex(&mut self.cursor, start) } - '*' => { - self.cursor.next_char()?.expect("* token vanished"); // Consume the '*' + b'*' => { + self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*' MultiLineComment.lex(&mut self.cursor, start) } ch => { @@ -140,9 +141,9 @@ impl Lexer { InputElement::Div | InputElement::TemplateTail => { // Only div punctuator allowed, regex not. - if ch == '=' { + if ch == b'=' { // Indicates this is an AssignDiv. 
- self.cursor.next_char()?.expect("= token vanished"); // Consume the '=' + self.cursor.next_byte()?.expect("= token vanished"); // Consume the '=' Ok(Token::new( Punctuator::AssignDiv.into(), Span::new(start, self.cursor.pos()), @@ -178,90 +179,104 @@ impl Lexer { { let _timer = BoaProfiler::global().start_event("next()", "Lexing"); - let (start, next_chr) = loop { + let (start, next_ch) = loop { let start = self.cursor.pos(); - if let Some(next_chr) = self.cursor.next_char()? { + if let Some(next_ch) = self.cursor.next_char()? { // Ignore whitespace - if !Self::is_whitespace(next_chr) { - break (start, next_chr); + if !Self::is_whitespace(next_ch) { + break (start, next_ch); } } else { return Ok(None); } }; - let token = match next_chr { - '\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new( - TokenKind::LineTerminator, - Span::new(start, self.cursor.pos()), - )), - '"' | '\'' => StringLiteral::new(next_chr).lex(&mut self.cursor, start), - '`' => TemplateLiteral.lex(&mut self.cursor, start), - _ if next_chr.is_digit(10) => NumberLiteral::new(next_chr).lex(&mut self.cursor, start), - _ if next_chr.is_alphabetic() || next_chr == '$' || next_chr == '_' => { - Identifier::new(next_chr).lex(&mut self.cursor, start) - } - ';' => Ok(Token::new( - Punctuator::Semicolon.into(), - Span::new(start, self.cursor.pos()), - )), - ':' => Ok(Token::new( - Punctuator::Colon.into(), - Span::new(start, self.cursor.pos()), - )), - '.' 
=> SpreadLiteral::new().lex(&mut self.cursor, start), - '(' => Ok(Token::new( - Punctuator::OpenParen.into(), - Span::new(start, self.cursor.pos()), - )), - ')' => Ok(Token::new( - Punctuator::CloseParen.into(), - Span::new(start, self.cursor.pos()), - )), - ',' => Ok(Token::new( - Punctuator::Comma.into(), - Span::new(start, self.cursor.pos()), - )), - '{' => Ok(Token::new( - Punctuator::OpenBlock.into(), - Span::new(start, self.cursor.pos()), - )), - '}' => Ok(Token::new( - Punctuator::CloseBlock.into(), - Span::new(start, self.cursor.pos()), - )), - '[' => Ok(Token::new( - Punctuator::OpenBracket.into(), - Span::new(start, self.cursor.pos()), - )), - ']' => Ok(Token::new( - Punctuator::CloseBracket.into(), - Span::new(start, self.cursor.pos()), - )), - '?' => Ok(Token::new( - Punctuator::Question.into(), - Span::new(start, self.cursor.pos()), - )), - '/' => self.lex_slash_token(start), - '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => { - Operator::new(next_chr).lex(&mut self.cursor, start) + if let Ok(c) = char::try_from(next_ch) { + let token = match c { + '\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new( + TokenKind::LineTerminator, + Span::new(start, self.cursor.pos()), + )), + '"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start), + '`' => TemplateLiteral.lex(&mut self.cursor, start), + _ if c.is_digit(10) => { + NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start) + } + _ if c.is_alphabetic() || c == '$' || c == '_' => { + Identifier::new(c).lex(&mut self.cursor, start) + } + ';' => Ok(Token::new( + Punctuator::Semicolon.into(), + Span::new(start, self.cursor.pos()), + )), + ':' => Ok(Token::new( + Punctuator::Colon.into(), + Span::new(start, self.cursor.pos()), + )), + '.' 
=> SpreadLiteral::new().lex(&mut self.cursor, start), + '(' => Ok(Token::new( + Punctuator::OpenParen.into(), + Span::new(start, self.cursor.pos()), + )), + ')' => Ok(Token::new( + Punctuator::CloseParen.into(), + Span::new(start, self.cursor.pos()), + )), + ',' => Ok(Token::new( + Punctuator::Comma.into(), + Span::new(start, self.cursor.pos()), + )), + '{' => Ok(Token::new( + Punctuator::OpenBlock.into(), + Span::new(start, self.cursor.pos()), + )), + '}' => Ok(Token::new( + Punctuator::CloseBlock.into(), + Span::new(start, self.cursor.pos()), + )), + '[' => Ok(Token::new( + Punctuator::OpenBracket.into(), + Span::new(start, self.cursor.pos()), + )), + ']' => Ok(Token::new( + Punctuator::CloseBracket.into(), + Span::new(start, self.cursor.pos()), + )), + '?' => Ok(Token::new( + Punctuator::Question.into(), + Span::new(start, self.cursor.pos()), + )), + '/' => self.lex_slash_token(start), + '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => { + Operator::new(next_ch as u8).lex(&mut self.cursor, start) + } + _ => { + let details = format!( + "unexpected '{}' at line {}, column {}", + c, + start.line_number(), + start.column_number() + ); + Err(Error::syntax(details, start)) + } + }?; + + if token.kind() == &TokenKind::Comment { + // Skip comment + self.next() + } else { + Ok(Some(token)) } - _ => { - let details = format!( - "unexpected '{}' at line {}, column {}", - next_chr, + } else { + Err(Error::syntax( + format!( + "unexpected utf-8 char '\\u{}' at line {}, column {}", + next_ch, start.line_number(), start.column_number() - ); - Err(Error::syntax(details, start)) - } - }?; - - if token.kind() == &TokenKind::Comment { - // Skip comment - self.next() - } else { - Ok(Some(token)) + ), + start, + )) } } } diff --git a/boa/src/syntax/lexer/number.rs b/boa/src/syntax/lexer/number.rs index deb4db05162..4e5c97d3416 100644 --- a/boa/src/syntax/lexer/number.rs +++ b/boa/src/syntax/lexer/number.rs @@ -9,6 +9,7 @@ use crate::{ lexer::{token::Numeric, 
Token}, }, }; +use std::str; use std::{io::Read, str::FromStr}; /// Number literal lexing. @@ -23,12 +24,12 @@ use std::{io::Read, str::FromStr}; /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Number_type #[derive(Debug, Clone, Copy)] pub(super) struct NumberLiteral { - init: char, + init: u8, } impl NumberLiteral { /// Creates a new string literal lexer. - pub(super) fn new(init: char) -> Self { + pub(super) fn new(init: u8) -> Self { Self { init } } } @@ -63,8 +64,9 @@ impl NumericKind { } } +#[inline] fn take_signed_integer( - buf: &mut String, + buf: &mut Vec, cursor: &mut Cursor, kind: &NumericKind, ) -> Result<(), Error> @@ -73,30 +75,31 @@ where { // The next part must be SignedInteger. // This is optionally a '+' or '-' followed by 1 or more DecimalDigits. - match cursor.next_char()? { - Some('+') => { - buf.push('+'); - if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? { + match cursor.next_byte()? { + Some(b'+') => { + buf.push(b'+'); + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? { // A digit must follow the + or - symbol. return Err(Error::syntax("No digit found after + symbol", cursor.pos())); } } - Some('-') => { - buf.push('-'); - if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? { + Some(b'-') => { + buf.push(b'-'); + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? { // A digit must follow the + or - symbol. 
return Err(Error::syntax("No digit found after - symbol", cursor.pos())); } } - Some(c) if c.is_digit(kind.base()) => buf.push(c), - Some(c) => { - return Err(Error::syntax( - format!( - "When lexing exponential value found unexpected char: '{}'", - c - ), - cursor.pos(), - )); + Some(byte) => { + let ch = char::from(byte); + if ch.is_ascii() && ch.is_digit(kind.base()) { + buf.push(byte); + } else { + return Err(Error::syntax( + "When lexing exponential value found unexpected char", + cursor.pos(), + )); + } } None => { return Err(Error::syntax( @@ -107,7 +110,7 @@ where } // Consume the decimal digits. - cursor.take_while_pred(buf, &|c: char| c.is_digit(kind.base()))?; + cursor.take_while_ascii_pred(buf, &|ch| ch.is_digit(kind.base()))?; Ok(()) } @@ -118,12 +121,12 @@ where /// - [ECMAScript Specification][spec] /// /// [spec]: https://tc39.es/ecma262/#sec-literals-numeric-literals +#[inline] fn check_after_numeric_literal(cursor: &mut Cursor) -> Result<(), Error> where R: Read, { - let pred = |ch: char| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_'; - if cursor.next_is_pred(&pred)? { + if cursor.next_is_ascii_pred(&|ch| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_')? { Err(Error::syntax( "a numeric literal must not be followed by an alphanumeric, $ or _ characters", cursor.pos(), @@ -140,17 +143,17 @@ impl Tokenizer for NumberLiteral { { let _timer = BoaProfiler::global().start_event("NumberLiteral", "Lexing"); - let mut buf = self.init.to_string(); + let mut buf = vec![self.init]; // Default assume the number is a base 10 integer. let mut kind = NumericKind::Integer(10); let c = cursor.peek(); - if self.init == '0' { + if self.init == b'0' { if let Some(ch) = c? { match ch { - 'x' | 'X' => { + b'x' | b'X' => { // Remove the initial '0' from buffer. 
cursor.next_char()?.expect("x or X character vanished"); buf.pop(); @@ -159,16 +162,14 @@ impl Tokenizer for NumberLiteral { kind = NumericKind::Integer(16); // Checks if the next char after '0x' is a digit of that base. if not return an error. - if let Some(digit) = cursor.peek()? { - if !digit.is_digit(16) { - return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", - cursor.pos(), - )); - } + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(16))? { + return Err(Error::syntax( + "expected hexadecimal digit after number base prefix", + cursor.pos(), + )); } } - 'o' | 'O' => { + b'o' | b'O' => { // Remove the initial '0' from buffer. cursor.next_char()?.expect("o or O character vanished"); buf.pop(); @@ -177,16 +178,14 @@ impl Tokenizer for NumberLiteral { kind = NumericKind::Integer(8); // Checks if the next char after '0o' is a digit of that base. if not return an error. - if let Some(digit) = cursor.peek()? { - if !digit.is_digit(8) { - return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", - cursor.pos(), - )); - } + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(8))? { + return Err(Error::syntax( + "expected hexadecimal digit after number base prefix", + cursor.pos(), + )); } } - 'b' | 'B' => { + b'b' | b'B' => { // Remove the initial '0' from buffer. cursor.next_char()?.expect("b or B character vanished"); buf.pop(); @@ -195,16 +194,14 @@ impl Tokenizer for NumberLiteral { kind = NumericKind::Integer(2); // Checks if the next char after '0b' is a digit of that base. if not return an error. - if let Some(digit) = cursor.peek()? { - if !digit.is_digit(2) { - return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", - cursor.pos(), - )); - } + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(2))? 
{ + return Err(Error::syntax( + "expected hexadecimal digit after number base prefix", + cursor.pos(), + )); } } - 'n' => { + b'n' => { cursor.next_char()?.expect("n character vanished"); // DecimalBigIntegerLiteral '0n' @@ -213,7 +210,8 @@ impl Tokenizer for NumberLiteral { Span::new(start_pos, cursor.pos()), )); } - ch => { + byte => { + let ch = char::from(byte); if ch.is_digit(8) { // LegacyOctalIntegerLiteral if cursor.strict_mode() { @@ -226,7 +224,7 @@ impl Tokenizer for NumberLiteral { // Remove the initial '0' from buffer. buf.pop(); - buf.push(cursor.next_char()?.expect("'0' character vanished")); + buf.push(cursor.next_byte()?.expect("'0' character vanished")); kind = NumericKind::Integer(8); } @@ -240,7 +238,7 @@ impl Tokenizer for NumberLiteral { start_pos, )); } else { - buf.push(cursor.next_char()?.expect("Number digit vanished")); + buf.push(cursor.next_byte()?.expect("Number digit vanished")); } } // Else indicates that the symbol is a non-number. } @@ -256,42 +254,42 @@ impl Tokenizer for NumberLiteral { } // Consume digits until a non-digit character is encountered or all the characters are consumed. - cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; + cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; // The non-digit character could be: // 'n' To indicate a BigIntLiteralSuffix. // '.' To indicate a decimal seperator. // 'e' | 'E' To indicate an ExponentPart. match cursor.peek()? { - Some('n') => { + Some(b'n') => { // DecimalBigIntegerLiteral // Lexing finished. // Consume the n - cursor.next_char()?.expect("n character vanished"); + cursor.next_byte()?.expect("n character vanished"); kind = kind.to_bigint(); } - Some('.') => { + Some(b'.') => { if kind.base() == 10 { // Only base 10 numbers can have a decimal seperator. // Number literal lexing finished if a . is found for a number in a different base. - cursor.next_char()?.expect(". token vanished"); - buf.push('.'); // Consume the . 
+ cursor.next_byte()?.expect(". token vanished"); + buf.push(b'.'); // Consume the . kind = NumericKind::Rational; // Consume digits until a non-digit character is encountered or all the characters are consumed. - cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; + cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; // The non-digit character at this point must be an 'e' or 'E' to indicate an Exponent Part. // Another '.' or 'n' is not allowed. match cursor.peek()? { - Some('e') | Some('E') => { + Some(b'e') | Some(b'E') => { // Consume the ExponentIndicator. - cursor.next_char()?.expect("e or E token vanished"); + cursor.next_byte()?.expect("e or E token vanished"); - buf.push('E'); + buf.push(b'E'); take_signed_integer(&mut buf, cursor, &kind)?; } @@ -301,10 +299,10 @@ impl Tokenizer for NumberLiteral { } } } - Some('e') | Some('E') => { + Some(b'e') | Some(b'E') => { kind = NumericKind::Rational; - cursor.next_char()?.expect("e or E character vanished"); // Consume the ExponentIndicator. - buf.push('E'); + cursor.next_byte()?.expect("e or E character vanished"); // Consume the ExponentIndicator. 
+ buf.push(b'E'); take_signed_integer(&mut buf, cursor, &kind)?; } Some(_) | None => { @@ -314,14 +312,15 @@ impl Tokenizer for NumberLiteral { check_after_numeric_literal(cursor)?; + let num_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) }; let num = match kind { NumericKind::BigInt(base) => { Numeric::BigInt( - BigInt::from_string_radix(&buf, base).expect("Could not convert to BigInt") + BigInt::from_string_radix(num_str, base).expect("Could not convert to BigInt") ) } NumericKind::Rational /* base: 10 */ => { - let val = f64::from_str(&buf).expect("Failed to parse float after checks"); + let val = f64::from_str(num_str).expect("Failed to parse float after checks"); let int_val = val as i32; // The truncated float should be identically to the non-truncated float for the conversion to be loss-less, @@ -335,12 +334,12 @@ impl Tokenizer for NumberLiteral { } }, NumericKind::Integer(base) => { - if let Ok(num) = i32::from_str_radix(&buf, base) { + if let Ok(num) = i32::from_str_radix(num_str, base) { Numeric::Integer(num) } else { let b = f64::from(base); let mut result = 0.0_f64; - for c in buf.chars() { + for c in num_str.chars() { let digit = f64::from(c.to_digit(base).expect("could not parse digit after already checking validity")); result = result * b + digit; } diff --git a/boa/src/syntax/lexer/operator.rs b/boa/src/syntax/lexer/operator.rs index 5aa72c7d559..11971d384c4 100644 --- a/boa/src/syntax/lexer/operator.rs +++ b/boa/src/syntax/lexer/operator.rs @@ -17,8 +17,8 @@ macro_rules! vop { ($cursor:ident, $assign_op:expr, $op:expr) => ({ match $cursor.peek()? { None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())), - Some('=') => { - $cursor.next_char()?.expect("= token vanished"); + Some(b'=') => { + $cursor.next_byte()?.expect("= token vanished"); $cursor.next_column(); $assign_op } @@ -28,13 +28,13 @@ macro_rules! 
vop { ($cursor:ident, $assign_op:expr, $op:expr, {$($case:pat => $block:expr), +}) => ({ match $cursor.peek()? { None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())), - Some('=') => { - $cursor.next_char()?.expect("= token vanished"); + Some(b'=') => { + $cursor.next_byte()?.expect("= token vanished"); $cursor.next_column(); $assign_op }, $($case => { - $cursor.next_char()?.expect("Token vanished"); + $cursor.next_byte()?.expect("Token vanished"); $cursor.next_column(); $block })+, @@ -44,7 +44,7 @@ macro_rules! vop { ($cursor:ident, $op:expr, {$($case:pat => $block:expr),+}) => { match $cursor.peek().ok_or_else(|| Error::syntax("could not preview next value", $cursor.pos()))? { $($case => { - $cursor.next_char()?; + $cursor.next_byte()?; $cursor.next_column(); $block })+, @@ -72,7 +72,7 @@ macro_rules! op { #[derive(Debug, Clone, Copy)] pub(super) struct Operator { - init: char, + init: u8, } /// Operator lexing. @@ -87,7 +87,7 @@ pub(super) struct Operator { /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators impl Operator { /// Creates a new operator lexer. 
- pub(super) fn new(init: char) -> Self { + pub(super) fn new(init: u8) -> Self { Self { init } } } @@ -100,61 +100,63 @@ impl Tokenizer for Operator { let _timer = BoaProfiler::global().start_event("Operator", "Lexing"); match self.init { - '*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), { - Some('*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp)) + b'*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), { + Some(b'*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp)) }), - '+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), { - Some('+') => Ok(Punctuator::Inc) + b'+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), { + Some(b'+') => Ok(Punctuator::Inc) }), - '-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), { - Some('-') => { + b'-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), { + Some(b'-') => { Ok(Punctuator::Dec) } }), - '%' => op!( + b'%' => op!( cursor, start_pos, Ok(Punctuator::AssignMod), Ok(Punctuator::Mod) ), - '|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), { - Some('|') => Ok(Punctuator::BoolOr) + b'|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), { + Some(b'|') => Ok(Punctuator::BoolOr) }), - '&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), { - Some('&') => Ok(Punctuator::BoolAnd) + b'&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), { + Some(b'&') => Ok(Punctuator::BoolAnd) }), - '^' => op!( + b'^' => op!( cursor, start_pos, Ok(Punctuator::AssignXor), Ok(Punctuator::Xor) ), - '=' => op!(cursor, start_pos, if cursor.next_is('=')? { + b'=' => op!(cursor, start_pos, if cursor.next_is(b'=')? 
{ Ok(Punctuator::StrictEq) } else { Ok(Punctuator::Eq) }, Ok(Punctuator::Assign), { - Some('>') => { + Some(b'>') => { Ok(Punctuator::Arrow) } }), - '<' => op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), { - Some('<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh)) - }), - '>' => { + b'<' => { + op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), { + Some(b'<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh)) + }) + } + b'>' => { op!(cursor, start_pos, Ok(Punctuator::GreaterThanOrEq), Ok(Punctuator::GreaterThan), { - Some('>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), { - Some('>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh)) + Some(b'>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), { + Some(b'>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh)) }) }) } - '!' => op!( + b'!' => op!( cursor, start_pos, vop!(cursor, Ok(Punctuator::StrictNotEq), Ok(Punctuator::NotEq)), Ok(Punctuator::Not) ), - '~' => Ok(Token::new( + b'~' => Ok(Token::new( Punctuator::Neg.into(), Span::new(start_pos, cursor.pos()), )), diff --git a/boa/src/syntax/lexer/regex.rs b/boa/src/syntax/lexer/regex.rs index 2367c44d70f..9b3996c5313 100644 --- a/boa/src/syntax/lexer/regex.rs +++ b/boa/src/syntax/lexer/regex.rs @@ -9,6 +9,8 @@ use crate::{ }, }; use bitflags::bitflags; +use std::io::{self, ErrorKind}; +use std::str; use std::{ fmt::{self, Display, Formatter}, io::Read, @@ -39,11 +41,11 @@ impl Tokenizer for RegexLiteral { { let _timer = BoaProfiler::global().start_event("RegexLiteral", "Lexing"); - let mut body = String::new(); + let mut body = Vec::new(); // Lex RegularExpressionBody. loop { - match cursor.next_char()? { + match cursor.next_byte()? { None => { // Abrupt end. 
return Err(Error::syntax( @@ -51,29 +53,45 @@ impl Tokenizer for RegexLiteral { cursor.pos(), )); } - Some(c) => { - match c { - '/' => break, // RegularExpressionBody finished. - '\n' | '\r' | '\u{2028}' | '\u{2029}' => { + Some(b) => { + match b { + b'/' => break, // RegularExpressionBody finished. + b'\n' | b'\r' => { // Not allowed in Regex literal. return Err(Error::syntax( "new lines are not allowed in regular expressions", cursor.pos(), )); } - '\\' => { + 0xE2 if (cursor.peek_n(2)? == 0xA8_80 || cursor.peek_n(2)? == 0xA9_80) => { + // '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed + return Err(Error::syntax( + "new lines are not allowed in regular expressions", + cursor.pos(), + )); + } + b'\\' => { // Escape sequence - body.push('\\'); - if let Some(sc) = cursor.next_char()? { + body.push(b'\\'); + if let Some(sc) = cursor.next_byte()? { match sc { - '\n' | '\r' | '\u{2028}' | '\u{2029}' => { + b'\n' | b'\r' => { // Not allowed in Regex literal. return Err(Error::syntax( "new lines are not allowed in regular expressions", cursor.pos(), )); } - ch => body.push(ch), + 0xE2 if (cursor.peek_n(2)? == 0xA8_80 + || cursor.peek_n(2)? == 0xA9_80) => + { + // '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed + return Err(Error::syntax( + "new lines are not allowed in regular expressions", + cursor.pos(), + )); + } + b => body.push(b), } } else { // Abrupt end of regex. 
@@ -83,20 +101,31 @@ impl Tokenizer for RegexLiteral { )); } } - _ => body.push(c), + _ => body.push(b), } } } } - let mut flags = String::new(); + let mut flags = Vec::new(); let flags_start = cursor.pos(); - cursor.take_while_pred(&mut flags, &char::is_alphabetic)?; - - Ok(Token::new( - TokenKind::regular_expression_literal(body, parse_regex_flags(&flags, flags_start)?), - Span::new(start_pos, cursor.pos()), - )) + cursor.take_while_ascii_pred(&mut flags, &|c: char| c.is_alphabetic())?; + + let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) }; + if let Ok(body_str) = str::from_utf8(body.as_slice()) { + Ok(Token::new( + TokenKind::regular_expression_literal( + body_str, + parse_regex_flags(flags_str, flags_start)?, + ), + Span::new(start_pos, cursor.pos()), + )) + } else { + Err(Error::from(io::Error::new( + ErrorKind::InvalidData, + "Invalid UTF-8 character in regular expressions", + ))) + } } } diff --git a/boa/src/syntax/lexer/spread.rs b/boa/src/syntax/lexer/spread.rs index cc8e0ad36f9..422ed97ab9e 100644 --- a/boa/src/syntax/lexer/spread.rs +++ b/boa/src/syntax/lexer/spread.rs @@ -38,8 +38,8 @@ impl Tokenizer for SpreadLiteral { let _timer = BoaProfiler::global().start_event("SpreadLiteral", "Lexing"); // . or ... - if cursor.next_is('.')? { - if cursor.next_is('.')? { + if cursor.next_is(b'.')? { + if cursor.next_is(b'.')? 
{ Ok(Token::new( Punctuator::Spread.into(), Span::new(start_pos, cursor.pos()), diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 4cbcbbae6e5..ad85c40e10c 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -8,6 +8,7 @@ use crate::{ lexer::{Token, TokenKind}, }, }; +use core::convert::TryFrom; use std::{ io::{self, ErrorKind, Read}, str, @@ -58,12 +59,13 @@ impl Tokenizer for StringLiteral { let mut buf: Vec = Vec::new(); loop { let next_chr_start = cursor.pos(); - let next_chr = cursor.next_char()?.ok_or_else(|| { + let next_chr = char::try_from(cursor.next_char()?.ok_or_else(|| { Error::from(io::Error::new( ErrorKind::UnexpectedEof, "unterminated string literal", )) - })?; + })?) + .unwrap(); match next_chr { '\'' if self.terminator == StringTerminator::SingleQuote => { @@ -76,22 +78,22 @@ impl Tokenizer for StringLiteral { let _timer = BoaProfiler::global() .start_event("StringLiteral - escape sequence", "Lexing"); - let escape = cursor.next_char()?.ok_or_else(|| { + let escape = cursor.next_byte()?.ok_or_else(|| { Error::from(io::Error::new( ErrorKind::UnexpectedEof, "unterminated escape sequence in string literal", )) })?; - if escape != '\n' { + if escape != b'\n' { match escape { - 'n' => buf.push('\n' as u16), - 'r' => buf.push('\r' as u16), - 't' => buf.push('\t' as u16), - 'b' => buf.push('\x08' as u16), - 'f' => buf.push('\x0c' as u16), - '0' => buf.push('\0' as u16), - 'x' => { + b'n' => buf.push('\n' as u16), + b'r' => buf.push('\r' as u16), + b't' => buf.push('\t' as u16), + b'b' => buf.push('\x08' as u16), + b'f' => buf.push('\x0c' as u16), + b'0' => buf.push('\0' as u16), + b'x' => { let mut code_point_utf8_bytes = [0u8; 2]; cursor.fill_bytes(&mut code_point_utf8_bytes)?; let code_point_str = str::from_utf8(&code_point_utf8_bytes) @@ -106,17 +108,20 @@ impl Tokenizer for StringLiteral { buf.push(code_point); } - 'u' => { + b'u' => { // Support \u{X..X} (Unicode Codepoint) - if 
cursor.next_is('{')? { - cursor.next_char()?.expect("{ character vanished"); // Consume the '{'. + if cursor.next_is(b'{')? { + cursor.next_byte()?.expect("{ character vanished"); // Consume the '{'. // TODO: use bytes for a bit better performance (using stack) - let mut code_point_str = String::with_capacity(6); - cursor.take_until('}', &mut code_point_str)?; + let mut code_point_buf = Vec::with_capacity(6); + cursor.take_until(b'}', &mut code_point_buf)?; - cursor.next_char()?.expect("} character vanished"); // Consume the '}'. + cursor.next_byte()?.expect("} character vanished"); // Consume the '}'. + let code_point_str = unsafe { + str::from_utf8_unchecked(code_point_buf.as_slice()) + }; // We know this is a single unicode codepoint, convert to u32 let code_point = u32::from_str_radix(&code_point_str, 16) .map_err(|_| { @@ -156,13 +161,12 @@ impl Tokenizer for StringLiteral { buf.push(code_point); } } - '\'' | '"' | '\\' => buf.push(escape as u16), - ch => { + b'\'' | b'"' | b'\\' => buf.push(escape as u16), + _ => { let details = format!( - "invalid escape sequence `{}` at line {}, column {}", + "invalid escape sequence at line {}, column {}", next_chr_start.line_number(), next_chr_start.column_number(), - ch ); return Err(Error::syntax(details, cursor.pos())); } diff --git a/boa/src/syntax/lexer/template.rs b/boa/src/syntax/lexer/template.rs index c51763c7f33..28bccf5e1cb 100644 --- a/boa/src/syntax/lexer/template.rs +++ b/boa/src/syntax/lexer/template.rs @@ -9,6 +9,7 @@ use crate::{ }, }; use std::io::{self, ErrorKind, Read}; +use std::str; /// Template literal lexing. /// @@ -30,23 +31,30 @@ impl Tokenizer for TemplateLiteral { { let _timer = BoaProfiler::global().start_event("TemplateLiteral", "Lexing"); - let mut buf = String::new(); + let mut buf = Vec::new(); loop { - match cursor.next_char()? { + match cursor.next_byte()? 
{ None => { return Err(Error::from(io::Error::new( ErrorKind::UnexpectedEof, "Unterminated template literal", ))); } - Some('`') => break, // Template literal finished. - Some(next_ch) => buf.push(next_ch), // TODO when there is an expression inside the literal + Some(b'`') => break, // Template literal finished. + Some(next_byte) => buf.push(next_byte), // TODO when there is an expression inside the literal } } - Ok(Token::new( - TokenKind::template_literal(buf), - Span::new(start_pos, cursor.pos()), - )) + if let Ok(s) = str::from_utf8(buf.as_slice()) { + Ok(Token::new( + TokenKind::template_literal(s), + Span::new(start_pos, cursor.pos()), + )) + } else { + Err(Error::from(io::Error::new( + ErrorKind::InvalidData, + "Invalid UTF-8 character in template literal", + ))) + } } } diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index b78f5d77c7f..4dd22e194cf 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -6,6 +6,7 @@ use super::token::Numeric; use super::*; use super::{Error, Position}; use crate::syntax::ast::Keyword; +use std::str; fn span(start: (u32, u32), end: (u32, u32)) -> Span { Span::new(Position::new(start.0, start.1), Position::new(end.0, end.1)) @@ -280,19 +281,19 @@ fn check_positions_codepoint() { // String token starts on column 13 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 13), (1, 34)) + span((1, 13), (1, 36)) ); - // Close parenthesis token starts on column 34 + // Close parenthesis token starts on column 36 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 34), (1, 35)) + span((1, 36), (1, 37)) ); - // Semi Colon token starts on column 35 + // Semi Colon token starts on column 37 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 35), (1, 36)) + span((1, 37), (1, 38)) ); } @@ -554,38 +555,102 @@ fn addition_no_spaces_e_number() { } #[test] -fn take_while_pred_simple() { +fn take_while_ascii_pred_simple() { let mut cur = 
Cursor::new(&b"abcdefghijk"[..]); - let mut buf: String = String::new(); + let mut buf: Vec = Vec::new(); - cur.take_while_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c') + cur.take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c') .unwrap(); - assert_eq!(buf, "abc"); + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc"); } #[test] -fn take_while_pred_immediate_stop() { +fn take_while_ascii_pred_immediate_stop() { let mut cur = Cursor::new(&b"abcdefghijk"[..]); - let mut buf: String = String::new(); + let mut buf: Vec = Vec::new(); - cur.take_while_pred(&mut buf, &|c| c == 'd').unwrap(); + cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap(); - assert_eq!(buf, ""); + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), ""); } #[test] -fn take_while_pred_entire_str() { +fn take_while_ascii_pred_entire_str() { let mut cur = Cursor::new(&b"abcdefghijk"[..]); - let mut buf: String = String::new(); + let mut buf: Vec = Vec::new(); - cur.take_while_pred(&mut buf, &|c| c.is_alphabetic()) - .unwrap(); + cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk"); +} + +#[test] +fn take_while_ascii_pred_non_ascii_stop() { + let mut cur = Cursor::new("abcde😀fghijk".as_bytes()); + + let mut buf: Vec = Vec::new(); + + cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcde"); +} + +#[test] +fn take_while_char_pred_simple() { + let mut cur = Cursor::new(&b"abcdefghijk"[..]); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|c| { + c == 'a' as u32 || c == 'b' as u32 || c == 'c' as u32 + }) + .unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc"); +} + +#[test] +fn take_while_char_pred_immediate_stop() { + let mut cur = Cursor::new(&b"abcdefghijk"[..]); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|_| false).unwrap(); + + 
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), ""); +} + +#[test] +fn take_while_char_pred_entire_str() { + let mut cur = Cursor::new(&b"abcdefghijk"[..]); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|_| true).unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk"); +} + +#[test] +fn take_while_char_pred_utf8_char() { + let mut cur = Cursor::new("abc😀defghijk".as_bytes()); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|c| { + if let Ok(c) = char::try_from(c) { + c == 'a' || c == 'b' || c == 'c' || c == '😀' + } else { + false + } + }) + .unwrap(); - assert_eq!(buf, "abcdefghijk"); + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc😀"); } #[test]