From cc473855f15ecd46a0391b6239e33b0b062880af Mon Sep 17 00:00:00 2001 From: Jevan Chan Date: Thu, 3 Dec 2020 07:46:20 -0800 Subject: [PATCH] Improve lexer by make cursor iterate over bytes (#915) --- boa/src/syntax/lexer/comment.rs | 10 +- boa/src/syntax/lexer/cursor.rs | 416 +++++++++++++++++++++-------- boa/src/syntax/lexer/identifier.rs | 18 +- boa/src/syntax/lexer/mod.rs | 179 +++++++------ boa/src/syntax/lexer/number.rs | 137 +++++----- boa/src/syntax/lexer/operator.rs | 62 ++--- boa/src/syntax/lexer/regex.rs | 67 +++-- boa/src/syntax/lexer/spread.rs | 4 +- boa/src/syntax/lexer/string.rs | 46 ++-- boa/src/syntax/lexer/template.rs | 24 +- boa/src/syntax/lexer/tests.rs | 101 +++++-- 11 files changed, 702 insertions(+), 362 deletions(-) diff --git a/boa/src/syntax/lexer/comment.rs b/boa/src/syntax/lexer/comment.rs index cedd084ca88..9f9c482d6a3 100644 --- a/boa/src/syntax/lexer/comment.rs +++ b/boa/src/syntax/lexer/comment.rs @@ -31,11 +31,11 @@ impl Tokenizer for SingleLineComment { // Skip either to the end of the line or to the end of the input while let Some(ch) = cursor.peek()? { - if ch == '\n' { + if ch == b'\n' { break; } else { // Consume char. - cursor.next_char()?.expect("Comment character vansihed"); + cursor.next_byte()?.expect("Comment character vansihed"); } } Ok(Token::new( @@ -66,10 +66,10 @@ impl Tokenizer for MultiLineComment { let mut new_line = false; loop { - if let Some(ch) = cursor.next_char()? { - if ch == '*' && cursor.next_is('/')? { + if let Some(ch) = cursor.next_byte()? { + if ch == b'*' && cursor.next_is(b'/')? { break; - } else if ch == '\n' { + } else if ch == b'\n' { new_line = true; } } else { diff --git a/boa/src/syntax/lexer/cursor.rs b/boa/src/syntax/lexer/cursor.rs index 539d7330cab..79dc6999695 100644 --- a/boa/src/syntax/lexer/cursor.rs +++ b/boa/src/syntax/lexer/cursor.rs @@ -1,5 +1,4 @@ //! Module implementing the lexer cursor. This is used for managing the input byte stream. 
- use crate::{profiler::BoaProfiler, syntax::ast::Position}; use std::io::{self, Bytes, Error, ErrorKind, Read}; @@ -57,22 +56,38 @@ where } } - /// Peeks the next character. + /// Peeks the next byte. #[inline] - pub(super) fn peek(&mut self) -> Result, Error> { + pub(super) fn peek(&mut self) -> Result, Error> { let _timer = BoaProfiler::global().start_event("cursor::peek()", "Lexing"); + self.iter.peek_byte() + } + + /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). + #[inline] + pub(super) fn peek_n(&mut self, n: u8) -> Result { + let _timer = BoaProfiler::global().start_event("cursor::peek_n()", "Lexing"); + + self.iter.peek_n_bytes(n) + } + + /// Peeks the next UTF-8 character in u32 code point. + #[inline] + pub(super) fn peek_char(&mut self) -> Result, Error> { + let _timer = BoaProfiler::global().start_event("cursor::peek_char()", "Lexing"); + self.iter.peek_char() } - /// Compares the character passed in to the next character, if they match true is returned and the buffer is incremented + /// Compares the byte passed in to the next byte, if they match true is returned and the buffer is incremented #[inline] - pub(super) fn next_is(&mut self, peek: char) -> io::Result { + pub(super) fn next_is(&mut self, byte: u8) -> io::Result { let _timer = BoaProfiler::global().start_event("cursor::next_is()", "Lexing"); Ok(match self.peek()? { - Some(next) if next == peek => { - let _ = self.iter.next_char(); + Some(next) if next == byte => { + let _ = self.next_byte()?; true } _ => false, @@ -80,34 +95,57 @@ where } /// Applies the predicate to the next character and returns the result. - /// Returns false if there is no next character. + /// Returns false if the next character is not a valid ascii or there is no next character. + /// Otherwise returns the result from the predicate on the ascii in char /// /// The buffer is not incremented. 
#[inline] - pub(super) fn next_is_pred(&mut self, pred: &F) -> io::Result + pub(super) fn next_is_ascii_pred(&mut self, pred: &F) -> io::Result where F: Fn(char) -> bool, { - let _timer = BoaProfiler::global().start_event("cursor::next_is_pred()", "Lexing"); + let _timer = BoaProfiler::global().start_event("cursor::next_is_ascii_pred()", "Lexing"); + + Ok(match self.peek()? { + Some(byte) => match byte { + 0..=0x7F => pred(char::from(byte)), + _ => false, + }, + None => false, + }) + } + + /// Applies the predicate to the next UTF-8 character and returns the result. + /// Returns false if there is no next character, otherwise returns the result from the + /// predicate on the ascii char + /// + /// The buffer is not incremented. + #[inline] + pub(super) fn next_is_char_pred(&mut self, pred: &F) -> io::Result + where + F: Fn(u32) -> bool, + { + let _timer = BoaProfiler::global().start_event("cursor::next_is_char_pred()", "Lexing"); - Ok(if let Some(peek) = self.peek()? { + Ok(if let Some(peek) = self.peek_char()? { pred(peek) } else { false }) } - /// Fills the buffer with all characters until the stop character is found. + /// Fills the buffer with all bytes until the stop byte is found. + /// Returns error when reaching the end of the buffer. /// - /// Note: It will not add the stop character to the buffer. - pub(super) fn take_until(&mut self, stop: char, buf: &mut String) -> io::Result<()> { + /// Note that all bytes up until the stop byte are added to the buffer, including the byte right before. + pub(super) fn take_until(&mut self, stop: u8, buf: &mut Vec) -> io::Result<()> { let _timer = BoaProfiler::global().start_event("cursor::take_until()", "Lexing"); loop { if self.next_is(stop)? { return Ok(()); - } else if let Some(ch) = self.next_char()? { - buf.push(ch); + } else if let Some(byte) = self.next_byte()? 
{ + buf.push(byte); } else { return Err(io::Error::new( ErrorKind::UnexpectedEof, @@ -117,21 +155,45 @@ where } } - /// Fills the buffer with characters until the first character (x) for which the predicate (pred) is false - /// (or the next character is none). + /// Fills the buffer with characters until the first ascii character for which the predicate (pred) is false. + /// It also stops when the next character is not an ascii or there is no next character. /// - /// Note that all characters up until x are added to the buffer including the character right before. - pub(super) fn take_while_pred(&mut self, buf: &mut String, pred: &F) -> io::Result<()> + /// Note that all characters up until the stop character are added to the buffer, including the character right before. + pub(super) fn take_while_ascii_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> where F: Fn(char) -> bool, { - let _timer = BoaProfiler::global().start_event("cursor::take_while_pred()", "Lexing"); + let _timer = BoaProfiler::global().start_event("cursor::take_while_ascii_pred()", "Lexing"); + + loop { + if !self.next_is_ascii_pred(pred)? { + return Ok(()); + } else if let Some(byte) = self.next_byte()? { + buf.push(byte); + } else { + // next_is_pred will return false if the next value is None so the None case should already be handled. + unreachable!(); + } + } + } + + /// Fills the buffer with characters until the first character for which the predicate (pred) is false. + /// It also stops when there is no next character. + /// + /// Note that all characters up until the stop character are added to the buffer, including the character right before. + pub(super) fn take_while_char_pred(&mut self, buf: &mut Vec, pred: &F) -> io::Result<()> + where + F: Fn(u32) -> bool, + { + let _timer = BoaProfiler::global().start_event("cursor::take_while_char_pred()", "Lexing"); loop { - if !self.next_is_pred(pred)? { + if !self.next_is_char_pred(pred)? 
{ return Ok(()); - } else if let Some(ch) = self.next_char()? { - buf.push(ch); + } else if let Some(ch) = self.peek_char()? { + for _ in 0..utf8_len(ch) { + buf.push(self.next_byte()?.unwrap()); + } } else { // next_is_pred will return false if the next value is None so the None case should already be handled. unreachable!(); @@ -139,7 +201,7 @@ where } } - /// It will fill the buffer with checked ASCII bytes. + /// It will fill the buffer with bytes. /// /// This expects for the buffer to be fully filled. If it's not, it will fail with an /// `UnexpectedEof` I/O error. @@ -150,28 +212,63 @@ where self.iter.fill_bytes(buf) } + /// Retrieves the next byte. + #[inline] + pub(crate) fn next_byte(&mut self) -> Result, Error> { + let _timer = BoaProfiler::global().start_event("cursor::next_byte()", "Lexing"); + + let byte = self.iter.next_byte()?; + + match byte { + Some(b'\r') => { + // Try to take a newline if it's next, for windows "\r\n" newlines + // Otherwise, treat as a Mac OS9 bare '\r' newline + if self.peek()? == Some(b'\n') { + let _ = self.iter.next_byte(); + } + self.next_line(); + } + Some(b'\n') => self.next_line(), + Some(0xE2) => { + // Try to match '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) + let next_bytes = self.peek_n(2)?; + if next_bytes == 0xA8_80 || next_bytes == 0xA9_80 { + self.next_line(); + } else { + // 0xE2 is a utf8 first byte + self.next_column(); + } + } + Some(b) if utf8_is_first_byte(b) => self.next_column(), + _ => {} + } + + Ok(byte) + } + /// Retrieves the next UTF-8 character. 
#[inline] - pub(crate) fn next_char(&mut self) -> Result, Error> { + pub(crate) fn next_char(&mut self) -> Result, Error> { let _timer = BoaProfiler::global().start_event("cursor::next_char()", "Lexing"); - let chr = self.iter.next_char()?; + let ch = self.iter.next_char()?; - match chr { - Some('\r') => { + match ch { + Some(0xD) => { // Try to take a newline if it's next, for windows "\r\n" newlines // Otherwise, treat as a Mac OS9 bare '\r' newline - if self.peek()? == Some('\n') { - let _ = self.iter.next_char(); + if self.peek()? == Some(0xA) { + let _ = self.iter.next_byte(); } self.next_line(); } - Some('\n') | Some('\u{2028}') | Some('\u{2029}') => self.next_line(), + // '\n' | '\u{2028}' | '\u{2029}' + Some(0xA) | Some(0x2028) | Some(0x2029) => self.next_line(), Some(_) => self.next_column(), - None => {} + _ => {} } - Ok(chr) + Ok(ch) } } @@ -179,7 +276,9 @@ where #[derive(Debug)] struct InnerIter { iter: Bytes, - peeked_char: Option>, + num_peeked_bytes: u8, + peeked_bytes: u32, + peeked_char: Option>, } impl InnerIter { @@ -188,6 +287,8 @@ impl InnerIter { fn new(iter: Bytes) -> Self { Self { iter, + num_peeked_bytes: 0, + peeked_bytes: 0, peeked_char: None, } } @@ -197,14 +298,14 @@ impl InnerIter where R: Read, { - /// It will fill the buffer with checked ASCII bytes. + /// It will fill the buffer with checked ascii bytes. /// /// This expects for the buffer to be fully filled. If it's not, it will fail with an /// `UnexpectedEof` I/O error. #[inline] fn fill_bytes(&mut self, buf: &mut [u8]) -> io::Result<()> { for byte in buf.iter_mut() { - *byte = self.next_ascii()?.ok_or_else(|| { + *byte = self.next_byte()?.ok_or_else(|| { io::Error::new( io::ErrorKind::UnexpectedEof, "unexpected EOF when filling buffer", @@ -214,90 +315,197 @@ where Ok(()) } - /// Peeks the next UTF-8 checked character. + /// Increments the iter by n bytes. 
#[inline] - pub(super) fn peek_char(&mut self) -> Result, Error> { - if let Some(v) = self.peeked_char { - Ok(v) + fn increment(&mut self, n: u32) -> Result<(), Error> { + for _ in 0..n { + if None == self.next_byte()? { + break; + } + } + Ok(()) + } + + /// Peeks the next byte. + #[inline] + pub(super) fn peek_byte(&mut self) -> Result, Error> { + if self.num_peeked_bytes > 0 { + let byte = self.peeked_bytes as u8; + Ok(Some(byte)) } else { - let chr = self.next_char()?; - self.peeked_char = Some(chr); - Ok(chr) + match self.iter.next().transpose()? { + Some(byte) => { + self.num_peeked_bytes = 1; + self.peeked_bytes = byte as u32; + Ok(Some(byte)) + } + None => Ok(None), + } } } - /// Retrieves the next UTF-8 checked character. - fn next_char(&mut self) -> io::Result> { - if let Some(v) = self.peeked_char.take() { - return Ok(v); + /// Peeks the next n bytes, the maximum number of peeked bytes is 4 (n <= 4). + #[inline] + pub(super) fn peek_n_bytes(&mut self, n: u8) -> Result { + while self.num_peeked_bytes < n && self.num_peeked_bytes < 4 { + match self.iter.next().transpose()? { + Some(byte) => { + self.peeked_bytes |= (byte as u32) << (self.num_peeked_bytes * 8); + self.num_peeked_bytes += 1; + } + None => break, + }; } - let first_byte = match self.iter.next().transpose()? { - Some(b) => b, - None => return Ok(None), - }; + match n { + 0 => Ok(0), + 1 => Ok(self.peeked_bytes & 0xFF), + 2 => Ok(self.peeked_bytes & 0xFFFF), + 3 => Ok(self.peeked_bytes & 0xFFFFFF), + _ => Ok(self.peeked_bytes), + } + } - let chr: char = if first_byte < 0x80 { - // 0b0xxx_xxxx - first_byte.into() + /// Peeks the next unchecked character in u32 code point. 
+ #[inline] + pub(super) fn peek_char(&mut self) -> Result, Error> { + if let Some(ch) = self.peeked_char { + Ok(ch) } else { - let mut buf = [first_byte, 0u8, 0u8, 0u8]; - let num_bytes = if first_byte < 0xE0 { - // 0b110x_xxxx - 2 - } else if first_byte < 0xF0 { - // 0b1110_xxxx - 3 - } else { - // 0b1111_0xxx - 4 + // Decode UTF-8 + let x = match self.peek_byte()? { + Some(b) if b < 128 => { + self.peeked_char = Some(Some(b as u32)); + return Ok(Some(b as u32)); + } + Some(b) => b, + None => { + self.peeked_char = None; + return Ok(None); + } }; - for b in buf.iter_mut().take(num_bytes).skip(1) { - let next = match self.iter.next() { - Some(Ok(b)) => b, - Some(Err(e)) => return Err(e), - None => { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "stream did not contain valid UTF-8", - )) - } - }; - - *b = next; + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + let y = (self.peek_n_bytes(2)? >> 8) as u8; + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = (self.peek_n_bytes(3)? >> 16) as u8; + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = (self.peek_n_bytes(4)? >> 24) as u8; + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + }; + + self.peeked_char = Some(Some(ch)); + Ok(Some(ch)) + } + } + + /// Retrieves the next byte + #[inline] + fn next_byte(&mut self) -> io::Result> { + self.peeked_char = None; + if self.num_peeked_bytes > 0 { + let byte = (self.peeked_bytes & 0xFF) as u8; + self.num_peeked_bytes -= 1; + self.peeked_bytes >>= 8; + Ok(Some(byte)) + } else { + self.iter.next().transpose() + } + } + + /// Retrieves the next unchecked char in u32 code point. 
+ #[inline] + fn next_char(&mut self) -> io::Result> { + if let Some(ch) = self.peeked_char.take() { + if let Some(c) = ch { + self.increment(utf8_len(c))?; } + return Ok(ch); + } - if let Ok(s) = std::str::from_utf8(&buf) { - if let Some(chr) = s.chars().next() { - chr - } else { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "stream did not contain valid UTF-8", - )); - } - } else { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "stream did not contain valid UTF-8", - )); + // Decode UTF-8 + let x = match self.next_byte()? { + Some(b) if b < 128 => return Ok(Some(b as u32)), + Some(b) => b, + None => return Ok(None), + }; + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + let y = unwrap_or_0(self.next_byte()?); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + let z = unwrap_or_0(self.next_byte()?); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + let w = unwrap_or_0(self.next_byte()?); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); } }; - Ok(Some(chr)) + Ok(Some(ch)) } +} - /// Retrieves the next ASCII checked character. - #[inline] - fn next_ascii(&mut self) -> io::Result> { - match self.next_char() { - Ok(Some(chr)) if chr.is_ascii() => Ok(Some(chr as u8)), - Ok(None) => Ok(None), - _ => Err(io::Error::new( - io::ErrorKind::InvalidData, - "non-ASCII byte found", - )), - } +/// Mask of the value bits of a continuation byte. +const CONT_MASK: u8 = 0b0011_1111; + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. 
+#[inline]
+fn utf8_first_byte(byte: u8, width: u32) -> u32 {
+    (byte & (0x7F >> width)) as u32
+}
+
+/// Returns the value of `ch` updated with continuation byte `byte`.
+#[inline]
+fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
+    (ch << 6) | (byte & CONT_MASK) as u32
+}
+
+/// Checks whether the byte is a UTF-8 first byte (i.e., ascii byte or starts with the
+/// bits `11`).
+#[inline]
+fn utf8_is_first_byte(byte: u8) -> bool {
+    byte <= 0x7F || (byte >> 6) == 0b11
+}
+
+#[inline]
+fn unwrap_or_0(opt: Option<u8>) -> u8 {
+    match opt {
+        Some(byte) => byte,
+        None => 0,
+    }
+}
+
+#[inline]
+fn utf8_len(ch: u32) -> u32 {
+    if ch <= 0x7F {
+        1
+    } else if ch <= 0x7FF {
+        2
+    } else if ch <= 0xFFFF {
+        3
+    } else {
+        4
+    }
+}
 }
diff --git a/boa/src/syntax/lexer/identifier.rs b/boa/src/syntax/lexer/identifier.rs
index 19dd6dc608d..b8d35eecee6 100644
--- a/boa/src/syntax/lexer/identifier.rs
+++ b/boa/src/syntax/lexer/identifier.rs
@@ -8,7 +8,9 @@ use crate::{
         lexer::{Token, TokenKind},
     },
 };
+use core::convert::TryFrom;
 use std::io::Read;
+use std::str;
 
 const STRICT_FORBIDDEN_IDENTIFIERS: [&str; 11] = [
     "eval",
@@ -51,13 +53,21 @@ impl Tokenizer for Identifier {
     {
         let _timer = BoaProfiler::global().start_event("Identifier", "Lexing");
 
-        let mut buf = self.init.to_string();
+        let mut init_buf = [0u8; 4];
+        let mut buf = Vec::new();
+        self.init.encode_utf8(&mut init_buf);
+        buf.extend(init_buf.iter().take(self.init.len_utf8()));
 
-        cursor.take_while_pred(&mut buf, &|c: char| {
-            c.is_alphabetic() || c.is_digit(10) || c == '_'
+        cursor.take_while_char_pred(&mut buf, &|c: u32| {
+            if let Ok(c) = char::try_from(c) {
+                c.is_alphabetic() || c.is_digit(10) || c == '_'
+            } else {
+                false
+            }
         })?;
 
-        let tk = match buf.as_str() {
+        let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
+        let tk = match token_str {
             "true" => TokenKind::BooleanLiteral(true),
             "false" => TokenKind::BooleanLiteral(false),
             "null" => TokenKind::NullLiteral,
diff --git 
a/boa/src/syntax/lexer/mod.rs b/boa/src/syntax/lexer/mod.rs index f5f356b496e..a779453f65c 100644 --- a/boa/src/syntax/lexer/mod.rs +++ b/boa/src/syntax/lexer/mod.rs @@ -42,6 +42,7 @@ use self::{ }; use crate::syntax::ast::{Punctuator, Span}; pub use crate::{profiler::BoaProfiler, syntax::ast::Position}; +use core::convert::TryFrom; pub use error::Error; use std::io::Read; pub use token::{Token, TokenKind}; @@ -69,12 +70,12 @@ impl Lexer { /// * ECMAScript standard uses `\{Space_Separator}` + `\u{0009}`, `\u{000B}`, `\u{000C}`, `\u{FEFF}` /// /// [More information](https://tc39.es/ecma262/#table-32) - fn is_whitespace(ch: char) -> bool { + fn is_whitespace(ch: u32) -> bool { matches!( ch, - '\u{0020}' | '\u{0009}' | '\u{000B}' | '\u{000C}' | '\u{00A0}' | '\u{FEFF}' | + 0x0020 | 0x0009 | 0x000B | 0x000C | 0x00A0 | 0xFEFF | // Unicode Space_Seperator category (minus \u{0020} and \u{00A0} which are allready stated above) - '\u{1680}' | '\u{2000}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}' + 0x1680 | 0x2000..=0x200A | 0x202F | 0x205F | 0x3000 ) } @@ -127,12 +128,12 @@ impl Lexer { if let Some(c) = self.cursor.peek()? { match c { - '/' => { - self.cursor.next_char()?.expect("/ token vanished"); // Consume the '/' + b'/' => { + self.cursor.next_byte()?.expect("/ token vanished"); // Consume the '/' SingleLineComment.lex(&mut self.cursor, start) } - '*' => { - self.cursor.next_char()?.expect("* token vanished"); // Consume the '*' + b'*' => { + self.cursor.next_byte()?.expect("* token vanished"); // Consume the '*' MultiLineComment.lex(&mut self.cursor, start) } ch => { @@ -140,9 +141,9 @@ impl Lexer { InputElement::Div | InputElement::TemplateTail => { // Only div punctuator allowed, regex not. - if ch == '=' { + if ch == b'=' { // Indicates this is an AssignDiv. 
- self.cursor.next_char()?.expect("= token vanished"); // Consume the '=' + self.cursor.next_byte()?.expect("= token vanished"); // Consume the '=' Ok(Token::new( Punctuator::AssignDiv.into(), Span::new(start, self.cursor.pos()), @@ -178,90 +179,104 @@ impl Lexer { { let _timer = BoaProfiler::global().start_event("next()", "Lexing"); - let (start, next_chr) = loop { + let (start, next_ch) = loop { let start = self.cursor.pos(); - if let Some(next_chr) = self.cursor.next_char()? { + if let Some(next_ch) = self.cursor.next_char()? { // Ignore whitespace - if !Self::is_whitespace(next_chr) { - break (start, next_chr); + if !Self::is_whitespace(next_ch) { + break (start, next_ch); } } else { return Ok(None); } }; - let token = match next_chr { - '\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new( - TokenKind::LineTerminator, - Span::new(start, self.cursor.pos()), - )), - '"' | '\'' => StringLiteral::new(next_chr).lex(&mut self.cursor, start), - '`' => TemplateLiteral.lex(&mut self.cursor, start), - _ if next_chr.is_digit(10) => NumberLiteral::new(next_chr).lex(&mut self.cursor, start), - _ if next_chr.is_alphabetic() || next_chr == '$' || next_chr == '_' => { - Identifier::new(next_chr).lex(&mut self.cursor, start) - } - ';' => Ok(Token::new( - Punctuator::Semicolon.into(), - Span::new(start, self.cursor.pos()), - )), - ':' => Ok(Token::new( - Punctuator::Colon.into(), - Span::new(start, self.cursor.pos()), - )), - '.' 
=> SpreadLiteral::new().lex(&mut self.cursor, start), - '(' => Ok(Token::new( - Punctuator::OpenParen.into(), - Span::new(start, self.cursor.pos()), - )), - ')' => Ok(Token::new( - Punctuator::CloseParen.into(), - Span::new(start, self.cursor.pos()), - )), - ',' => Ok(Token::new( - Punctuator::Comma.into(), - Span::new(start, self.cursor.pos()), - )), - '{' => Ok(Token::new( - Punctuator::OpenBlock.into(), - Span::new(start, self.cursor.pos()), - )), - '}' => Ok(Token::new( - Punctuator::CloseBlock.into(), - Span::new(start, self.cursor.pos()), - )), - '[' => Ok(Token::new( - Punctuator::OpenBracket.into(), - Span::new(start, self.cursor.pos()), - )), - ']' => Ok(Token::new( - Punctuator::CloseBracket.into(), - Span::new(start, self.cursor.pos()), - )), - '?' => Ok(Token::new( - Punctuator::Question.into(), - Span::new(start, self.cursor.pos()), - )), - '/' => self.lex_slash_token(start), - '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => { - Operator::new(next_chr).lex(&mut self.cursor, start) + if let Ok(c) = char::try_from(next_ch) { + let token = match c { + '\r' | '\n' | '\u{2028}' | '\u{2029}' => Ok(Token::new( + TokenKind::LineTerminator, + Span::new(start, self.cursor.pos()), + )), + '"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start), + '`' => TemplateLiteral.lex(&mut self.cursor, start), + _ if c.is_digit(10) => { + NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start) + } + _ if c.is_alphabetic() || c == '$' || c == '_' => { + Identifier::new(c).lex(&mut self.cursor, start) + } + ';' => Ok(Token::new( + Punctuator::Semicolon.into(), + Span::new(start, self.cursor.pos()), + )), + ':' => Ok(Token::new( + Punctuator::Colon.into(), + Span::new(start, self.cursor.pos()), + )), + '.' 
=> SpreadLiteral::new().lex(&mut self.cursor, start), + '(' => Ok(Token::new( + Punctuator::OpenParen.into(), + Span::new(start, self.cursor.pos()), + )), + ')' => Ok(Token::new( + Punctuator::CloseParen.into(), + Span::new(start, self.cursor.pos()), + )), + ',' => Ok(Token::new( + Punctuator::Comma.into(), + Span::new(start, self.cursor.pos()), + )), + '{' => Ok(Token::new( + Punctuator::OpenBlock.into(), + Span::new(start, self.cursor.pos()), + )), + '}' => Ok(Token::new( + Punctuator::CloseBlock.into(), + Span::new(start, self.cursor.pos()), + )), + '[' => Ok(Token::new( + Punctuator::OpenBracket.into(), + Span::new(start, self.cursor.pos()), + )), + ']' => Ok(Token::new( + Punctuator::CloseBracket.into(), + Span::new(start, self.cursor.pos()), + )), + '?' => Ok(Token::new( + Punctuator::Question.into(), + Span::new(start, self.cursor.pos()), + )), + '/' => self.lex_slash_token(start), + '=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' => { + Operator::new(next_ch as u8).lex(&mut self.cursor, start) + } + _ => { + let details = format!( + "unexpected '{}' at line {}, column {}", + c, + start.line_number(), + start.column_number() + ); + Err(Error::syntax(details, start)) + } + }?; + + if token.kind() == &TokenKind::Comment { + // Skip comment + self.next() + } else { + Ok(Some(token)) } - _ => { - let details = format!( - "unexpected '{}' at line {}, column {}", - next_chr, + } else { + Err(Error::syntax( + format!( + "unexpected utf-8 char '\\u{}' at line {}, column {}", + next_ch, start.line_number(), start.column_number() - ); - Err(Error::syntax(details, start)) - } - }?; - - if token.kind() == &TokenKind::Comment { - // Skip comment - self.next() - } else { - Ok(Some(token)) + ), + start, + )) } } } diff --git a/boa/src/syntax/lexer/number.rs b/boa/src/syntax/lexer/number.rs index deb4db05162..4e5c97d3416 100644 --- a/boa/src/syntax/lexer/number.rs +++ b/boa/src/syntax/lexer/number.rs @@ -9,6 +9,7 @@ use crate::{ lexer::{token::Numeric, 
Token}, }, }; +use std::str; use std::{io::Read, str::FromStr}; /// Number literal lexing. @@ -23,12 +24,12 @@ use std::{io::Read, str::FromStr}; /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Data_structures#Number_type #[derive(Debug, Clone, Copy)] pub(super) struct NumberLiteral { - init: char, + init: u8, } impl NumberLiteral { /// Creates a new string literal lexer. - pub(super) fn new(init: char) -> Self { + pub(super) fn new(init: u8) -> Self { Self { init } } } @@ -63,8 +64,9 @@ impl NumericKind { } } +#[inline] fn take_signed_integer( - buf: &mut String, + buf: &mut Vec, cursor: &mut Cursor, kind: &NumericKind, ) -> Result<(), Error> @@ -73,30 +75,31 @@ where { // The next part must be SignedInteger. // This is optionally a '+' or '-' followed by 1 or more DecimalDigits. - match cursor.next_char()? { - Some('+') => { - buf.push('+'); - if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? { + match cursor.next_byte()? { + Some(b'+') => { + buf.push(b'+'); + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? { // A digit must follow the + or - symbol. return Err(Error::syntax("No digit found after + symbol", cursor.pos())); } } - Some('-') => { - buf.push('-'); - if !cursor.next_is_pred(&|c: char| c.is_digit(kind.base()))? { + Some(b'-') => { + buf.push(b'-'); + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(kind.base()))? { // A digit must follow the + or - symbol. 
return Err(Error::syntax("No digit found after - symbol", cursor.pos())); } } - Some(c) if c.is_digit(kind.base()) => buf.push(c), - Some(c) => { - return Err(Error::syntax( - format!( - "When lexing exponential value found unexpected char: '{}'", - c - ), - cursor.pos(), - )); + Some(byte) => { + let ch = char::from(byte); + if ch.is_ascii() && ch.is_digit(kind.base()) { + buf.push(byte); + } else { + return Err(Error::syntax( + "When lexing exponential value found unexpected char", + cursor.pos(), + )); + } } None => { return Err(Error::syntax( @@ -107,7 +110,7 @@ where } // Consume the decimal digits. - cursor.take_while_pred(buf, &|c: char| c.is_digit(kind.base()))?; + cursor.take_while_ascii_pred(buf, &|ch| ch.is_digit(kind.base()))?; Ok(()) } @@ -118,12 +121,12 @@ where /// - [ECMAScript Specification][spec] /// /// [spec]: https://tc39.es/ecma262/#sec-literals-numeric-literals +#[inline] fn check_after_numeric_literal(cursor: &mut Cursor) -> Result<(), Error> where R: Read, { - let pred = |ch: char| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_'; - if cursor.next_is_pred(&pred)? { + if cursor.next_is_ascii_pred(&|ch| ch.is_ascii_alphanumeric() || ch == '$' || ch == '_')? { Err(Error::syntax( "a numeric literal must not be followed by an alphanumeric, $ or _ characters", cursor.pos(), @@ -140,17 +143,17 @@ impl Tokenizer for NumberLiteral { { let _timer = BoaProfiler::global().start_event("NumberLiteral", "Lexing"); - let mut buf = self.init.to_string(); + let mut buf = vec![self.init]; // Default assume the number is a base 10 integer. let mut kind = NumericKind::Integer(10); let c = cursor.peek(); - if self.init == '0' { + if self.init == b'0' { if let Some(ch) = c? { match ch { - 'x' | 'X' => { + b'x' | b'X' => { // Remove the initial '0' from buffer. 
cursor.next_char()?.expect("x or X character vanished"); buf.pop(); @@ -159,16 +162,14 @@ impl Tokenizer for NumberLiteral { kind = NumericKind::Integer(16); // Checks if the next char after '0x' is a digit of that base. if not return an error. - if let Some(digit) = cursor.peek()? { - if !digit.is_digit(16) { - return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", - cursor.pos(), - )); - } + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(16))? { + return Err(Error::syntax( + "expected hexadecimal digit after number base prefix", + cursor.pos(), + )); } } - 'o' | 'O' => { + b'o' | b'O' => { // Remove the initial '0' from buffer. cursor.next_char()?.expect("o or O character vanished"); buf.pop(); @@ -177,16 +178,14 @@ impl Tokenizer for NumberLiteral { kind = NumericKind::Integer(8); // Checks if the next char after '0o' is a digit of that base. if not return an error. - if let Some(digit) = cursor.peek()? { - if !digit.is_digit(8) { - return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", - cursor.pos(), - )); - } + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(8))? { + return Err(Error::syntax( + "expected hexadecimal digit after number base prefix", + cursor.pos(), + )); } } - 'b' | 'B' => { + b'b' | b'B' => { // Remove the initial '0' from buffer. cursor.next_char()?.expect("b or B character vanished"); buf.pop(); @@ -195,16 +194,14 @@ impl Tokenizer for NumberLiteral { kind = NumericKind::Integer(2); // Checks if the next char after '0b' is a digit of that base. if not return an error. - if let Some(digit) = cursor.peek()? { - if !digit.is_digit(2) { - return Err(Error::syntax( - "expected hexadecimal digit after number base prefix", - cursor.pos(), - )); - } + if !cursor.next_is_ascii_pred(&|ch| ch.is_digit(2))? 
{ + return Err(Error::syntax( + "expected hexadecimal digit after number base prefix", + cursor.pos(), + )); } } - 'n' => { + b'n' => { cursor.next_char()?.expect("n character vanished"); // DecimalBigIntegerLiteral '0n' @@ -213,7 +210,8 @@ impl Tokenizer for NumberLiteral { Span::new(start_pos, cursor.pos()), )); } - ch => { + byte => { + let ch = char::from(byte); if ch.is_digit(8) { // LegacyOctalIntegerLiteral if cursor.strict_mode() { @@ -226,7 +224,7 @@ impl Tokenizer for NumberLiteral { // Remove the initial '0' from buffer. buf.pop(); - buf.push(cursor.next_char()?.expect("'0' character vanished")); + buf.push(cursor.next_byte()?.expect("'0' character vanished")); kind = NumericKind::Integer(8); } @@ -240,7 +238,7 @@ impl Tokenizer for NumberLiteral { start_pos, )); } else { - buf.push(cursor.next_char()?.expect("Number digit vanished")); + buf.push(cursor.next_byte()?.expect("Number digit vanished")); } } // Else indicates that the symbol is a non-number. } @@ -256,42 +254,42 @@ impl Tokenizer for NumberLiteral { } // Consume digits until a non-digit character is encountered or all the characters are consumed. - cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; + cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; // The non-digit character could be: // 'n' To indicate a BigIntLiteralSuffix. // '.' To indicate a decimal seperator. // 'e' | 'E' To indicate an ExponentPart. match cursor.peek()? { - Some('n') => { + Some(b'n') => { // DecimalBigIntegerLiteral // Lexing finished. // Consume the n - cursor.next_char()?.expect("n character vanished"); + cursor.next_byte()?.expect("n character vanished"); kind = kind.to_bigint(); } - Some('.') => { + Some(b'.') => { if kind.base() == 10 { // Only base 10 numbers can have a decimal seperator. // Number literal lexing finished if a . is found for a number in a different base. - cursor.next_char()?.expect(". token vanished"); - buf.push('.'); // Consume the . 
+ cursor.next_byte()?.expect(". token vanished"); + buf.push(b'.'); // Consume the . kind = NumericKind::Rational; // Consume digits until a non-digit character is encountered or all the characters are consumed. - cursor.take_while_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; + cursor.take_while_ascii_pred(&mut buf, &|c: char| c.is_digit(kind.base()))?; // The non-digit character at this point must be an 'e' or 'E' to indicate an Exponent Part. // Another '.' or 'n' is not allowed. match cursor.peek()? { - Some('e') | Some('E') => { + Some(b'e') | Some(b'E') => { // Consume the ExponentIndicator. - cursor.next_char()?.expect("e or E token vanished"); + cursor.next_byte()?.expect("e or E token vanished"); - buf.push('E'); + buf.push(b'E'); take_signed_integer(&mut buf, cursor, &kind)?; } @@ -301,10 +299,10 @@ impl Tokenizer for NumberLiteral { } } } - Some('e') | Some('E') => { + Some(b'e') | Some(b'E') => { kind = NumericKind::Rational; - cursor.next_char()?.expect("e or E character vanished"); // Consume the ExponentIndicator. - buf.push('E'); + cursor.next_byte()?.expect("e or E character vanished"); // Consume the ExponentIndicator. 
+ buf.push(b'E'); take_signed_integer(&mut buf, cursor, &kind)?; } Some(_) | None => { @@ -314,14 +312,15 @@ impl Tokenizer for NumberLiteral { check_after_numeric_literal(cursor)?; + let num_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) }; let num = match kind { NumericKind::BigInt(base) => { Numeric::BigInt( - BigInt::from_string_radix(&buf, base).expect("Could not convert to BigInt") + BigInt::from_string_radix(num_str, base).expect("Could not convert to BigInt") ) } NumericKind::Rational /* base: 10 */ => { - let val = f64::from_str(&buf).expect("Failed to parse float after checks"); + let val = f64::from_str(num_str).expect("Failed to parse float after checks"); let int_val = val as i32; // The truncated float should be identically to the non-truncated float for the conversion to be loss-less, @@ -335,12 +334,12 @@ impl Tokenizer for NumberLiteral { } }, NumericKind::Integer(base) => { - if let Ok(num) = i32::from_str_radix(&buf, base) { + if let Ok(num) = i32::from_str_radix(num_str, base) { Numeric::Integer(num) } else { let b = f64::from(base); let mut result = 0.0_f64; - for c in buf.chars() { + for c in num_str.chars() { let digit = f64::from(c.to_digit(base).expect("could not parse digit after already checking validity")); result = result * b + digit; } diff --git a/boa/src/syntax/lexer/operator.rs b/boa/src/syntax/lexer/operator.rs index 5aa72c7d559..11971d384c4 100644 --- a/boa/src/syntax/lexer/operator.rs +++ b/boa/src/syntax/lexer/operator.rs @@ -17,8 +17,8 @@ macro_rules! vop { ($cursor:ident, $assign_op:expr, $op:expr) => ({ match $cursor.peek()? { None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())), - Some('=') => { - $cursor.next_char()?.expect("= token vanished"); + Some(b'=') => { + $cursor.next_byte()?.expect("= token vanished"); $cursor.next_column(); $assign_op } @@ -28,13 +28,13 @@ macro_rules! 
vop { ($cursor:ident, $assign_op:expr, $op:expr, {$($case:pat => $block:expr), +}) => ({ match $cursor.peek()? { None => Err(Error::syntax("abrupt end - could not preview next value as part of the operator", $cursor.pos())), - Some('=') => { - $cursor.next_char()?.expect("= token vanished"); + Some(b'=') => { + $cursor.next_byte()?.expect("= token vanished"); $cursor.next_column(); $assign_op }, $($case => { - $cursor.next_char()?.expect("Token vanished"); + $cursor.next_byte()?.expect("Token vanished"); $cursor.next_column(); $block })+, @@ -44,7 +44,7 @@ macro_rules! vop { ($cursor:ident, $op:expr, {$($case:pat => $block:expr),+}) => { match $cursor.peek().ok_or_else(|| Error::syntax("could not preview next value", $cursor.pos()))? { $($case => { - $cursor.next_char()?; + $cursor.next_byte()?; $cursor.next_column(); $block })+, @@ -72,7 +72,7 @@ macro_rules! op { #[derive(Debug, Clone, Copy)] pub(super) struct Operator { - init: char, + init: u8, } /// Operator lexing. @@ -87,7 +87,7 @@ pub(super) struct Operator { /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators impl Operator { /// Creates a new operator lexer. 
- pub(super) fn new(init: char) -> Self { + pub(super) fn new(init: u8) -> Self { Self { init } } } @@ -100,61 +100,63 @@ impl Tokenizer for Operator { let _timer = BoaProfiler::global().start_event("Operator", "Lexing"); match self.init { - '*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), { - Some('*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp)) + b'*' => op!(cursor, start_pos, Ok(Punctuator::AssignMul), Ok(Punctuator::Mul), { + Some(b'*') => vop!(cursor, Ok(Punctuator::AssignPow), Ok(Punctuator::Exp)) }), - '+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), { - Some('+') => Ok(Punctuator::Inc) + b'+' => op!(cursor, start_pos, Ok(Punctuator::AssignAdd), Ok(Punctuator::Add), { + Some(b'+') => Ok(Punctuator::Inc) }), - '-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), { - Some('-') => { + b'-' => op!(cursor, start_pos, Ok(Punctuator::AssignSub), Ok(Punctuator::Sub), { + Some(b'-') => { Ok(Punctuator::Dec) } }), - '%' => op!( + b'%' => op!( cursor, start_pos, Ok(Punctuator::AssignMod), Ok(Punctuator::Mod) ), - '|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), { - Some('|') => Ok(Punctuator::BoolOr) + b'|' => op!(cursor, start_pos, Ok(Punctuator::AssignOr), Ok(Punctuator::Or), { + Some(b'|') => Ok(Punctuator::BoolOr) }), - '&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), { - Some('&') => Ok(Punctuator::BoolAnd) + b'&' => op!(cursor, start_pos, Ok(Punctuator::AssignAnd), Ok(Punctuator::And), { + Some(b'&') => Ok(Punctuator::BoolAnd) }), - '^' => op!( + b'^' => op!( cursor, start_pos, Ok(Punctuator::AssignXor), Ok(Punctuator::Xor) ), - '=' => op!(cursor, start_pos, if cursor.next_is('=')? { + b'=' => op!(cursor, start_pos, if cursor.next_is(b'=')? 
{ Ok(Punctuator::StrictEq) } else { Ok(Punctuator::Eq) }, Ok(Punctuator::Assign), { - Some('>') => { + Some(b'>') => { Ok(Punctuator::Arrow) } }), - '<' => op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), { - Some('<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh)) - }), - '>' => { + b'<' => { + op!(cursor, start_pos, Ok(Punctuator::LessThanOrEq), Ok(Punctuator::LessThan), { + Some(b'<') => vop!(cursor, Ok(Punctuator::AssignLeftSh), Ok(Punctuator::LeftSh)) + }) + } + b'>' => { op!(cursor, start_pos, Ok(Punctuator::GreaterThanOrEq), Ok(Punctuator::GreaterThan), { - Some('>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), { - Some('>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh)) + Some(b'>') => vop!(cursor, Ok(Punctuator::AssignRightSh), Ok(Punctuator::RightSh), { + Some(b'>') => vop!(cursor, Ok(Punctuator::AssignURightSh), Ok(Punctuator::URightSh)) }) }) } - '!' => op!( + b'!' => op!( cursor, start_pos, vop!(cursor, Ok(Punctuator::StrictNotEq), Ok(Punctuator::NotEq)), Ok(Punctuator::Not) ), - '~' => Ok(Token::new( + b'~' => Ok(Token::new( Punctuator::Neg.into(), Span::new(start_pos, cursor.pos()), )), diff --git a/boa/src/syntax/lexer/regex.rs b/boa/src/syntax/lexer/regex.rs index 2367c44d70f..9b3996c5313 100644 --- a/boa/src/syntax/lexer/regex.rs +++ b/boa/src/syntax/lexer/regex.rs @@ -9,6 +9,8 @@ use crate::{ }, }; use bitflags::bitflags; +use std::io::{self, ErrorKind}; +use std::str; use std::{ fmt::{self, Display, Formatter}, io::Read, @@ -39,11 +41,11 @@ impl Tokenizer for RegexLiteral { { let _timer = BoaProfiler::global().start_event("RegexLiteral", "Lexing"); - let mut body = String::new(); + let mut body = Vec::new(); // Lex RegularExpressionBody. loop { - match cursor.next_char()? { + match cursor.next_byte()? { None => { // Abrupt end. 
return Err(Error::syntax( @@ -51,29 +53,45 @@ impl Tokenizer for RegexLiteral { cursor.pos(), )); } - Some(c) => { - match c { - '/' => break, // RegularExpressionBody finished. - '\n' | '\r' | '\u{2028}' | '\u{2029}' => { + Some(b) => { + match b { + b'/' => break, // RegularExpressionBody finished. + b'\n' | b'\r' => { // Not allowed in Regex literal. return Err(Error::syntax( "new lines are not allowed in regular expressions", cursor.pos(), )); } - '\\' => { + 0xE2 if (cursor.peek_n(2)? == 0xA8_80 || cursor.peek_n(2)? == 0xA9_80) => { + // '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed + return Err(Error::syntax( + "new lines are not allowed in regular expressions", + cursor.pos(), + )); + } + b'\\' => { // Escape sequence - body.push('\\'); - if let Some(sc) = cursor.next_char()? { + body.push(b'\\'); + if let Some(sc) = cursor.next_byte()? { match sc { - '\n' | '\r' | '\u{2028}' | '\u{2029}' => { + b'\n' | b'\r' => { // Not allowed in Regex literal. return Err(Error::syntax( "new lines are not allowed in regular expressions", cursor.pos(), )); } - ch => body.push(ch), + 0xE2 if (cursor.peek_n(2)? == 0xA8_80 + || cursor.peek_n(2)? == 0xA9_80) => + { + // '\u{2028}' (e2 80 a8) and '\u{2029}' (e2 80 a9) are not allowed + return Err(Error::syntax( + "new lines are not allowed in regular expressions", + cursor.pos(), + )); + } + b => body.push(b), } } else { // Abrupt end of regex. 
@@ -83,20 +101,31 @@ impl Tokenizer for RegexLiteral { )); } } - _ => body.push(c), + _ => body.push(b), } } } } - let mut flags = String::new(); + let mut flags = Vec::new(); let flags_start = cursor.pos(); - cursor.take_while_pred(&mut flags, &char::is_alphabetic)?; - - Ok(Token::new( - TokenKind::regular_expression_literal(body, parse_regex_flags(&flags, flags_start)?), - Span::new(start_pos, cursor.pos()), - )) + cursor.take_while_ascii_pred(&mut flags, &|c: char| c.is_alphabetic())?; + + let flags_str = unsafe { str::from_utf8_unchecked(flags.as_slice()) }; + if let Ok(body_str) = str::from_utf8(body.as_slice()) { + Ok(Token::new( + TokenKind::regular_expression_literal( + body_str, + parse_regex_flags(flags_str, flags_start)?, + ), + Span::new(start_pos, cursor.pos()), + )) + } else { + Err(Error::from(io::Error::new( + ErrorKind::InvalidData, + "Invalid UTF-8 character in regular expressions", + ))) + } } } diff --git a/boa/src/syntax/lexer/spread.rs b/boa/src/syntax/lexer/spread.rs index cc8e0ad36f9..422ed97ab9e 100644 --- a/boa/src/syntax/lexer/spread.rs +++ b/boa/src/syntax/lexer/spread.rs @@ -38,8 +38,8 @@ impl Tokenizer for SpreadLiteral { let _timer = BoaProfiler::global().start_event("SpreadLiteral", "Lexing"); // . or ... - if cursor.next_is('.')? { - if cursor.next_is('.')? { + if cursor.next_is(b'.')? { + if cursor.next_is(b'.')? 
{ Ok(Token::new( Punctuator::Spread.into(), Span::new(start_pos, cursor.pos()), diff --git a/boa/src/syntax/lexer/string.rs b/boa/src/syntax/lexer/string.rs index 4cbcbbae6e5..ad85c40e10c 100644 --- a/boa/src/syntax/lexer/string.rs +++ b/boa/src/syntax/lexer/string.rs @@ -8,6 +8,7 @@ use crate::{ lexer::{Token, TokenKind}, }, }; +use core::convert::TryFrom; use std::{ io::{self, ErrorKind, Read}, str, @@ -58,12 +59,13 @@ impl Tokenizer for StringLiteral { let mut buf: Vec = Vec::new(); loop { let next_chr_start = cursor.pos(); - let next_chr = cursor.next_char()?.ok_or_else(|| { + let next_chr = char::try_from(cursor.next_char()?.ok_or_else(|| { Error::from(io::Error::new( ErrorKind::UnexpectedEof, "unterminated string literal", )) - })?; + })?) + .unwrap(); match next_chr { '\'' if self.terminator == StringTerminator::SingleQuote => { @@ -76,22 +78,22 @@ impl Tokenizer for StringLiteral { let _timer = BoaProfiler::global() .start_event("StringLiteral - escape sequence", "Lexing"); - let escape = cursor.next_char()?.ok_or_else(|| { + let escape = cursor.next_byte()?.ok_or_else(|| { Error::from(io::Error::new( ErrorKind::UnexpectedEof, "unterminated escape sequence in string literal", )) })?; - if escape != '\n' { + if escape != b'\n' { match escape { - 'n' => buf.push('\n' as u16), - 'r' => buf.push('\r' as u16), - 't' => buf.push('\t' as u16), - 'b' => buf.push('\x08' as u16), - 'f' => buf.push('\x0c' as u16), - '0' => buf.push('\0' as u16), - 'x' => { + b'n' => buf.push('\n' as u16), + b'r' => buf.push('\r' as u16), + b't' => buf.push('\t' as u16), + b'b' => buf.push('\x08' as u16), + b'f' => buf.push('\x0c' as u16), + b'0' => buf.push('\0' as u16), + b'x' => { let mut code_point_utf8_bytes = [0u8; 2]; cursor.fill_bytes(&mut code_point_utf8_bytes)?; let code_point_str = str::from_utf8(&code_point_utf8_bytes) @@ -106,17 +108,20 @@ impl Tokenizer for StringLiteral { buf.push(code_point); } - 'u' => { + b'u' => { // Support \u{X..X} (Unicode Codepoint) - if 
cursor.next_is('{')? { - cursor.next_char()?.expect("{ character vanished"); // Consume the '{'. + if cursor.next_is(b'{')? { + cursor.next_byte()?.expect("{ character vanished"); // Consume the '{'. // TODO: use bytes for a bit better performance (using stack) - let mut code_point_str = String::with_capacity(6); - cursor.take_until('}', &mut code_point_str)?; + let mut code_point_buf = Vec::with_capacity(6); + cursor.take_until(b'}', &mut code_point_buf)?; - cursor.next_char()?.expect("} character vanished"); // Consume the '}'. + cursor.next_byte()?.expect("} character vanished"); // Consume the '}'. + let code_point_str = unsafe { + str::from_utf8_unchecked(code_point_buf.as_slice()) + }; // We know this is a single unicode codepoint, convert to u32 let code_point = u32::from_str_radix(&code_point_str, 16) .map_err(|_| { @@ -156,13 +161,12 @@ impl Tokenizer for StringLiteral { buf.push(code_point); } } - '\'' | '"' | '\\' => buf.push(escape as u16), - ch => { + b'\'' | b'"' | b'\\' => buf.push(escape as u16), + _ => { let details = format!( - "invalid escape sequence `{}` at line {}, column {}", + "invalid escape sequence at line {}, column {}", next_chr_start.line_number(), next_chr_start.column_number(), - ch ); return Err(Error::syntax(details, cursor.pos())); } diff --git a/boa/src/syntax/lexer/template.rs b/boa/src/syntax/lexer/template.rs index c51763c7f33..28bccf5e1cb 100644 --- a/boa/src/syntax/lexer/template.rs +++ b/boa/src/syntax/lexer/template.rs @@ -9,6 +9,7 @@ use crate::{ }, }; use std::io::{self, ErrorKind, Read}; +use std::str; /// Template literal lexing. /// @@ -30,23 +31,30 @@ impl Tokenizer for TemplateLiteral { { let _timer = BoaProfiler::global().start_event("TemplateLiteral", "Lexing"); - let mut buf = String::new(); + let mut buf = Vec::new(); loop { - match cursor.next_char()? { + match cursor.next_byte()? 
{ None => { return Err(Error::from(io::Error::new( ErrorKind::UnexpectedEof, "Unterminated template literal", ))); } - Some('`') => break, // Template literal finished. - Some(next_ch) => buf.push(next_ch), // TODO when there is an expression inside the literal + Some(b'`') => break, // Template literal finished. + Some(next_byte) => buf.push(next_byte), // TODO when there is an expression inside the literal } } - Ok(Token::new( - TokenKind::template_literal(buf), - Span::new(start_pos, cursor.pos()), - )) + if let Ok(s) = str::from_utf8(buf.as_slice()) { + Ok(Token::new( + TokenKind::template_literal(s), + Span::new(start_pos, cursor.pos()), + )) + } else { + Err(Error::from(io::Error::new( + ErrorKind::InvalidData, + "Invalid UTF-8 character in template literal", + ))) + } } } diff --git a/boa/src/syntax/lexer/tests.rs b/boa/src/syntax/lexer/tests.rs index b78f5d77c7f..4dd22e194cf 100644 --- a/boa/src/syntax/lexer/tests.rs +++ b/boa/src/syntax/lexer/tests.rs @@ -6,6 +6,7 @@ use super::token::Numeric; use super::*; use super::{Error, Position}; use crate::syntax::ast::Keyword; +use std::str; fn span(start: (u32, u32), end: (u32, u32)) -> Span { Span::new(Position::new(start.0, start.1), Position::new(end.0, end.1)) @@ -280,19 +281,19 @@ fn check_positions_codepoint() { // String token starts on column 13 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 13), (1, 34)) + span((1, 13), (1, 36)) ); - // Close parenthesis token starts on column 34 + // Close parenthesis token starts on column 36 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 34), (1, 35)) + span((1, 36), (1, 37)) ); - // Semi Colon token starts on column 35 + // Semi Colon token starts on column 37 assert_eq!( lexer.next().unwrap().unwrap().span(), - span((1, 35), (1, 36)) + span((1, 37), (1, 38)) ); } @@ -554,38 +555,102 @@ fn addition_no_spaces_e_number() { } #[test] -fn take_while_pred_simple() { +fn take_while_ascii_pred_simple() { let mut cur = 
Cursor::new(&b"abcdefghijk"[..]); - let mut buf: String = String::new(); + let mut buf: Vec = Vec::new(); - cur.take_while_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c') + cur.take_while_ascii_pred(&mut buf, &|c| c == 'a' || c == 'b' || c == 'c') .unwrap(); - assert_eq!(buf, "abc"); + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc"); } #[test] -fn take_while_pred_immediate_stop() { +fn take_while_ascii_pred_immediate_stop() { let mut cur = Cursor::new(&b"abcdefghijk"[..]); - let mut buf: String = String::new(); + let mut buf: Vec = Vec::new(); - cur.take_while_pred(&mut buf, &|c| c == 'd').unwrap(); + cur.take_while_ascii_pred(&mut buf, &|_| false).unwrap(); - assert_eq!(buf, ""); + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), ""); } #[test] -fn take_while_pred_entire_str() { +fn take_while_ascii_pred_entire_str() { let mut cur = Cursor::new(&b"abcdefghijk"[..]); - let mut buf: String = String::new(); + let mut buf: Vec = Vec::new(); - cur.take_while_pred(&mut buf, &|c| c.is_alphabetic()) - .unwrap(); + cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk"); +} + +#[test] +fn take_while_ascii_pred_non_ascii_stop() { + let mut cur = Cursor::new("abcde😀fghijk".as_bytes()); + + let mut buf: Vec = Vec::new(); + + cur.take_while_ascii_pred(&mut buf, &|_| true).unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcde"); +} + +#[test] +fn take_while_char_pred_simple() { + let mut cur = Cursor::new(&b"abcdefghijk"[..]); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|c| { + c == 'a' as u32 || c == 'b' as u32 || c == 'c' as u32 + }) + .unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc"); +} + +#[test] +fn take_while_char_pred_immediate_stop() { + let mut cur = Cursor::new(&b"abcdefghijk"[..]); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|_| false).unwrap(); + + 
assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), ""); +} + +#[test] +fn take_while_char_pred_entire_str() { + let mut cur = Cursor::new(&b"abcdefghijk"[..]); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|_| true).unwrap(); + + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abcdefghijk"); +} + +#[test] +fn take_while_char_pred_utf8_char() { + let mut cur = Cursor::new("abc😀defghijk".as_bytes()); + + let mut buf: Vec = Vec::new(); + + cur.take_while_char_pred(&mut buf, &|c| { + if let Ok(c) = char::try_from(c) { + c == 'a' || c == 'b' || c == 'c' || c == '😀' + } else { + false + } + }) + .unwrap(); - assert_eq!(buf, "abcdefghijk"); + assert_eq!(str::from_utf8(buf.as_slice()).unwrap(), "abc😀"); } #[test]