From 585f34773d981feb24c761070bfb391ce2bc9579 Mon Sep 17 00:00:00 2001
From: John DiSanti <jdisanti@amazon.com>
Date: Wed, 2 Jun 2021 13:53:01 -0700
Subject: [PATCH 1/3] Implement JSON token stream deserializer

---
 rust-runtime/smithy-json/src/deserialize.rs | 850 ++++++++++++++++++++
 rust-runtime/smithy-json/src/lib.rs         |   1 +
 2 files changed, 851 insertions(+)
 create mode 100644 rust-runtime/smithy-json/src/deserialize.rs
diff --git a/rust-runtime/smithy-json/src/deserialize.rs b/rust-runtime/smithy-json/src/deserialize.rs
new file mode 100644
index 0000000000..9cfa1ef73a
--- /dev/null
+++ b/rust-runtime/smithy-json/src/deserialize.rs
@@ -0,0 +1,850 @@
+/*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0.
+ */
+
+use smithy_types::Number;
+use std::fmt;
+use std::str::Utf8Error;
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum ErrorReason {
+    InvalidUtf8,
+    InvalidUnicodeEscape(String),
+    InvalidEscape(char),
+    InvalidNumber,
+    ExpectedLiteral(String),
+    UnexpectedControlCharacter(u8),
+    UnexpectedToken(char, &'static str),
+    UnexpectedEOS,
+}
+use ErrorReason::*;
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Error {
+    reason: ErrorReason,
+    offset: usize,
+}
+
+impl std::error::Error for Error {}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "Error at offset {}: ", self.offset)?;
+        match &self.reason {
+            InvalidUtf8 => write!(f, "invalid UTF-8 codepoint in JSON stream"),
+            InvalidUnicodeEscape(escape) => write!(f, "invalid JSON Unicode escape: \\u{}", escape),
+            InvalidEscape(escape) => write!(f, "invalid JSON escape: \\{}", escape),
+            InvalidNumber => write!(f, "invalid number"),
+            ExpectedLiteral(literal) => write!(f, "expected literal: {}", literal),
+            UnexpectedControlCharacter(value) => write!(
+                f,
+                "encountered unescaped control character in string: 0x{:X}",
+                value
+            ),
+            UnexpectedToken(token, expected) => write!(
+                f,
+                "unexpected token '{}'. Expected one of {}",
+                token, expected
+            ),
+            UnexpectedEOS => write!(f, "unexpected end of stream"),
+        }
+    }
+}
+
+impl From<Utf8Error> for ErrorReason {
+    fn from(_: Utf8Error) -> Self {
+        InvalidUtf8
+    }
+}
+
+/// Enum representing the different JSON tokens that can be returned by [json_token_iter].
+#[derive(Debug, PartialEq)]
+pub enum Token {
+    StartArray,
+    EndArray,
+    ObjectKey(String),
+    StartObject,
+    EndObject,
+    ValueBool(bool),
+    ValueNull,
+    ValueNumber(Number),
+    ValueString(String),
+}
+
+/// Returns an Iterator of `Result<Token, Error>` over an slice of bytes.
+pub fn json_token_iter(input: &[u8]) -> JsonTokenIterator {
+    JsonTokenIterator {
+        input,
+        index: 0,
+        state_stack: vec![State::Initial],
+    }
+}
+
+/// Internal parser state for the iterator. Used to keep context between successive `next` calls.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+enum State {
+    /// Entry point. Expecting any JSON value.
+    Initial,
+    /// Expecting the next token to be the *first* value in an array, or the end of the array.
+    ArrayFirstValueOrEnd,
+    /// Expecting the next token to be the next value in an array, or the end of the array.
+    ArrayNextValueOrEnd,
+    /// Expecting the next token to be the *first* key in the object, or the end of the object.
+    ObjectFirstKeyOrEnd,
+    /// Expecting the next token to be the next object key, or the end of the object.
+    ObjectNextKeyOrEnd,
+    /// Expecting the next token to be the value of a field in an object.
+    ObjectFieldValue,
+}
+
+/// An iterator over a `&[u8]` that yields `Result<Token, Error>` with [Token] being JSON tokens.
+/// Construct with [json_token_iter].
+pub struct JsonTokenIterator<'a> {
+    input: &'a [u8],
+    index: usize,
+    state_stack: Vec<State>,
+}
+
+impl<'a> JsonTokenIterator<'a> {
+    /// Previews the next byte.
+    fn peek_byte(&self) -> Option<u8> {
+        if self.index >= self.input.len() {
+            None
+        } else {
+            Some(self.input[self.index])
+        }
+    }
+
+    /// Expects there to be another byte coming up, and previews it.
+    /// If there isn't, an `UnexpectedEOS` error is returned.
+    fn peek_expect(&self) -> Result<u8, Error> {
+        self.peek_byte().ok_or_else(|| self.error(UnexpectedEOS))
+    }
+
+    /// Advances to the next byte in the stream.
+    fn advance(&mut self) {
+        if self.index < self.input.len() {
+            self.index += 1;
+        }
+    }
+
+    /// Advances and returns the next byte in the stream.
+    fn next_byte(&mut self) -> Option<u8> {
+        let next = self.peek_byte();
+        self.advance();
+        next
+    }
+
+    /// Expects there to be another byte coming up, and returns it while advancing.
+    /// If there isn't, an `UnexpectedEOS` error is returned.
+    fn next_expect(&mut self) -> Result<u8, Error> {
+        self.next_byte().ok_or_else(|| self.error(UnexpectedEOS))
+    }
+
+    /// Creates an error at the given `offset` in the stream.
+    fn error_at(&self, offset: usize, reason: ErrorReason) -> Error {
+        Error { reason, offset }
+    }
+
+    /// Creates an error at the current offset in the stream.
+    fn error(&self, reason: ErrorReason) -> Error {
+        self.error_at(self.index, reason)
+    }
+
+    /// Advances until it hits a non-whitespace character or the end of the slice.
+    fn discard_whitespace(&mut self) {
+        while let Some(byte) = self.peek_byte() {
+            match byte {
+                b' ' | b'\t' | b'\r' | b'\n' => {
+                    self.advance();
+                }
+                _ => break,
+            }
+        }
+    }
+
+    /// Returns the top of the state stack (current state).
+    fn state(&self) -> State {
+        self.state_stack[self.state_stack.len() - 1]
+    }
+
+    /// Replaces the top of the state stack with a new `state`.
+    fn replace_state(&mut self, state: State) {
+        self.state_stack.pop();
+        self.state_stack.push(state);
+    }
+
+    /// Discards the '{' character and pushes the `ObjectFirstKeyOrEnd` state.
+    fn start_object(&mut self) -> Token {
+        let byte = self.next_byte();
+        debug_assert_eq!(byte, Some(b'{'));
+        self.state_stack.push(State::ObjectFirstKeyOrEnd);
+        Token::StartObject
+    }
+
+    /// Discards the '}' character and pops the current state.
+    fn end_object(&mut self) -> Token {
+        let (byte, state) = (self.next_byte(), self.state_stack.pop());
+        debug_assert_eq!(byte, Some(b'}'));
+        debug_assert!(
+            state == Some(State::ObjectFirstKeyOrEnd) || state == Some(State::ObjectNextKeyOrEnd)
+        );
+        Token::EndObject
+    }
+
+    /// Discards the '[' character and pushes the `ArrayFirstValueOrEnd` state.
+    fn start_array(&mut self) -> Token {
+        let byte = self.next_byte();
+        debug_assert_eq!(byte, Some(b'['));
+        self.state_stack.push(State::ArrayFirstValueOrEnd);
+        Token::StartArray
+    }
+
+    /// Discards the ']' character and pops the current state.
+    fn end_array(&mut self) -> Token {
+        let (byte, state) = (self.next_byte(), self.state_stack.pop());
+        debug_assert_eq!(byte, Some(b']'));
+        debug_assert!(
+            state == Some(State::ArrayFirstValueOrEnd) || state == Some(State::ArrayNextValueOrEnd)
+        );
+        Token::EndArray
+    }
+
+    /// Reads a JSON Unicode escape sequence (i.e., "\u1234").
+    fn read_unicode_escape(&mut self, into: &mut Vec<u8>) -> Result<(), Error> {
+        let (start, end) = (self.index, self.index + 4);
+        if end > self.input.len() {
+            return Err(self.error(UnexpectedEOS));
+        }
+
+        let codepoint_str =
+            std::str::from_utf8(&self.input[start..end]).map_err(|err| self.error(err.into()))?;
+        let codepoint = u32::from_str_radix(codepoint_str, 16)
+            .map_err(|_| self.error(ErrorReason::InvalidUnicodeEscape(codepoint_str.into())))?;
+        let codepoint = char::from_u32(codepoint)
+            .ok_or_else(|| self.error(InvalidUnicodeEscape(codepoint_str.into())))?;
+        match codepoint.len_utf8() {
+            1 => into.push(codepoint as u8),
+            _ => into.extend_from_slice(codepoint.encode_utf8(&mut [0; 4]).as_bytes()),
+        }
+        self.index = end;
+        Ok(())
+    }
+
+    /// Reads a JSON string out of the stream.
+    fn read_string(&mut self) -> Result<String, Error> {
+        // Skip the starting quote
+        let quote_byte = self.next_byte();
+        debug_assert_eq!(quote_byte, Some(b'\"'));
+
+        // Read bytes until a non-escaped end-quote, unescaping sequences as needed on the fly
+        let mut value = Vec::new();
+        loop {
+            match self.next_expect()? {
+                b'"' => return String::from_utf8(value).map_err(|_| self.error(InvalidUtf8)),
+                b'\\' => match self.next_expect()? {
+                    b'\\' => value.push(b'\\'),
+                    b'/' => value.push(b'/'),
+                    b'"' => value.push(b'"'),
+                    b'b' => value.push(0x08),
+                    b'f' => value.push(0x0C),
+                    b'n' => value.push(b'\n'),
+                    b'r' => value.push(b'\r'),
+                    b't' => value.push(b'\t'),
+                    b'u' => self.read_unicode_escape(&mut value)?,
+                    byte => return Err(self.error(InvalidEscape(byte.into()))),
+                },
+                byte @ 0x00..=0x1F => return Err(self.error(UnexpectedControlCharacter(byte))),
+                byte => value.push(byte),
+            }
+        }
+    }
+
+    /// Expects the given literal to be next in the stream.
+    fn expect_literal(&mut self, expected: &[u8]) -> Result<(), Error> {
+        let (start, end) = (self.index, self.index + expected.len());
+        if end > self.input.len() {
+            return Err(self.error_at(self.input.len(), UnexpectedEOS));
+        }
+        if expected != &self.input[start..end] {
+            return Err(self.error_at(
+                start,
+                ExpectedLiteral(std::str::from_utf8(expected).unwrap().into()),
+            ));
+        }
+        self.index = end;
+        Ok(())
+    }
+
+    /// Expects a literal `null` next in the stream.
+    fn expect_null(&mut self) -> Result<Token, Error> {
+        self.expect_literal(b"null")?;
+        Ok(Token::ValueNull)
+    }
+
+    /// Expects a boolean `true` / `false` to be next in the stream and returns its value.
+    fn expect_bool(&mut self) -> Result<Token, Error> {
+        match self.peek_expect()? {
+            b't' => {
+                self.expect_literal(b"true")?;
+                Ok(Token::ValueBool(true))
+            }
+            b'f' => {
+                self.expect_literal(b"false")?;
+                Ok(Token::ValueBool(false))
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    /// Advances past the exponent part of a floating point number.
+    fn skip_exponent(&mut self) {
+        self.advance();
+        match self.peek_byte() {
+            Some(b'-') => self.advance(),
+            Some(b'+') => self.advance(),
+            _ => {}
+        }
+        while let Some(b'0'..=b'9') = self.peek_byte() {
+            self.advance();
+        }
+    }
+
+    /// Advances past the decimal part of a floating point number.
+    fn skip_decimal(&mut self) {
+        self.advance();
+        while let Some(byte) = self.peek_byte() {
+            match byte {
+                b'0'..=b'9' => self.advance(),
+                b'e' | b'E' => self.skip_exponent(),
+                _ => break,
+            }
+        }
+    }
+
+    /// Starting from the current location in the stream, this advances until
+    /// it finds a character that doesn't look like it's part of a number, and then
+    /// returns `(start_index, end_index, negative, floating)`, with `start_index`
+    /// and `end_index` representing the slice of the stream that is the number,
+    /// `negative` whether or not it is a negative number, and `floating` whether or not
+    /// it is a floating point number.
+    fn scan_number(&mut self) -> (usize, usize, bool, bool) {
+        let start_index = self.index;
+        let negative = if self.peek_byte() == Some(b'-') {
+            self.advance();
+            true
+        } else {
+            false
+        };
+        let mut floating = false;
+        while let Some(byte) = self.peek_byte() {
+            match byte {
+                b'0'..=b'9' => self.advance(),
+                b'.' => {
+                    floating = true;
+                    self.skip_decimal();
+                }
+                b'e' | b'E' => {
+                    floating = true;
+                    self.skip_exponent();
+                }
+                _ => break,
+            }
+        }
+        (start_index, self.index, negative, floating)
+    }
+
+    /// Expects a number in the stream, and returns its value.
+    fn expect_number(&mut self) -> Result<Token, Error> {
+        let (start, end, negative, floating) = self.scan_number();
+        let number_slice = &self.input[start..end];
+
+        // Unsafe: we examined every character in the range, and they are all number characters
+        debug_assert!(std::str::from_utf8(number_slice).is_ok());
+        let number_str = unsafe { std::str::from_utf8_unchecked(number_slice) };
+
+        use std::str::FromStr;
+        Ok(Token::ValueNumber(if floating {
+            Number::Float(
+                f64::from_str(&number_str).map_err(|_| self.error_at(start, InvalidNumber))?,
+            )
+        } else if negative {
+            // If the negative value overflows, then stuff it into an f64
+            let positive =
+                u64::from_str(&number_str[1..]).map_err(|_| self.error_at(start, InvalidNumber))?;
+            let negative = positive.wrapping_neg() as i64;
+            if negative > 0 {
+                Number::Float(-(positive as f64))
+            } else {
+                Number::NegInt(negative as i64)
+            }
+        } else {
+            Number::PosInt(
+                u64::from_str(&number_str).map_err(|_| self.error_at(start, InvalidNumber))?,
+            )
+        }))
+    }
+
+    /// Reads a value from the stream and returns the next token. For objects and arrays,
+    /// the entire object or array will not be read, but rather, a [StartObject]/[StartArray]
+    /// will be returned.
+    fn read_value(&mut self) -> Result<Token, Error> {
+        self.discard_whitespace();
+        match self.peek_expect()? {
+            b'{' => Ok(self.start_object()),
+            b'[' => Ok(self.start_array()),
+            b'"' => self.read_string().map(Token::ValueString),
+            byte => {
+                let value = match byte {
+                    b'n' => self.expect_null(),
+                    b't' | b'f' => self.expect_bool(),
+                    b'-' | (b'0'..=b'9') => self.expect_number(),
+                    byte => Err(self.error(UnexpectedToken(
+                        byte.into(),
+                        "'{', '[', '\"', 'null', 'true', 'false', <number>",
+                    ))),
+                }?;
+                // Verify there are no unexpected trailers on the end of the value
+                if let Some(byte) = self.peek_byte() {
+                    match byte {
+                        b' ' | b'\t' | b'\r' | b'\n' | b'}' | b']' | b',' => {}
+                        _ => {
+                            return Err(self.error(UnexpectedToken(
+                                byte.into(),
+                                "<whitespace>, '}', ']', ','",
+                            )))
+                        }
+                    }
+                }
+                Ok(value)
+            }
+        }
+    }
+
+    /// Handles the [ArrayFirstValueOrEnd] state.
+    fn state_array_first_value_or_end(&mut self) -> Result<Token, Error> {
+        match self.peek_expect()? {
+            b']' => Ok(self.end_array()),
+            _ => {
+                self.replace_state(State::ArrayNextValueOrEnd);
+                self.read_value()
+            }
+        }
+    }
+
+    /// Handles the [ArrayNextValueOrEnd] state.
+    fn state_array_next_value_or_end(&mut self) -> Result<Token, Error> {
+        match self.peek_expect()? {
+            b']' => Ok(self.end_array()),
+            b',' => {
+                self.advance();
+                self.read_value()
+            }
+            byte => Err(self.error(UnexpectedToken(byte.into(), "']', ','"))),
+        }
+    }
+
+    /// Expects an object key.
+    fn object_key(&mut self) -> Result<Token, Error> {
+        match self.peek_expect()? {
+            b'"' => {
+                self.replace_state(State::ObjectFieldValue);
+                self.read_string().map(Token::ObjectKey)
+            }
+            byte => Err(self.error(UnexpectedToken(byte.into(), "'\"'"))),
+        }
+    }
+
+    /// Handles the [ObjectFirstKeyOrEnd] state.
+    fn state_object_first_key_or_end(&mut self) -> Result<Token, Error> {
+        match self.peek_expect()? {
+            b'}' => Ok(self.end_object()),
+            _ => self.object_key(),
+        }
+    }
+
+    /// Handles the [ObjectNextKeyOrEnd] state.
+    fn state_object_next_key_or_end(&mut self) -> Result<Token, Error> {
+        match self.peek_expect()? {
+            b'}' => Ok(self.end_object()),
+            b',' => {
+                self.advance();
+                self.discard_whitespace();
+                self.object_key()
+            }
+            byte => Err(self.error(UnexpectedToken(byte.into(), "'}', ','"))),
+        }
+    }
+
+    /// Handles the [ObjectFieldValue] state.
+    fn state_object_field_value(&mut self) -> Result<Token, Error> {
+        match self.peek_expect()? {
+            b':' => {
+                self.advance();
+                self.replace_state(State::ObjectNextKeyOrEnd);
+                self.read_value()
+            }
+            byte => Err(self.error(UnexpectedToken(byte.into(), "':'"))),
+        }
+    }
+}
+
+impl<'a> Iterator for JsonTokenIterator<'a> {
+    type Item = Result<Token, Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        debug_assert!(self.index <= self.input.len());
+        if self.index == self.input.len() {
+            return None;
+        }
+
+        self.discard_whitespace();
+        let result = match self.state() {
+            State::Initial => self.peek_byte().map(|_| self.read_value()),
+            State::ArrayFirstValueOrEnd => Some(self.state_array_first_value_or_end()),
+            State::ArrayNextValueOrEnd => Some(self.state_array_next_value_or_end()),
+            State::ObjectFirstKeyOrEnd => Some(self.state_object_first_key_or_end()),
+            State::ObjectNextKeyOrEnd => Some(self.state_object_next_key_or_end()),
+            State::ObjectFieldValue => Some(self.state_object_field_value()),
+        };
+        // Invalidate the stream if we encountered an error
+        if result.as_ref().map(|r| r.is_err()).unwrap_or(false) {
+            self.index = self.input.len();
+        }
+        result
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::deserialize::{json_token_iter, Error, ErrorReason, Token};
+    use crate::escape::escape_string;
+    use proptest::prelude::*;
+    use smithy_types::Number;
+
+    #[test]
+    fn test_empty() {
+        assert_eq!(None, json_token_iter(b"").next());
+        assert_eq!(None, json_token_iter(b" ").next());
+        assert_eq!(None, json_token_iter(b"\t").next());
+    }
+
+    #[test]
+    fn test_empty_string() {
+        let mut iter = json_token_iter(b"\"\"");
+        assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next());
+        assert_eq!(None, iter.next());
+
+        let mut iter = json_token_iter(b" \r\n\t \"\"  ");
+        assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next());
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn test_empty_array() {
+        let mut iter = json_token_iter(b"[]");
+        assert_eq!(Some(Ok(Token::StartArray)), iter.next());
+        assert_eq!(Some(Ok(Token::EndArray)), iter.next());
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn test_empty_object() {
+        let mut iter = json_token_iter(b"{}");
+        assert_eq!(Some(Ok(Token::StartObject)), iter.next());
+        assert_eq!(Some(Ok(Token::EndObject)), iter.next());
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn test_null() {
+        assert_eq!(
+            Some(Ok(Token::ValueNull)),
+            json_token_iter(b" null ").next()
+        );
+
+        let tokens: Result<Vec<Token>, Error> = json_token_iter(b"[null, null,null]").collect();
+        assert_eq!(
+            vec![
+                Token::StartArray,
+                Token::ValueNull,
+                Token::ValueNull,
+                Token::ValueNull,
+                Token::EndArray
+            ],
+            tokens.unwrap()
+        );
+
+        assert!(json_token_iter(b"n").next().unwrap().is_err());
+        assert!(json_token_iter(b"nul").next().unwrap().is_err());
+        assert!(json_token_iter(b"nulll").next().unwrap().is_err());
+    }
+
+    #[test]
+    fn test_bools() {
+        assert!(json_token_iter(b"tru").next().unwrap().is_err());
+        assert!(json_token_iter(b"truee").next().unwrap().is_err());
+        assert!(json_token_iter(b"f").next().unwrap().is_err());
+        assert!(json_token_iter(b"falsee").next().unwrap().is_err());
+        assert_eq!(
+            Some(Ok(Token::ValueBool(true))),
+            json_token_iter(b" true ").next()
+        );
+        assert_eq!(
+            Some(Ok(Token::ValueBool(false))),
+            json_token_iter(b"false").next()
+        );
+
+        let tokens: Result<Vec<Token>, Error> = json_token_iter(b"[true,false]").collect();
+        assert_eq!(
+            vec![
+                Token::StartArray,
+                Token::ValueBool(true),
+                Token::ValueBool(false),
+                Token::EndArray
+            ],
+            tokens.unwrap()
+        );
+    }
+
+    proptest! {
+        #[test]
+        fn string_prop_test(input in ".*") {
+            let json = format!("\"{}\"", escape_string(&input));
+
+            let serde_string: String = serde_json::from_str(&json).unwrap();
+
+            let mut iter = json_token_iter(json.as_bytes());
+            assert_eq!(Some(Ok(Token::ValueString(serde_string))), iter.next());
+            assert_eq!(None, iter.next());
+        }
+
+        #[test]
+        fn integer_prop_test(input: i64) {
+            let json = serde_json::to_string(&input).unwrap();
+            let mut iter = json_token_iter(json.as_bytes());
+            assert_eq!(Some(Ok(Token::ValueNumber(
+                if input < 0 {
+                    Number::NegInt(input)
+                } else {
+                    Number::PosInt(input as u64)
+                }))), iter.next());
+            assert_eq!(None, iter.next());
+        }
+
+        #[test]
+        fn float_prop_test(input: f64) {
+            let json = serde_json::to_string(&input).unwrap();
+            let mut iter = json_token_iter(json.as_bytes());
+            assert_eq!(Some(Ok(Token::ValueNumber(Number::Float(input)))), iter.next());
+            assert_eq!(None, iter.next());
+        }
+    }
+
+    #[test]
+    fn valid_numbers() {
+        let expect = |number, input| {
+            assert_eq!(
+                Token::ValueNumber(number),
+                json_token_iter(input).next().unwrap().unwrap()
+            );
+        };
+        expect(Number::Float(0.0), b"0.");
+        expect(Number::Float(0.0), b"0e0");
+        expect(Number::Float(0.0), b"0E0");
+        expect(Number::Float(10.0), b"1E1");
+        expect(Number::Float(10.0), b"1E+1");
+        expect(Number::Float(100.0), b"1e+2");
+
+        expect(Number::NegInt(-50000), b"-50000");
+        expect(
+            Number::Float(-18446744073709551615.0),
+            b"-18446744073709551615",
+        );
+    }
+
+    // These cases actually shouldn't parse according to the spec, but it's easier
+    // to be lenient on these, and it doesn't really impact the SDK use-case.
+    #[test]
+    fn invalid_numbers_we_are_intentionally_accepting() {
+        let expect = |number, input| {
+            assert_eq!(
+                Token::ValueNumber(number),
+                json_token_iter(input).next().unwrap().unwrap()
+            );
+        };
+
+        expect(Number::NegInt(-1), b"-01");
+        expect(Number::Float(-2.0), b"-2.");
+        expect(Number::Float(0.0), b"0.e1");
+        expect(Number::Float(0.002), b"2.e-3");
+        expect(Number::Float(2000.0), b"2.e3");
+        expect(Number::NegInt(-12), b"-012");
+        expect(Number::Float(-0.123), b"-.123");
+        expect(Number::Float(1.0), b"1.");
+        expect(Number::PosInt(12), b"012");
+    }
+
+    #[test]
+    fn invalid_numbers() {
+        let unexpected_token = |input, token, offset, msg| {
+            let tokens: Vec<Result<Token, Error>> = json_token_iter(input).collect();
+            assert_eq!(
+                vec![Err(Error {
+                    reason: ErrorReason::UnexpectedToken(token, msg),
+                    offset
+                }),],
+                tokens,
+                "input: \"{}\"",
+                std::str::from_utf8(input).unwrap(),
+            );
+        };
+
+        let invalid_number = |input, offset| {
+            let tokens: Vec<Result<Token, Error>> = json_token_iter(input).collect();
+            assert_eq!(
+                vec![Err(Error {
+                    reason: ErrorReason::InvalidNumber,
+                    offset
+                })],
+                tokens,
+                "input: \"{}\"",
+                std::str::from_utf8(input).unwrap(),
+            );
+        };
+
+        let unexpected_trailer = "<whitespace>, '}', ']', ','";
+        let unexpected_start = "'{', '[', '\"', 'null', 'true', 'false', <number>";
+
+        unexpected_token(b".", '.', 0, unexpected_start);
+        unexpected_token(b".0", '.', 0, unexpected_start);
+        unexpected_token(b"0-05", '-', 1, unexpected_trailer);
+        unexpected_token(b"0x05", 'x', 1, unexpected_trailer);
+        unexpected_token(b"123.invalid", 'i', 4, unexpected_trailer);
+        unexpected_token(b"123invalid", 'i', 3, unexpected_trailer);
+        unexpected_token(b"asdf", 'a', 0, unexpected_start);
+
+        invalid_number(b"-a", 0);
+        invalid_number(b"1e", 0);
+        invalid_number(b"1e-", 0);
+
+        // Number parsing fails before it even looks at the trailer because of invalid exponent
+        invalid_number(b"123.0Einvalid", 0);
+    }
+
+    #[test]
+    fn test_unclosed_array() {
+        let mut iter = json_token_iter(br#" [null "#);
+        assert_eq!(Some(Ok(Token::StartArray)), iter.next());
+        assert_eq!(Some(Ok(Token::ValueNull)), iter.next());
+        assert_eq!(
+            Some(Err(Error {
+                reason: ErrorReason::UnexpectedEOS,
+                offset: 7
+            })),
+            iter.next()
+        );
+    }
+
+    #[test]
+    fn test_array_with_items() {
+        let tokens: Result<Vec<Token>, Error> = json_token_iter(b"[[], {}, \"test\"]").collect();
+        assert_eq!(
+            vec![
+                Token::StartArray,
+                Token::StartArray,
+                Token::EndArray,
+                Token::StartObject,
+                Token::EndObject,
+                Token::ValueString("test".into()),
+                Token::EndArray,
+            ],
+            tokens.unwrap()
+        )
+    }
+
+    #[test]
+    fn test_object_with_items() {
+        let tokens: Result<Vec<Token>, Error> = json_token_iter(
+            br#"
+            { "some_int": 5,
+              "some_float": 5.2,
+              "some_negative": -5,
+              "some_negative_float": -2.4,
+              "some_string": "test",
+              "some_struct": { "nested": "asdf" },
+              "some_array": ["one", "two"] }
+            "#,
+        )
+        .collect();
+        assert_eq!(
+            vec![
+                Token::StartObject,
+                Token::ObjectKey("some_int".into()),
+                Token::ValueNumber(Number::PosInt(5)),
+                Token::ObjectKey("some_float".into()),
+                Token::ValueNumber(Number::Float(5.2)),
+                Token::ObjectKey("some_negative".into()),
+                Token::ValueNumber(Number::NegInt(-5)),
+                Token::ObjectKey("some_negative_float".into()),
+                Token::ValueNumber(Number::Float(-2.4)),
+                Token::ObjectKey("some_string".into()),
+                Token::ValueString("test".into()),
+                Token::ObjectKey("some_struct".into()),
+                Token::StartObject,
+                Token::ObjectKey("nested".into()),
+                Token::ValueString("asdf".into()),
+                Token::EndObject,
+                Token::ObjectKey("some_array".into()),
+                Token::StartArray,
+                Token::ValueString("one".into()),
+                Token::ValueString("two".into()),
+                Token::EndArray,
+                Token::EndObject,
+            ],
+            tokens.unwrap()
+        )
+    }
+
+    #[test]
+    fn test_object_trailing_comma() {
+        let mut iter = json_token_iter(br#" { "test": "trailing", } "#);
+        assert_eq!(Some(Ok(Token::StartObject)), iter.next());
+        assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next());
+        assert_eq!(Some(Ok(Token::ValueString("trailing".into()))), iter.next());
+        assert_eq!(
+            Some(Err(Error {
+                reason: ErrorReason::UnexpectedToken('}', "'\"'"),
+                offset: 23,
+            })),
+            iter.next()
+        );
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn test_object_no_colon() {
+        let mut iter = json_token_iter(br#" {"test" "#);
+        assert_eq!(Some(Ok(Token::StartObject)), iter.next());
+        assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next());
+        assert_eq!(
+            Some(Err(Error {
+                reason: ErrorReason::UnexpectedEOS,
+                offset: 9,
+            })),
+            iter.next()
+        );
+        assert_eq!(None, iter.next());
+    }
+
+    #[test]
+    fn unescaped_ctrl_characters() {
+        assert!(json_token_iter(b"\"test\x00test\"")
+            .next()
+            .unwrap()
+            .is_err());
+        assert!(json_token_iter(b"\"test\ntest\"").next().unwrap().is_err());
+        assert!(json_token_iter(b"\"test\ttest\"").next().unwrap().is_err());
+    }
+}
diff --git a/rust-runtime/smithy-json/src/lib.rs b/rust-runtime/smithy-json/src/lib.rs
index de371590ee..538378e046 100644
--- a/rust-runtime/smithy-json/src/lib.rs
+++ b/rust-runtime/smithy-json/src/lib.rs
@@ -5,5 +5,6 @@
 
 //! JSON Abstractions for Smithy
 
+pub mod deserialize;
 mod escape;
 pub mod serialize;

From 0a2c800575e856bcf66f5132e613a930e1374cac Mon Sep 17 00:00:00 2001
From: John DiSanti <jdisanti@amazon.com>
Date: Thu, 3 Jun 2021 12:17:53 -0700
Subject: [PATCH 2/3] Stop allocating string values and fix surrogate pair
 unescaping

---
 rust-runtime/smithy-json/src/deserialize.rs | 177 ++++++++--------
 rust-runtime/smithy-json/src/escape.rs      | 213 ++++++++++++++++++++
 2 files changed, 309 insertions(+), 81 deletions(-)

diff --git a/rust-runtime/smithy-json/src/deserialize.rs b/rust-runtime/smithy-json/src/deserialize.rs
index 9cfa1ef73a..03906f8ed5 100644
--- a/rust-runtime/smithy-json/src/deserialize.rs
+++ b/rust-runtime/smithy-json/src/deserialize.rs
@@ -3,14 +3,17 @@
  * SPDX-License-Identifier: Apache-2.0.
  */
 
+use crate::escape::unescape_string;
 use smithy_types::Number;
+use std::borrow::Cow;
 use std::fmt;
 use std::str::Utf8Error;
 
+pub use crate::escape::Error as EscapeError;
+
 #[derive(Debug, PartialEq, Eq)]
 pub enum ErrorReason {
     InvalidUtf8,
-    InvalidUnicodeEscape(String),
     InvalidEscape(char),
     InvalidNumber,
     ExpectedLiteral(String),
@@ -33,7 +36,6 @@ impl fmt::Display for Error {
         write!(f, "Error at offset {}: ", self.offset)?;
         match &self.reason {
             InvalidUtf8 => write!(f, "invalid UTF-8 codepoint in JSON stream"),
-            InvalidUnicodeEscape(escape) => write!(f, "invalid JSON Unicode escape: \\u{}", escape),
             InvalidEscape(escape) => write!(f, "invalid JSON escape: \\{}", escape),
             InvalidNumber => write!(f, "invalid number"),
             ExpectedLiteral(literal) => write!(f, "expected literal: {}", literal),
@@ -58,18 +60,40 @@ impl From<Utf8Error> for ErrorReason {
     }
 }
 
+/// New-type around `&str` that indicates the string is an escaped JSON string.
+/// Provides functions for retrieving the string in either form.
+#[derive(Debug, PartialEq, Eq)]
+pub struct EscapedStr<'a>(&'a str);
+
+impl<'a> EscapedStr<'a> {
+    /// Wraps `value`, which is expected to still be in its JSON-escaped form.
+    pub fn new(value: &'a str) -> EscapedStr<'a> {
+        EscapedStr(value)
+    }
+
+    /// Returns the escaped string; the borrow lasts for `'a`, not just for `&self`.
+    pub fn as_escaped_str(&self) -> &'a str {
+        self.0
+    }
+
+    /// Consumes self and unescapes it, borrowing when there is nothing to unescape.
+    pub fn into_unescaped(self) -> Result<Cow<'a, str>, EscapeError> {
+        unescape_string(self.0)
+    }
+}
+
 /// Enum representing the different JSON tokens that can be returned by [json_token_iter].
 #[derive(Debug, PartialEq)]
-pub enum Token {
+pub enum Token<'a> {
     StartArray,
     EndArray,
-    ObjectKey(String),
+    ObjectKey(EscapedStr<'a>),
     StartObject,
     EndObject,
     ValueBool(bool),
     ValueNull,
     ValueNumber(Number),
-    ValueString(String),
+    ValueString(EscapedStr<'a>),
 }
 
 /// Returns an Iterator of `Result<Token, Error>` over an slice of bytes.
@@ -176,7 +200,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Discards the '{' character and pushes the `ObjectFirstKeyOrEnd` state.
-    fn start_object(&mut self) -> Token {
+    fn start_object(&mut self) -> Token<'a> {
         let byte = self.next_byte();
         debug_assert_eq!(byte, Some(b'{'));
         self.state_stack.push(State::ObjectFirstKeyOrEnd);
@@ -184,7 +208,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Discards the '}' character and pops the current state.
-    fn end_object(&mut self) -> Token {
+    fn end_object(&mut self) -> Token<'a> {
         let (byte, state) = (self.next_byte(), self.state_stack.pop());
         debug_assert_eq!(byte, Some(b'}'));
         debug_assert!(
@@ -194,7 +218,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Discards the '[' character and pushes the `ArrayFirstValueOrEnd` state.
-    fn start_array(&mut self) -> Token {
+    fn start_array(&mut self) -> Token<'a> {
         let byte = self.next_byte();
         debug_assert_eq!(byte, Some(b'['));
         self.state_stack.push(State::ArrayFirstValueOrEnd);
@@ -202,7 +226,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Discards the ']' character and pops the current state.
-    fn end_array(&mut self) -> Token {
+    fn end_array(&mut self) -> Token<'a> {
         let (byte, state) = (self.next_byte(), self.state_stack.pop());
         debug_assert_eq!(byte, Some(b']'));
         debug_assert!(
@@ -211,52 +235,34 @@ impl<'a> JsonTokenIterator<'a> {
         Token::EndArray
     }
 
-    /// Reads a JSON Unicode escape sequence (i.e., "\u1234").
-    fn read_unicode_escape(&mut self, into: &mut Vec<u8>) -> Result<(), Error> {
-        let (start, end) = (self.index, self.index + 4);
-        if end > self.input.len() {
-            return Err(self.error(UnexpectedEOS));
-        }
-
-        let codepoint_str =
-            std::str::from_utf8(&self.input[start..end]).map_err(|err| self.error(err.into()))?;
-        let codepoint = u32::from_str_radix(codepoint_str, 16)
-            .map_err(|_| self.error(ErrorReason::InvalidUnicodeEscape(codepoint_str.into())))?;
-        let codepoint = char::from_u32(codepoint)
-            .ok_or_else(|| self.error(InvalidUnicodeEscape(codepoint_str.into())))?;
-        match codepoint.len_utf8() {
-            1 => into.push(codepoint as u8),
-            _ => into.extend_from_slice(codepoint.encode_utf8(&mut [0; 4]).as_bytes()),
-        }
-        self.index = end;
-        Ok(())
-    }
-
     /// Reads a JSON string out of the stream.
-    fn read_string(&mut self) -> Result<String, Error> {
+    fn read_string(&mut self) -> Result<&'a str, Error> {
         // Skip the starting quote
         let quote_byte = self.next_byte();
         debug_assert_eq!(quote_byte, Some(b'\"'));
 
         // Read bytes until a non-escaped end-quote, unescaping sequences as needed on the fly
-        let mut value = Vec::new();
+        let start = self.index;
         loop {
-            match self.next_expect()? {
-                b'"' => return String::from_utf8(value).map_err(|_| self.error(InvalidUtf8)),
+            match self.peek_expect()? {
+                b'"' => {
+                    let value = std::str::from_utf8(&self.input[start..self.index])
+                        .map_err(|_| self.error(InvalidUtf8))?;
+                    self.advance();
+                    return Ok(value);
+                }
                 b'\\' => match self.next_expect()? {
-                    b'\\' => value.push(b'\\'),
-                    b'/' => value.push(b'/'),
-                    b'"' => value.push(b'"'),
-                    b'b' => value.push(0x08),
-                    b'f' => value.push(0x0C),
-                    b'n' => value.push(b'\n'),
-                    b'r' => value.push(b'\r'),
-                    b't' => value.push(b'\t'),
-                    b'u' => self.read_unicode_escape(&mut value)?,
+                    b'\\' | b'/' | b'"' | b'b' | b'f' | b'n' | b'r' | b't' => self.advance(),
+                    b'u' => {
+                        if self.index + 4 > self.input.len() {
+                            return Err(self.error_at(self.input.len(), UnexpectedEOS));
+                        }
+                        self.index += 4;
+                    }
                     byte => return Err(self.error(InvalidEscape(byte.into()))),
                 },
                 byte @ 0x00..=0x1F => return Err(self.error(UnexpectedControlCharacter(byte))),
-                byte => value.push(byte),
+                _ => self.advance(),
             }
         }
     }
@@ -278,13 +284,13 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Expects a literal `null` next in the stream.
-    fn expect_null(&mut self) -> Result<Token, Error> {
+    fn expect_null(&mut self) -> Result<Token<'a>, Error> {
         self.expect_literal(b"null")?;
         Ok(Token::ValueNull)
     }
 
     /// Expects a boolean `true` / `false` to be next in the stream and returns its value.
-    fn expect_bool(&mut self) -> Result<Token, Error> {
+    fn expect_bool(&mut self) -> Result<Token<'a>, Error> {
         match self.peek_expect()? {
             b't' => {
                 self.expect_literal(b"true")?;
@@ -356,7 +362,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Expects a number in the stream, and returns its value.
-    fn expect_number(&mut self) -> Result<Token, Error> {
+    fn expect_number(&mut self) -> Result<Token<'a>, Error> {
         let (start, end, negative, floating) = self.scan_number();
         let number_slice = &self.input[start..end];
 
@@ -389,12 +395,14 @@ impl<'a> JsonTokenIterator<'a> {
     /// Reads a value from the stream and returns the next token. For objects and arrays,
     /// the entire object or array will not be ready, but rather, a [StartObject]/[StartArray]
     /// will be returned.
-    fn read_value(&mut self) -> Result<Token, Error> {
+    fn read_value(&mut self) -> Result<Token<'a>, Error> {
         self.discard_whitespace();
         match self.peek_expect()? {
             b'{' => Ok(self.start_object()),
             b'[' => Ok(self.start_array()),
-            b'"' => self.read_string().map(Token::ValueString),
+            b'"' => self
+                .read_string()
+                .map(|s| Token::ValueString(EscapedStr(s))),
             byte => {
                 let value = match byte {
                     b'n' => self.expect_null(),
@@ -423,7 +431,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Handles the [ArrayFirstValueOrEnd] state.
-    fn state_array_first_value_or_end(&mut self) -> Result<Token, Error> {
+    fn state_array_first_value_or_end(&mut self) -> Result<Token<'a>, Error> {
         match self.peek_expect()? {
             b']' => Ok(self.end_array()),
             _ => {
@@ -434,7 +442,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Handles the [ArrayNextValueOrEnd] state.
-    fn state_array_next_value_or_end(&mut self) -> Result<Token, Error> {
+    fn state_array_next_value_or_end(&mut self) -> Result<Token<'a>, Error> {
         match self.peek_expect()? {
             b']' => Ok(self.end_array()),
             b',' => {
@@ -446,18 +454,18 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Expects an object key.
-    fn object_key(&mut self) -> Result<Token, Error> {
+    fn object_key(&mut self) -> Result<Token<'a>, Error> {
         match self.peek_expect()? {
             b'"' => {
                 self.replace_state(State::ObjectFieldValue);
-                self.read_string().map(Token::ObjectKey)
+                self.read_string().map(|s| Token::ObjectKey(EscapedStr(s)))
             }
             byte => Err(self.error(UnexpectedToken(byte.into(), "'\"'"))),
         }
     }
 
     /// Handles the [ObjectFirstKeyOrEnd] state.
-    fn state_object_first_key_or_end(&mut self) -> Result<Token, Error> {
+    fn state_object_first_key_or_end(&mut self) -> Result<Token<'a>, Error> {
         match self.peek_expect()? {
             b'}' => Ok(self.end_object()),
             _ => self.object_key(),
@@ -465,7 +473,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Handles the [ObjectNextKeyOrEnd] state.
-    fn state_object_next_key_or_end(&mut self) -> Result<Token, Error> {
+    fn state_object_next_key_or_end(&mut self) -> Result<Token<'a>, Error> {
         match self.peek_expect()? {
             b'}' => Ok(self.end_object()),
             b',' => {
@@ -478,7 +486,7 @@ impl<'a> JsonTokenIterator<'a> {
     }
 
     /// Handles the [ObjectFieldValue] state.
-    fn state_object_field_value(&mut self) -> Result<Token, Error> {
+    fn state_object_field_value(&mut self) -> Result<Token<'a>, Error> {
         match self.peek_expect()? {
             b':' => {
                 self.advance();
@@ -491,7 +499,7 @@ impl<'a> JsonTokenIterator<'a> {
 }
 
 impl<'a> Iterator for JsonTokenIterator<'a> {
-    type Item = Result<Token, Error>;
+    type Item = Result<Token<'a>, Error>;
 
     fn next(&mut self) -> Option<Self::Item> {
         debug_assert!(self.index <= self.input.len());
@@ -518,8 +526,7 @@ impl<'a> Iterator for JsonTokenIterator<'a> {
 
 #[cfg(test)]
 mod tests {
-    use crate::deserialize::{json_token_iter, Error, ErrorReason, Token};
-    use crate::escape::escape_string;
+    use crate::deserialize::{json_token_iter, Error, ErrorReason, EscapedStr, Token};
     use proptest::prelude::*;
     use smithy_types::Number;
 
@@ -533,11 +540,11 @@ mod tests {
     #[test]
     fn test_empty_string() {
         let mut iter = json_token_iter(b"\"\"");
-        assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next());
+        assert_eq!(Some(Ok(Token::ValueString(EscapedStr("")))), iter.next());
         assert_eq!(None, iter.next());
 
         let mut iter = json_token_iter(b" \r\n\t \"\"  ");
-        assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next());
+        assert_eq!(Some(Ok(Token::ValueString(EscapedStr("")))), iter.next());
         assert_eq!(None, iter.next());
     }
 
@@ -611,12 +618,10 @@ mod tests {
     proptest! {
         #[test]
         fn string_prop_test(input in ".*") {
-            let json = format!("\"{}\"", escape_string(&input));
-
-            let serde_string: String = serde_json::from_str(&json).unwrap();
+            let json: String = serde_json::to_string(&input).unwrap();
 
             let mut iter = json_token_iter(json.as_bytes());
-            assert_eq!(Some(Ok(Token::ValueString(serde_string))), iter.next());
+            assert_eq!(Some(Ok(Token::ValueString(EscapedStr(&json[1..(json.len()-1)])))), iter.next());
             assert_eq!(None, iter.next());
         }
 
@@ -757,7 +762,7 @@ mod tests {
                 Token::EndArray,
                 Token::StartObject,
                 Token::EndObject,
-                Token::ValueString("test".into()),
+                Token::ValueString(EscapedStr("test")),
                 Token::EndArray,
             ],
             tokens.unwrap()
@@ -781,25 +786,25 @@ mod tests {
         assert_eq!(
             vec![
                 Token::StartObject,
-                Token::ObjectKey("some_int".into()),
+                Token::ObjectKey(EscapedStr("some_int")),
                 Token::ValueNumber(Number::PosInt(5)),
-                Token::ObjectKey("some_float".into()),
+                Token::ObjectKey(EscapedStr("some_float")),
                 Token::ValueNumber(Number::Float(5.2)),
-                Token::ObjectKey("some_negative".into()),
+                Token::ObjectKey(EscapedStr("some_negative")),
                 Token::ValueNumber(Number::NegInt(-5)),
-                Token::ObjectKey("some_negative_float".into()),
+                Token::ObjectKey(EscapedStr("some_negative_float")),
                 Token::ValueNumber(Number::Float(-2.4)),
-                Token::ObjectKey("some_string".into()),
-                Token::ValueString("test".into()),
-                Token::ObjectKey("some_struct".into()),
+                Token::ObjectKey(EscapedStr("some_string")),
+                Token::ValueString(EscapedStr("test")),
+                Token::ObjectKey(EscapedStr("some_struct")),
                 Token::StartObject,
-                Token::ObjectKey("nested".into()),
-                Token::ValueString("asdf".into()),
+                Token::ObjectKey(EscapedStr("nested")),
+                Token::ValueString(EscapedStr("asdf")),
                 Token::EndObject,
-                Token::ObjectKey("some_array".into()),
+                Token::ObjectKey(EscapedStr("some_array")),
                 Token::StartArray,
-                Token::ValueString("one".into()),
-                Token::ValueString("two".into()),
+                Token::ValueString(EscapedStr("one")),
+                Token::ValueString(EscapedStr("two")),
                 Token::EndArray,
                 Token::EndObject,
             ],
@@ -811,8 +816,11 @@ mod tests {
     fn test_object_trailing_comma() {
         let mut iter = json_token_iter(br#" { "test": "trailing", } "#);
         assert_eq!(Some(Ok(Token::StartObject)), iter.next());
-        assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next());
-        assert_eq!(Some(Ok(Token::ValueString("trailing".into()))), iter.next());
+        assert_eq!(Some(Ok(Token::ObjectKey(EscapedStr("test")))), iter.next());
+        assert_eq!(
+            Some(Ok(Token::ValueString(EscapedStr("trailing")))),
+            iter.next()
+        );
         assert_eq!(
             Some(Err(Error {
                 reason: ErrorReason::UnexpectedToken('}', "'\"'"),
@@ -827,7 +835,7 @@ mod tests {
     fn test_object_no_colon() {
         let mut iter = json_token_iter(br#" {"test" "#);
         assert_eq!(Some(Ok(Token::StartObject)), iter.next());
-        assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next());
+        assert_eq!(Some(Ok(Token::ObjectKey(EscapedStr("test")))), iter.next());
         assert_eq!(
             Some(Err(Error {
                 reason: ErrorReason::UnexpectedEOS,
@@ -847,4 +855,11 @@ mod tests {
         assert!(json_token_iter(b"\"test\ntest\"").next().unwrap().is_err());
         assert!(json_token_iter(b"\"test\ttest\"").next().unwrap().is_err());
     }
+
+    #[test]
+    fn escaped_str() {
+        let escaped = EscapedStr::new("foo\\nbar");
+        assert_eq!("foo\\nbar", escaped.as_escaped_str());
+        assert_eq!("foo\nbar", escaped.into_unescaped().unwrap());
+    }
 }
diff --git a/rust-runtime/smithy-json/src/escape.rs b/rust-runtime/smithy-json/src/escape.rs
index 58d4a14040..8f2faf6807 100644
--- a/rust-runtime/smithy-json/src/escape.rs
+++ b/rust-runtime/smithy-json/src/escape.rs
@@ -4,6 +4,41 @@
  */
 
 use std::borrow::Cow;
+use std::fmt;
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Error {
+    ExpectedSurrogatePair(String),
+    InvalidEscapeCharacter(char),
+    InvalidSurrogatePair(u16, u16),
+    InvalidUnicodeEscape(String),
+    InvalidUtf8,
+    UnexpectedEndOfString,
+}
+
+impl std::error::Error for Error {}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use Error::*;
+        match self {
+            ExpectedSurrogatePair(low) => {
+                write!(
+                    f,
+                    "expected a UTF-16 surrogate pair, but got {} as the low word",
+                    low
+                )
+            }
+            InvalidEscapeCharacter(chr) => write!(f, "invalid JSON escape: \\{}", chr),
+            InvalidSurrogatePair(high, low) => {
+                write!(f, "invalid surrogate pair: \\u{:04X}\\u{:04X}", high, low)
+            }
+            InvalidUnicodeEscape(escape) => write!(f, "invalid JSON Unicode escape: \\u{}", escape),
+            InvalidUtf8 => write!(f, "invalid UTF-8 codepoint in JSON string"),
+            UnexpectedEndOfString => write!(f, "unexpected end of string"),
+        }
+    }
+}
 
 /// Escapes a string for embedding in a JSON string value.
 pub fn escape_string(value: &str) -> Cow<str> {
@@ -45,9 +80,119 @@ fn escape_string_inner(start: &[u8], rest: &[u8]) -> String {
     unsafe { String::from_utf8_unchecked(escaped) }
 }
 
+/// Unescapes a JSON-escaped string.
+/// If there are no escape sequences, it directly returns the reference.
+pub fn unescape_string(value: &str) -> Result<Cow<str>, Error> {
+    let bytes = value.as_bytes();
+    for (index, byte) in bytes.iter().enumerate() {
+        if *byte == b'\\' {
+            return unescape_string_inner(&bytes[0..index], &bytes[index..]).map(Cow::Owned);
+        }
+    }
+    Ok(Cow::Borrowed(value))
+}
+
+fn unescape_string_inner(start: &[u8], rest: &[u8]) -> Result<String, Error> {
+    let mut unescaped = Vec::with_capacity(start.len() + rest.len());
+    unescaped.extend(start);
+
+    let mut index = 0;
+    while index < rest.len() {
+        match rest[index] {
+            b'\\' => {
+                index += 1;
+                if index == rest.len() {
+                    return Err(Error::UnexpectedEndOfString);
+                }
+                match rest[index] {
+                    b'u' => {
+                        index -= 1;
+                        index += read_unicode_escapes(&rest[index..], &mut unescaped)?;
+                    }
+                    byte => {
+                        match byte {
+                            b'\\' => unescaped.push(b'\\'),
+                            b'/' => unescaped.push(b'/'),
+                            b'"' => unescaped.push(b'"'),
+                            b'b' => unescaped.push(0x08),
+                            b'f' => unescaped.push(0x0C),
+                            b'n' => unescaped.push(b'\n'),
+                            b'r' => unescaped.push(b'\r'),
+                            b't' => unescaped.push(b'\t'),
+                            _ => return Err(Error::InvalidEscapeCharacter(byte.into())),
+                        }
+                        index += 1;
+                    }
+                }
+            }
+            byte => {
+                unescaped.push(byte);
+                index += 1
+            }
+        }
+    }
+
+    String::from_utf8(unescaped).map_err(|_| Error::InvalidUtf8)
+}
+
+fn is_utf16_low_surrogate(codepoint: u16) -> bool {
+    codepoint & 0xFC00 == 0xDC00
+}
+
+fn is_utf16_high_surrogate(codepoint: u16) -> bool {
+    codepoint & 0xFC00 == 0xD800
+}
+
+/// Parses one `\uXXXX` escape at the start of `rest` into its UTF-16 code unit.
+fn read_codepoint(rest: &[u8]) -> Result<u16, Error> {
+    if rest.len() < 6 {
+        return Err(Error::UnexpectedEndOfString);
+    }
+    if &rest[0..2] != b"\\u" {
+        // unescape_string_inner checks the first "\u", so this is a missing low surrogate.
+        let found = String::from_utf8_lossy(&rest[0..6]).into_owned();
+        return Err(Error::ExpectedSurrogatePair(found));
+    }
+    let codepoint_str = std::str::from_utf8(&rest[2..6]).map_err(|_| Error::InvalidUtf8)?;
+    // `from_str_radix` accepts a leading `+`, but JSON requires exactly four hex digits.
+    let all_hex = codepoint_str.bytes().all(|byte| byte.is_ascii_hexdigit());
+    let parsed = u16::from_str_radix(codepoint_str, 16).ok().filter(|_| all_hex);
+    parsed.ok_or_else(|| Error::InvalidUnicodeEscape(codepoint_str.into()))
+}
+
+/// Decodes the "\uXXXX" escape at the start of `bytes` (plus the low half of a surrogate
+/// pair, if needed) into `into`, returning the number of input bytes consumed (6 or 12).
+fn read_unicode_escapes(bytes: &[u8], into: &mut Vec<u8>) -> Result<usize, Error> {
+    let high = read_codepoint(bytes)?;
+    let (bytes_read, chr) = if is_utf16_high_surrogate(high) {
+        let low = read_codepoint(&bytes[6..])?;
+        if !is_utf16_low_surrogate(low) {
+            return Err(Error::InvalidSurrogatePair(high, low));
+        }
+
+        let codepoint =
+            char::from_u32(0x10000 + (high - 0xD800) as u32 * 0x400 + (low - 0xDC00) as u32)
+                .ok_or(Error::InvalidSurrogatePair(high, low))?;
+        (12, codepoint)
+    } else {
+        // Report only the four hex digits: the `Display` impl already prepends "\u".
+        let codepoint = char::from_u32(high as u32).ok_or_else(|| {
+            Error::InvalidUnicodeEscape(String::from_utf8_lossy(&bytes[2..6]).into())
+        })?;
+        (6, codepoint)
+    };
+    match chr.len_utf8() {
+        1 => into.push(chr as u8),
+        _ => into.extend_from_slice(chr.encode_utf8(&mut [0; 4]).as_bytes()),
+    }
+    Ok(bytes_read)
+}
+
 #[cfg(test)]
 mod test {
     use super::escape_string;
+    use crate::escape::{unescape_string, Error};
+    use std::borrow::Cow;
 
     #[test]
     fn escape() {
@@ -66,6 +211,49 @@ mod test {
         assert_eq!("\\u001f", escape_string("\u{1f}").as_ref());
     }
 
+    #[test]
+    fn unescape_no_escapes() {
+        let unescaped = unescape_string("test test").unwrap();
+        assert_eq!("test test", unescaped);
+        assert!(matches!(unescaped, Cow::Borrowed(_)));
+    }
+
+    #[test]
+    fn unescape() {
+        assert_eq!(
+            "\x08f\x0Co\to\r\n",
+            unescape_string(r#"\bf\fo\to\r\n"#).unwrap()
+        );
+        assert_eq!("\"test\"", unescape_string(r#"\"test\""#).unwrap());
+        assert_eq!("\x00", unescape_string("\\u0000").unwrap());
+        assert_eq!("\x1f", unescape_string("\\u001f").unwrap());
+        assert_eq!("foo\r\nbar", unescape_string("foo\\r\\nbar").unwrap());
+        assert_eq!("foo\r\n", unescape_string("foo\\r\\n").unwrap());
+        assert_eq!("\r\nbar", unescape_string("\\r\\nbar").unwrap());
+        assert_eq!("\u{10437}", unescape_string("\\uD801\\uDC37").unwrap());
+
+        assert_eq!(Err(Error::UnexpectedEndOfString), unescape_string("\\"));
+        assert_eq!(Err(Error::UnexpectedEndOfString), unescape_string("\\u"));
+        assert_eq!(Err(Error::UnexpectedEndOfString), unescape_string("\\u00"));
+        assert_eq!(
+            Err(Error::InvalidEscapeCharacter('z')),
+            unescape_string("\\z")
+        );
+
+        assert_eq!(
+            Err(Error::ExpectedSurrogatePair("\\nasdf".into())),
+            unescape_string("\\uD801\\nasdf")
+        );
+        assert_eq!(
+            Err(Error::UnexpectedEndOfString),
+            unescape_string("\\uD801\\u00")
+        );
+        assert_eq!(
+            Err(Error::InvalidSurrogatePair(0xD801, 0xC501)),
+            unescape_string("\\uD801\\uC501")
+        );
+    }
+
     use proptest::proptest;
     proptest! {
         #[test]
@@ -74,6 +262,31 @@ mod test {
             let serde_escaped = &serde_escaped[1..(serde_escaped.len() - 1)];
             assert_eq!(serde_escaped,escape_string(&s))
         }
+
+        #[test]
+        fn round_trip(chr in proptest::char::any()) {
+            let mut original = String::new();
+            original.push(chr);
+
+            let escaped = escape_string(&original);
+            let unescaped = unescape_string(&escaped).unwrap();
+            assert_eq!(original, unescaped);
+        }
+
+        #[test]
+        fn unicode_surrogates(chr in proptest::char::range(
+            char::from_u32(0x10000).unwrap(),
+            char::from_u32(0x10FFFF).unwrap(),
+        )) {
+            let mut codepoints = [0; 2];
+            chr.encode_utf16(&mut codepoints);
+
+            let escaped = format!("\\u{:04X}\\u{:04X}", codepoints[0], codepoints[1]);
+            let unescaped = unescape_string(&escaped).unwrap();
+
+            let expected = format!("{}", chr);
+            assert_eq!(expected, unescaped);
+        }
     }
 
     #[test]

From adb5600489acd77e5e9d4314d10c570180de90d2 Mon Sep 17 00:00:00 2001
From: John DiSanti <jdisanti@amazon.com>
Date: Thu, 3 Jun 2021 12:53:03 -0700
Subject: [PATCH 3/3] Add documentation on how to test against JSONTestSuite

---
 rust-runtime/smithy-json/TESTING.md | 115 ++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 rust-runtime/smithy-json/TESTING.md

diff --git a/rust-runtime/smithy-json/TESTING.md b/rust-runtime/smithy-json/TESTING.md
new file mode 100644
index 0000000000..b9ee52667b
--- /dev/null
+++ b/rust-runtime/smithy-json/TESTING.md
@@ -0,0 +1,115 @@
+How to run JSONTestSuite against smithy-json deserialize
+========================================================
+
+When making changes to the `deserialize` module, it is a good idea
+to run the changes against the [JSONTestSuite](https://github.com/nst/JSONTestSuite)
+and manually examine the test results.
+
+### How to set up the JSONTestSuite
+
+1. Clone the [JSONTestSuite](https://github.com/nst/JSONTestSuite) repository.
+2. In `JSONTestSuite/parsers`, create a new Cargo bin project named `test_json-smithy_json`.
+3. Add the following dependency to the `Cargo.toml` (be sure to replace `<local-path-to-smithy-rs>`):
+
+```
+smithy-json = { path = "<local-path-to-smithy-rs>/rust-runtime/smithy-json" }
+```
+
+4. Replace the code in `main.rs` with:
+
+```rust
+use std::fs::File;
+use std::io::Read;
+use std::env;
+
+use smithy_json::deserialize::{json_token_iter, Token, Error};
+
+fn main() {
+    let args: Vec<_> = env::args().collect();
+    if args.len() != 2 {
+        println!("Usage: {} file.json", args[0]);
+        std::process::exit(1);
+    }
+
+    let ref path = args[1];
+    let mut s = String::new();
+    let mut f = File::open(path).expect("Unable to open file");
+    match f.read_to_string(&mut s) {
+        Err(_) => std::process::exit(1),
+        Ok(_) => println!("{}", s),
+    }
+
+    let result: Result<Vec<Token>, Error> = json_token_iter(s.as_bytes()).collect();
+    match result {
+        Err(_) => std::process::exit(1),
+        Ok(value) => if value.is_empty() {
+            std::process::exit(1)
+        } else {
+            // The test suite includes incomplete objects and arrays (i.e., "[null,").
+            // These are completely valid for this parser, so we'll just pretend to have
+            // failed to parse these to satisfy the test suite.
+            if value.first() == Some(&Token::StartObject) && value.last() != Some(&Token::EndObject) {
+                std::process::exit(1)
+            }
+            if value.first() == Some(&Token::StartArray) && value.last() != Some(&Token::EndArray) {
+                std::process::exit(1)
+            }
+            // Unescape all strings and fail if any of them failed to unescape.
+            for token in value {
+                if let Token::ValueString(escaped) = token {
+                    if escaped.into_unescaped().is_err() {
+                        std::process::exit(1)
+                    }
+                }
+            }
+            std::process::exit(0)
+        }
+    }
+}
+```
+
+5. Compile this program with `cargo build --release`.
+6. Modify `JSONTestSuite/run_tests.py` so that the `programs` dictionary only contains this one entry:
+
+```
+programs = {
+   "Rust smithy-json":
+       {
+           "url":"dontcare",
+           "commands":[os.path.join(PARSERS_DIR, "test_json-smithy_json/target/release/test_json-smithy_json")]
+       }
+}
+```
+
+7. Run `run_tests.py` and examine the output with a web browser by opening `JSONTestSuite/results/parsing.html`.
+
+### Examining the results
+
+When looking at `JSONTestSuite/results/parsing.html`, there is a matrix of test cases against their
+results with a legend at the top.
+
+Any test result marked with blue or light blue is for a test case where correct behavior isn't specified,
+so use your best judgement to decide if it should have succeeded or failed.
+
+The other colors are bad and should be carefully examined. At time of writing, the following test cases
+succeed when they should fail, and we intentionally left them that way since stricter number parsing
+is not currently a priority:
+
+```
+n_number_-01.json                           [-01]
+n_number_-2..json                           [-2.]
+n_number_0.e1.json                          [0.e1]
+n_number_2.e+3.json                         [2.e+3]
+n_number_2.e-3.json                         [2.e-3]
+n_number_2.e3.json                          [2.e3]
+n_number_neg_int_starting_with_zero.json    [-012]
+n_number_neg_real_without_int_part.json     [-.123]
+n_number_real_without_fractional_part.json  [1.]
+n_number_with_leading_zero.json             [012]
+```
+
+This test case succeeds with our parser and that's OK since we're
+a token streaming parser (multiple values are allowed):
+```
+n_structure_double_array.json               [][]
+```