From 585f34773d981feb24c761070bfb391ce2bc9579 Mon Sep 17 00:00:00 2001 From: John DiSanti Date: Wed, 2 Jun 2021 13:53:01 -0700 Subject: [PATCH 1/3] Implement JSON token stream deserializer --- rust-runtime/smithy-json/src/deserialize.rs | 850 ++++++++++++++++++++ rust-runtime/smithy-json/src/lib.rs | 1 + 2 files changed, 851 insertions(+) create mode 100644 rust-runtime/smithy-json/src/deserialize.rs diff --git a/rust-runtime/smithy-json/src/deserialize.rs b/rust-runtime/smithy-json/src/deserialize.rs new file mode 100644 index 0000000000..9cfa1ef73a --- /dev/null +++ b/rust-runtime/smithy-json/src/deserialize.rs @@ -0,0 +1,850 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +use smithy_types::Number; +use std::fmt; +use std::str::Utf8Error; + +#[derive(Debug, PartialEq, Eq)] +pub enum ErrorReason { + InvalidUtf8, + InvalidUnicodeEscape(String), + InvalidEscape(char), + InvalidNumber, + ExpectedLiteral(String), + UnexpectedControlCharacter(u8), + UnexpectedToken(char, &'static str), + UnexpectedEOS, +} +use ErrorReason::*; + +#[derive(Debug, PartialEq, Eq)] +pub struct Error { + reason: ErrorReason, + offset: usize, +} + +impl std::error::Error for Error {} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Error at offset {}: ", self.offset)?; + match &self.reason { + InvalidUtf8 => write!(f, "invalid UTF-8 codepoint in JSON stream"), + InvalidUnicodeEscape(escape) => write!(f, "invalid JSON Unicode escape: \\u{}", escape), + InvalidEscape(escape) => write!(f, "invalid JSON escape: \\{}", escape), + InvalidNumber => write!(f, "invalid number"), + ExpectedLiteral(literal) => write!(f, "expected literal: {}", literal), + UnexpectedControlCharacter(value) => write!( + f, + "encountered unescaped control character in string: 0x{:X}", + value + ), + UnexpectedToken(token, expected) => write!( + f, + "unexpected token '{}'. 
Expected one of {}", + token, expected + ), + UnexpectedEOS => write!(f, "unexpected end of stream"), + } + } +} + +impl From for ErrorReason { + fn from(_: Utf8Error) -> Self { + InvalidUtf8 + } +} + +/// Enum representing the different JSON tokens that can be returned by [json_token_iter]. +#[derive(Debug, PartialEq)] +pub enum Token { + StartArray, + EndArray, + ObjectKey(String), + StartObject, + EndObject, + ValueBool(bool), + ValueNull, + ValueNumber(Number), + ValueString(String), +} + +/// Returns an Iterator of `Result` over an slice of bytes. +pub fn json_token_iter(input: &[u8]) -> JsonTokenIterator { + JsonTokenIterator { + input, + index: 0, + state_stack: vec![State::Initial], + } +} + +/// Internal parser state for the iterator. Used to context between successive `next` calls. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum State { + /// Entry point. Expecting any JSON value. + Initial, + /// Expecting the next token to be the *first* value in an array, or the end of the array. + ArrayFirstValueOrEnd, + /// Expecting the next token to the next value in an array, or the end of the array. + ArrayNextValueOrEnd, + /// Expecting the next token to be the *first* key in the object, or the end of the object. + ObjectFirstKeyOrEnd, + /// Expecting the next token to the next object key, or the end of the object. + ObjectNextKeyOrEnd, + /// Expecting the next token to be the value of a field in an object. + ObjectFieldValue, +} + +/// An iterator over a `&[u8]` that yields `Result` with [Token] being JSON tokens. +/// Construct with [json_token_iter]. +pub struct JsonTokenIterator<'a> { + input: &'a [u8], + index: usize, + state_stack: Vec, +} + +impl<'a> JsonTokenIterator<'a> { + /// Previews the next byte. + fn peek_byte(&self) -> Option { + if self.index >= self.input.len() { + None + } else { + Some(self.input[self.index]) + } + } + + /// Expects there to be another byte coming up, and previews it. 
+ /// If there isn't, an `UnexpectedEOS` error is returned. + fn peek_expect(&self) -> Result { + self.peek_byte().ok_or_else(|| self.error(UnexpectedEOS)) + } + + /// Advances to the next byte in the stream. + fn advance(&mut self) { + if self.index < self.input.len() { + self.index += 1; + } + } + + /// Advances and returns the next byte in the stream. + fn next_byte(&mut self) -> Option { + let next = self.peek_byte(); + self.advance(); + next + } + + /// Expects there to be another byte coming up, and returns it while advancing. + /// If there isn't, an `UnexpectedEOS` error is returned. + fn next_expect(&mut self) -> Result { + self.next_byte().ok_or_else(|| self.error(UnexpectedEOS)) + } + + /// Creates an error at the given `offset` in the stream. + fn error_at(&self, offset: usize, reason: ErrorReason) -> Error { + Error { reason, offset } + } + + /// Creates an error at the current offset in the stream. + fn error(&self, reason: ErrorReason) -> Error { + self.error_at(self.index, reason) + } + + /// Advances until it hits a non-whitespace character or the end of the slice. + fn discard_whitespace(&mut self) { + while let Some(byte) = self.peek_byte() { + match byte { + b' ' | b'\t' | b'\r' | b'\n' => { + self.advance(); + } + _ => break, + } + } + } + + /// Returns the top of the state stack (current state). + fn state(&self) -> State { + self.state_stack[self.state_stack.len() - 1] + } + + /// Replaces the top of the state stack with a new `state`. + fn replace_state(&mut self, state: State) { + self.state_stack.pop(); + self.state_stack.push(state); + } + + /// Discards the '{' character and pushes the `ObjectFirstKeyOrEnd` state. + fn start_object(&mut self) -> Token { + let byte = self.next_byte(); + debug_assert_eq!(byte, Some(b'{')); + self.state_stack.push(State::ObjectFirstKeyOrEnd); + Token::StartObject + } + + /// Discards the '}' character and pops the current state. 
+ fn end_object(&mut self) -> Token { + let (byte, state) = (self.next_byte(), self.state_stack.pop()); + debug_assert_eq!(byte, Some(b'}')); + debug_assert!( + state == Some(State::ObjectFirstKeyOrEnd) || state == Some(State::ObjectNextKeyOrEnd) + ); + Token::EndObject + } + + /// Discards the '[' character and pushes the `ArrayFirstValueOrEnd` state. + fn start_array(&mut self) -> Token { + let byte = self.next_byte(); + debug_assert_eq!(byte, Some(b'[')); + self.state_stack.push(State::ArrayFirstValueOrEnd); + Token::StartArray + } + + /// Discards the ']' character and pops the current state. + fn end_array(&mut self) -> Token { + let (byte, state) = (self.next_byte(), self.state_stack.pop()); + debug_assert_eq!(byte, Some(b']')); + debug_assert!( + state == Some(State::ArrayFirstValueOrEnd) || state == Some(State::ArrayNextValueOrEnd) + ); + Token::EndArray + } + + /// Reads a JSON Unicode escape sequence (i.e., "\u1234"). + fn read_unicode_escape(&mut self, into: &mut Vec) -> Result<(), Error> { + let (start, end) = (self.index, self.index + 4); + if end > self.input.len() { + return Err(self.error(UnexpectedEOS)); + } + + let codepoint_str = + std::str::from_utf8(&self.input[start..end]).map_err(|err| self.error(err.into()))?; + let codepoint = u32::from_str_radix(codepoint_str, 16) + .map_err(|_| self.error(ErrorReason::InvalidUnicodeEscape(codepoint_str.into())))?; + let codepoint = char::from_u32(codepoint) + .ok_or_else(|| self.error(InvalidUnicodeEscape(codepoint_str.into())))?; + match codepoint.len_utf8() { + 1 => into.push(codepoint as u8), + _ => into.extend_from_slice(codepoint.encode_utf8(&mut [0; 4]).as_bytes()), + } + self.index = end; + Ok(()) + } + + /// Reads a JSON string out of the stream. 
+ fn read_string(&mut self) -> Result { + // Skip the starting quote + let quote_byte = self.next_byte(); + debug_assert_eq!(quote_byte, Some(b'\"')); + + // Read bytes until a non-escaped end-quote, unescaping sequences as needed on the fly + let mut value = Vec::new(); + loop { + match self.next_expect()? { + b'"' => return String::from_utf8(value).map_err(|_| self.error(InvalidUtf8)), + b'\\' => match self.next_expect()? { + b'\\' => value.push(b'\\'), + b'/' => value.push(b'/'), + b'"' => value.push(b'"'), + b'b' => value.push(0x08), + b'f' => value.push(0x0C), + b'n' => value.push(b'\n'), + b'r' => value.push(b'\r'), + b't' => value.push(b'\t'), + b'u' => self.read_unicode_escape(&mut value)?, + byte => return Err(self.error(InvalidEscape(byte.into()))), + }, + byte @ 0x00..=0x1F => return Err(self.error(UnexpectedControlCharacter(byte))), + byte => value.push(byte), + } + } + } + + /// Expects the given literal to be next in the stream. + fn expect_literal(&mut self, expected: &[u8]) -> Result<(), Error> { + let (start, end) = (self.index, self.index + expected.len()); + if end > self.input.len() { + return Err(self.error_at(self.input.len(), UnexpectedEOS)); + } + if expected != &self.input[start..end] { + return Err(self.error_at( + start, + ExpectedLiteral(std::str::from_utf8(expected).unwrap().into()), + )); + } + self.index = end; + Ok(()) + } + + /// Expects a literal `null` next in the stream. + fn expect_null(&mut self) -> Result { + self.expect_literal(b"null")?; + Ok(Token::ValueNull) + } + + /// Expects a boolean `true` / `false` to be next in the stream and returns its value. + fn expect_bool(&mut self) -> Result { + match self.peek_expect()? { + b't' => { + self.expect_literal(b"true")?; + Ok(Token::ValueBool(true)) + } + b'f' => { + self.expect_literal(b"false")?; + Ok(Token::ValueBool(false)) + } + _ => unreachable!(), + } + } + + /// Advances passed the exponent part of a floating point number. 
+ fn skip_exponent(&mut self) { + self.advance(); + match self.peek_byte() { + Some(b'-') => self.advance(), + Some(b'+') => self.advance(), + _ => {} + } + while let Some(b'0'..=b'9') = self.peek_byte() { + self.advance(); + } + } + + /// Advances passed the decimal part of a floating point number. + fn skip_decimal(&mut self) { + self.advance(); + while let Some(byte) = self.peek_byte() { + match byte { + b'0'..=b'9' => self.advance(), + b'e' | b'E' => self.skip_exponent(), + _ => break, + } + } + } + + /// Starting from the current location in the stream, this advances until + /// it finds a character that doesn't look like its part of a number, and then + /// returns `(start_index, end_index, negative, floating)`, with `start_index` + /// and `end_index` representing the slice of the stream that is the number, + /// `negative` whether or not it is a negative number, and `floating` whether or not + /// it is a floating point number. + fn scan_number(&mut self) -> (usize, usize, bool, bool) { + let start_index = self.index; + let negative = if self.peek_byte() == Some(b'-') { + self.advance(); + true + } else { + false + }; + let mut floating = false; + while let Some(byte) = self.peek_byte() { + match byte { + b'0'..=b'9' => self.advance(), + b'.' => { + floating = true; + self.skip_decimal(); + } + b'e' | b'E' => { + floating = true; + self.skip_exponent(); + } + _ => break, + } + } + (start_index, self.index, negative, floating) + } + + /// Expects a number in the stream, and returns its value. 
+ fn expect_number(&mut self) -> Result { + let (start, end, negative, floating) = self.scan_number(); + let number_slice = &self.input[start..end]; + + // Unsafe: we examined every character in the range, and they are all number characters + debug_assert!(std::str::from_utf8(number_slice).is_ok()); + let number_str = unsafe { std::str::from_utf8_unchecked(number_slice) }; + + use std::str::FromStr; + Ok(Token::ValueNumber(if floating { + Number::Float( + f64::from_str(&number_str).map_err(|_| self.error_at(start, InvalidNumber))?, + ) + } else if negative { + // If the negative value overflows, then stuff it into an f64 + let positive = + u64::from_str(&number_str[1..]).map_err(|_| self.error_at(start, InvalidNumber))?; + let negative = positive.wrapping_neg() as i64; + if negative > 0 { + Number::Float(-(positive as f64)) + } else { + Number::NegInt(negative as i64) + } + } else { + Number::PosInt( + u64::from_str(&number_str).map_err(|_| self.error_at(start, InvalidNumber))?, + ) + })) + } + + /// Reads a value from the stream and returns the next token. For objects and arrays, + /// the entire object or array will not be ready, but rather, a [StartObject]/[StartArray] + /// will be returned. + fn read_value(&mut self) -> Result { + self.discard_whitespace(); + match self.peek_expect()? 
{ + b'{' => Ok(self.start_object()), + b'[' => Ok(self.start_array()), + b'"' => self.read_string().map(Token::ValueString), + byte => { + let value = match byte { + b'n' => self.expect_null(), + b't' | b'f' => self.expect_bool(), + b'-' | (b'0'..=b'9') => self.expect_number(), + byte => Err(self.error(UnexpectedToken( + byte.into(), + "'{', '[', '\"', 'null', 'true', 'false', ", + ))), + }?; + // Verify there are no unexpected trailers on the end of the value + if let Some(byte) = self.peek_byte() { + match byte { + b' ' | b'\t' | b'\r' | b'\n' | b'}' | b']' | b',' => {} + _ => { + return Err(self.error(UnexpectedToken( + byte.into(), + ", '}', ']', ','", + ))) + } + } + } + Ok(value) + } + } + } + + /// Handles the [ArrayFirstValueOrEnd] state. + fn state_array_first_value_or_end(&mut self) -> Result { + match self.peek_expect()? { + b']' => Ok(self.end_array()), + _ => { + self.replace_state(State::ArrayNextValueOrEnd); + self.read_value() + } + } + } + + /// Handles the [ArrayNextValueOrEnd] state. + fn state_array_next_value_or_end(&mut self) -> Result { + match self.peek_expect()? { + b']' => Ok(self.end_array()), + b',' => { + self.advance(); + self.read_value() + } + byte => Err(self.error(UnexpectedToken(byte.into(), "']', ','"))), + } + } + + /// Expects an object key. + fn object_key(&mut self) -> Result { + match self.peek_expect()? { + b'"' => { + self.replace_state(State::ObjectFieldValue); + self.read_string().map(Token::ObjectKey) + } + byte => Err(self.error(UnexpectedToken(byte.into(), "'\"'"))), + } + } + + /// Handles the [ObjectFirstKeyOrEnd] state. + fn state_object_first_key_or_end(&mut self) -> Result { + match self.peek_expect()? { + b'}' => Ok(self.end_object()), + _ => self.object_key(), + } + } + + /// Handles the [ObjectNextKeyOrEnd] state. + fn state_object_next_key_or_end(&mut self) -> Result { + match self.peek_expect()? 
{ + b'}' => Ok(self.end_object()), + b',' => { + self.advance(); + self.discard_whitespace(); + self.object_key() + } + byte => Err(self.error(UnexpectedToken(byte.into(), "'}', ','"))), + } + } + + /// Handles the [ObjectFieldValue] state. + fn state_object_field_value(&mut self) -> Result { + match self.peek_expect()? { + b':' => { + self.advance(); + self.replace_state(State::ObjectNextKeyOrEnd); + self.read_value() + } + byte => Err(self.error(UnexpectedToken(byte.into(), "':'"))), + } + } +} + +impl<'a> Iterator for JsonTokenIterator<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + debug_assert!(self.index <= self.input.len()); + if self.index == self.input.len() { + return None; + } + + self.discard_whitespace(); + let result = match self.state() { + State::Initial => self.peek_byte().map(|_| self.read_value()), + State::ArrayFirstValueOrEnd => Some(self.state_array_first_value_or_end()), + State::ArrayNextValueOrEnd => Some(self.state_array_next_value_or_end()), + State::ObjectFirstKeyOrEnd => Some(self.state_object_first_key_or_end()), + State::ObjectNextKeyOrEnd => Some(self.state_object_next_key_or_end()), + State::ObjectFieldValue => Some(self.state_object_field_value()), + }; + // Invalidate the stream if we encountered an error + if result.as_ref().map(|r| r.is_err()).unwrap_or(false) { + self.index = self.input.len(); + } + result + } +} + +#[cfg(test)] +mod tests { + use crate::deserialize::{json_token_iter, Error, ErrorReason, Token}; + use crate::escape::escape_string; + use proptest::prelude::*; + use smithy_types::Number; + + #[test] + fn test_empty() { + assert_eq!(None, json_token_iter(b"").next()); + assert_eq!(None, json_token_iter(b" ").next()); + assert_eq!(None, json_token_iter(b"\t").next()); + } + + #[test] + fn test_empty_string() { + let mut iter = json_token_iter(b"\"\""); + assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next()); + assert_eq!(None, iter.next()); + + let mut iter = json_token_iter(b" \r\n\t 
\"\" "); + assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_empty_array() { + let mut iter = json_token_iter(b"[]"); + assert_eq!(Some(Ok(Token::StartArray)), iter.next()); + assert_eq!(Some(Ok(Token::EndArray)), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_empty_object() { + let mut iter = json_token_iter(b"{}"); + assert_eq!(Some(Ok(Token::StartObject)), iter.next()); + assert_eq!(Some(Ok(Token::EndObject)), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_null() { + assert_eq!( + Some(Ok(Token::ValueNull)), + json_token_iter(b" null ").next() + ); + + let tokens: Result, Error> = json_token_iter(b"[null, null,null]").collect(); + assert_eq!( + vec![ + Token::StartArray, + Token::ValueNull, + Token::ValueNull, + Token::ValueNull, + Token::EndArray + ], + tokens.unwrap() + ); + + assert!(json_token_iter(b"n").next().unwrap().is_err()); + assert!(json_token_iter(b"nul").next().unwrap().is_err()); + assert!(json_token_iter(b"nulll").next().unwrap().is_err()); + } + + #[test] + fn test_bools() { + assert!(json_token_iter(b"tru").next().unwrap().is_err()); + assert!(json_token_iter(b"truee").next().unwrap().is_err()); + assert!(json_token_iter(b"f").next().unwrap().is_err()); + assert!(json_token_iter(b"falsee").next().unwrap().is_err()); + assert_eq!( + Some(Ok(Token::ValueBool(true))), + json_token_iter(b" true ").next() + ); + assert_eq!( + Some(Ok(Token::ValueBool(false))), + json_token_iter(b"false").next() + ); + + let tokens: Result, Error> = json_token_iter(b"[true,false]").collect(); + assert_eq!( + vec![ + Token::StartArray, + Token::ValueBool(true), + Token::ValueBool(false), + Token::EndArray + ], + tokens.unwrap() + ); + } + + proptest! 
{ + #[test] + fn string_prop_test(input in ".*") { + let json = format!("\"{}\"", escape_string(&input)); + + let serde_string: String = serde_json::from_str(&json).unwrap(); + + let mut iter = json_token_iter(json.as_bytes()); + assert_eq!(Some(Ok(Token::ValueString(serde_string))), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn integer_prop_test(input: i64) { + let json = serde_json::to_string(&input).unwrap(); + let mut iter = json_token_iter(json.as_bytes()); + assert_eq!(Some(Ok(Token::ValueNumber( + if input < 0 { + Number::NegInt(input) + } else { + Number::PosInt(input as u64) + }))), iter.next()); + assert_eq!(None, iter.next()); + } + + #[test] + fn float_prop_test(input: f64) { + let json = serde_json::to_string(&input).unwrap(); + let mut iter = json_token_iter(json.as_bytes()); + assert_eq!(Some(Ok(Token::ValueNumber(Number::Float(input)))), iter.next()); + assert_eq!(None, iter.next()); + } + } + + #[test] + fn valid_numbers() { + let expect = |number, input| { + assert_eq!( + Token::ValueNumber(number), + json_token_iter(input).next().unwrap().unwrap() + ); + }; + expect(Number::Float(0.0), b"0."); + expect(Number::Float(0.0), b"0e0"); + expect(Number::Float(0.0), b"0E0"); + expect(Number::Float(10.0), b"1E1"); + expect(Number::Float(10.0), b"1E+1"); + expect(Number::Float(100.0), b"1e+2"); + + expect(Number::NegInt(-50000), b"-50000"); + expect( + Number::Float(-18446744073709551615.0), + b"-18446744073709551615", + ); + } + + // These cases actually shouldn't parse according to the spec, but it's easier + // to be lenient on these, and it doesn't really impact the SDK use-case. 
+ #[test] + fn invalid_numbers_we_are_intentionally_accepting() { + let expect = |number, input| { + assert_eq!( + Token::ValueNumber(number), + json_token_iter(input).next().unwrap().unwrap() + ); + }; + + expect(Number::NegInt(-1), b"-01"); + expect(Number::Float(-2.0), b"-2."); + expect(Number::Float(0.0), b"0.e1"); + expect(Number::Float(0.002), b"2.e-3"); + expect(Number::Float(2000.0), b"2.e3"); + expect(Number::NegInt(-12), b"-012"); + expect(Number::Float(-0.123), b"-.123"); + expect(Number::Float(1.0), b"1."); + expect(Number::PosInt(12), b"012"); + } + + #[test] + fn invalid_numbers() { + let unexpected_token = |input, token, offset, msg| { + let tokens: Vec> = json_token_iter(input).collect(); + assert_eq!( + vec![Err(Error { + reason: ErrorReason::UnexpectedToken(token, msg), + offset + }),], + tokens, + "input: \"{}\"", + std::str::from_utf8(input).unwrap(), + ); + }; + + let invalid_number = |input, offset| { + let tokens: Vec> = json_token_iter(input).collect(); + assert_eq!( + vec![Err(Error { + reason: ErrorReason::InvalidNumber, + offset + })], + tokens, + "input: \"{}\"", + std::str::from_utf8(input).unwrap(), + ); + }; + + let unexpected_trailer = ", '}', ']', ','"; + let unexpected_start = "'{', '[', '\"', 'null', 'true', 'false', "; + + unexpected_token(b".", '.', 0, unexpected_start); + unexpected_token(b".0", '.', 0, unexpected_start); + unexpected_token(b"0-05", '-', 1, unexpected_trailer); + unexpected_token(b"0x05", 'x', 1, unexpected_trailer); + unexpected_token(b"123.invalid", 'i', 4, unexpected_trailer); + unexpected_token(b"123invalid", 'i', 3, unexpected_trailer); + unexpected_token(b"asdf", 'a', 0, unexpected_start); + + invalid_number(b"-a", 0); + invalid_number(b"1e", 0); + invalid_number(b"1e-", 0); + + // Number parsing fails before it even looks at the trailer because of invalid exponent + invalid_number(b"123.0Einvalid", 0); + } + + #[test] + fn test_unclosed_array() { + let mut iter = json_token_iter(br#" [null "#); + 
assert_eq!(Some(Ok(Token::StartArray)), iter.next()); + assert_eq!(Some(Ok(Token::ValueNull)), iter.next()); + assert_eq!( + Some(Err(Error { + reason: ErrorReason::UnexpectedEOS, + offset: 7 + })), + iter.next() + ); + } + + #[test] + fn test_array_with_items() { + let tokens: Result, Error> = json_token_iter(b"[[], {}, \"test\"]").collect(); + assert_eq!( + vec![ + Token::StartArray, + Token::StartArray, + Token::EndArray, + Token::StartObject, + Token::EndObject, + Token::ValueString("test".into()), + Token::EndArray, + ], + tokens.unwrap() + ) + } + + #[test] + fn test_object_with_items() { + let tokens: Result, Error> = json_token_iter( + br#" + { "some_int": 5, + "some_float": 5.2, + "some_negative": -5, + "some_negative_float": -2.4, + "some_string": "test", + "some_struct": { "nested": "asdf" }, + "some_array": ["one", "two"] } + "#, + ) + .collect(); + assert_eq!( + vec![ + Token::StartObject, + Token::ObjectKey("some_int".into()), + Token::ValueNumber(Number::PosInt(5)), + Token::ObjectKey("some_float".into()), + Token::ValueNumber(Number::Float(5.2)), + Token::ObjectKey("some_negative".into()), + Token::ValueNumber(Number::NegInt(-5)), + Token::ObjectKey("some_negative_float".into()), + Token::ValueNumber(Number::Float(-2.4)), + Token::ObjectKey("some_string".into()), + Token::ValueString("test".into()), + Token::ObjectKey("some_struct".into()), + Token::StartObject, + Token::ObjectKey("nested".into()), + Token::ValueString("asdf".into()), + Token::EndObject, + Token::ObjectKey("some_array".into()), + Token::StartArray, + Token::ValueString("one".into()), + Token::ValueString("two".into()), + Token::EndArray, + Token::EndObject, + ], + tokens.unwrap() + ) + } + + #[test] + fn test_object_trailing_comma() { + let mut iter = json_token_iter(br#" { "test": "trailing", } "#); + assert_eq!(Some(Ok(Token::StartObject)), iter.next()); + assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next()); + 
assert_eq!(Some(Ok(Token::ValueString("trailing".into()))), iter.next()); + assert_eq!( + Some(Err(Error { + reason: ErrorReason::UnexpectedToken('}', "'\"'"), + offset: 23, + })), + iter.next() + ); + assert_eq!(None, iter.next()); + } + + #[test] + fn test_object_no_colon() { + let mut iter = json_token_iter(br#" {"test" "#); + assert_eq!(Some(Ok(Token::StartObject)), iter.next()); + assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next()); + assert_eq!( + Some(Err(Error { + reason: ErrorReason::UnexpectedEOS, + offset: 9, + })), + iter.next() + ); + assert_eq!(None, iter.next()); + } + + #[test] + fn unescaped_ctrl_characters() { + assert!(json_token_iter(b"\"test\x00test\"") + .next() + .unwrap() + .is_err()); + assert!(json_token_iter(b"\"test\ntest\"").next().unwrap().is_err()); + assert!(json_token_iter(b"\"test\ttest\"").next().unwrap().is_err()); + } +} diff --git a/rust-runtime/smithy-json/src/lib.rs b/rust-runtime/smithy-json/src/lib.rs index de371590ee..538378e046 100644 --- a/rust-runtime/smithy-json/src/lib.rs +++ b/rust-runtime/smithy-json/src/lib.rs @@ -5,5 +5,6 @@ //! JSON Abstractions for Smithy +pub mod deserialize; mod escape; pub mod serialize; From 0a2c800575e856bcf66f5132e613a930e1374cac Mon Sep 17 00:00:00 2001 From: John DiSanti Date: Thu, 3 Jun 2021 12:17:53 -0700 Subject: [PATCH 2/3] Stop allocating string values and fix surrogate pair unescaping --- rust-runtime/smithy-json/src/deserialize.rs | 177 ++++++++-------- rust-runtime/smithy-json/src/escape.rs | 213 ++++++++++++++++++++ 2 files changed, 309 insertions(+), 81 deletions(-) diff --git a/rust-runtime/smithy-json/src/deserialize.rs b/rust-runtime/smithy-json/src/deserialize.rs index 9cfa1ef73a..03906f8ed5 100644 --- a/rust-runtime/smithy-json/src/deserialize.rs +++ b/rust-runtime/smithy-json/src/deserialize.rs @@ -3,14 +3,17 @@ * SPDX-License-Identifier: Apache-2.0. 
*/ +use crate::escape::unescape_string; use smithy_types::Number; +use std::borrow::Cow; use std::fmt; use std::str::Utf8Error; +pub use crate::escape::Error as EscapeError; + #[derive(Debug, PartialEq, Eq)] pub enum ErrorReason { InvalidUtf8, - InvalidUnicodeEscape(String), InvalidEscape(char), InvalidNumber, ExpectedLiteral(String), @@ -33,7 +36,6 @@ impl fmt::Display for Error { write!(f, "Error at offset {}: ", self.offset)?; match &self.reason { InvalidUtf8 => write!(f, "invalid UTF-8 codepoint in JSON stream"), - InvalidUnicodeEscape(escape) => write!(f, "invalid JSON Unicode escape: \\u{}", escape), InvalidEscape(escape) => write!(f, "invalid JSON escape: \\{}", escape), InvalidNumber => write!(f, "invalid number"), ExpectedLiteral(literal) => write!(f, "expected literal: {}", literal), @@ -58,18 +60,40 @@ impl From for ErrorReason { } } +/// New-type around `&str` that indicates the string is an escaped JSON string. +/// Provides functions for retrieving the string in either form. +#[derive(Debug, PartialEq, Eq)] +pub struct EscapedStr<'a>(&'a str); + +impl<'a> EscapedStr<'a> { + pub fn new(value: &'a str) -> EscapedStr<'a> { + EscapedStr(value) + } + + /// Returns the escaped string value + pub fn as_escaped_str(&self) -> &str { + self.0 + } + + /// Consumes self and returns the string unescaped. + /// If the string doesn't need unescaping, it will be returned directly. + pub fn into_unescaped(self) -> Result, EscapeError> { + unescape_string(self.0) + } +} + /// Enum representing the different JSON tokens that can be returned by [json_token_iter]. #[derive(Debug, PartialEq)] -pub enum Token { +pub enum Token<'a> { StartArray, EndArray, - ObjectKey(String), + ObjectKey(EscapedStr<'a>), StartObject, EndObject, ValueBool(bool), ValueNull, ValueNumber(Number), - ValueString(String), + ValueString(EscapedStr<'a>), } /// Returns an Iterator of `Result` over an slice of bytes. 
@@ -176,7 +200,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Discards the '{' character and pushes the `ObjectFirstKeyOrEnd` state. - fn start_object(&mut self) -> Token { + fn start_object(&mut self) -> Token<'a> { let byte = self.next_byte(); debug_assert_eq!(byte, Some(b'{')); self.state_stack.push(State::ObjectFirstKeyOrEnd); @@ -184,7 +208,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Discards the '}' character and pops the current state. - fn end_object(&mut self) -> Token { + fn end_object(&mut self) -> Token<'a> { let (byte, state) = (self.next_byte(), self.state_stack.pop()); debug_assert_eq!(byte, Some(b'}')); debug_assert!( @@ -194,7 +218,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Discards the '[' character and pushes the `ArrayFirstValueOrEnd` state. - fn start_array(&mut self) -> Token { + fn start_array(&mut self) -> Token<'a> { let byte = self.next_byte(); debug_assert_eq!(byte, Some(b'[')); self.state_stack.push(State::ArrayFirstValueOrEnd); @@ -202,7 +226,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Discards the ']' character and pops the current state. - fn end_array(&mut self) -> Token { + fn end_array(&mut self) -> Token<'a> { let (byte, state) = (self.next_byte(), self.state_stack.pop()); debug_assert_eq!(byte, Some(b']')); debug_assert!( @@ -211,52 +235,34 @@ impl<'a> JsonTokenIterator<'a> { Token::EndArray } - /// Reads a JSON Unicode escape sequence (i.e., "\u1234"). 
- fn read_unicode_escape(&mut self, into: &mut Vec<u8>) -> Result<(), Error> { - let (start, end) = (self.index, self.index + 4); - if end > self.input.len() { - return Err(self.error(UnexpectedEOS)); - } - - let codepoint_str = - std::str::from_utf8(&self.input[start..end]).map_err(|err| self.error(err.into()))?; - let codepoint = u32::from_str_radix(codepoint_str, 16) - .map_err(|_| self.error(ErrorReason::InvalidUnicodeEscape(codepoint_str.into())))?; - let codepoint = char::from_u32(codepoint) - .ok_or_else(|| self.error(InvalidUnicodeEscape(codepoint_str.into())))?; - match codepoint.len_utf8() { - 1 => into.push(codepoint as u8), - _ => into.extend_from_slice(codepoint.encode_utf8(&mut [0; 4]).as_bytes()), - } - self.index = end; - Ok(()) - } - /// Reads a JSON string out of the stream. - fn read_string(&mut self) -> Result<String, Error> { + fn read_string(&mut self) -> Result<&'a str, Error> { // Skip the starting quote let quote_byte = self.next_byte(); debug_assert_eq!(quote_byte, Some(b'\"')); // Read bytes until a non-escaped end-quote, unescaping sequences as needed on the fly - let mut value = Vec::new(); + let start = self.index; loop { - match self.next_expect()? { - b'"' => return String::from_utf8(value).map_err(|_| self.error(InvalidUtf8)), + match self.peek_expect()? { + b'"' => { + let value = std::str::from_utf8(&self.input[start..self.index]) + .map_err(|_| self.error(InvalidUtf8))?; + self.advance(); + return Ok(value); + } b'\\' => match self.next_expect()?
{ - b'\\' => value.push(b'\\'), - b'/' => value.push(b'/'), - b'"' => value.push(b'"'), - b'b' => value.push(0x08), - b'f' => value.push(0x0C), - b'n' => value.push(b'\n'), - b'r' => value.push(b'\r'), - b't' => value.push(b'\t'), - b'u' => self.read_unicode_escape(&mut value)?, + b'\\' | b'/' | b'"' | b'b' | b'f' | b'n' | b'r' | b't' => self.advance(), + b'u' => { + if self.index + 4 > self.input.len() { + return Err(self.error_at(self.input.len(), UnexpectedEOS)); + } + self.index += 4; + } byte => return Err(self.error(InvalidEscape(byte.into()))), }, byte @ 0x00..=0x1F => return Err(self.error(UnexpectedControlCharacter(byte))), - byte => value.push(byte), + _ => self.advance(), } } } @@ -278,13 +284,13 @@ impl<'a> JsonTokenIterator<'a> { } /// Expects a literal `null` next in the stream. - fn expect_null(&mut self) -> Result<Token, Error> { + fn expect_null(&mut self) -> Result<Token<'a>, Error> { self.expect_literal(b"null")?; Ok(Token::ValueNull) } /// Expects a boolean `true` / `false` to be next in the stream and returns its value. - fn expect_bool(&mut self) -> Result<Token, Error> { + fn expect_bool(&mut self) -> Result<Token<'a>, Error> { match self.peek_expect()? { b't' => { self.expect_literal(b"true")?; @@ -356,7 +362,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Expects a number in the stream, and returns its value. - fn expect_number(&mut self) -> Result<Token, Error> { + fn expect_number(&mut self) -> Result<Token<'a>, Error> { let (start, end, negative, floating) = self.scan_number(); let number_slice = &self.input[start..end]; @@ -389,12 +395,14 @@ impl<'a> JsonTokenIterator<'a> { /// Reads a value from the stream and returns the next token. For objects and arrays, /// the entire object or array will not be ready, but rather, a [StartObject]/[StartArray] /// will be returned. - fn read_value(&mut self) -> Result<Token, Error> { + fn read_value(&mut self) -> Result<Token<'a>, Error> { self.discard_whitespace(); match self.peek_expect()?
{ b'{' => Ok(self.start_object()), b'[' => Ok(self.start_array()), - b'"' => self.read_string().map(Token::ValueString), + b'"' => self + .read_string() + .map(|s| Token::ValueString(EscapedStr(s))), byte => { let value = match byte { b'n' => self.expect_null(), @@ -423,7 +431,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Handles the [ArrayFirstValueOrEnd] state. - fn state_array_first_value_or_end(&mut self) -> Result<Token, Error> { + fn state_array_first_value_or_end(&mut self) -> Result<Token<'a>, Error> { match self.peek_expect()? { b']' => Ok(self.end_array()), _ => { @@ -434,7 +442,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Handles the [ArrayNextValueOrEnd] state. - fn state_array_next_value_or_end(&mut self) -> Result<Token, Error> { + fn state_array_next_value_or_end(&mut self) -> Result<Token<'a>, Error> { match self.peek_expect()? { b']' => Ok(self.end_array()), b',' => { @@ -446,18 +454,18 @@ impl<'a> JsonTokenIterator<'a> { } /// Expects an object key. - fn object_key(&mut self) -> Result<Token, Error> { + fn object_key(&mut self) -> Result<Token<'a>, Error> { match self.peek_expect()? { b'"' => { self.replace_state(State::ObjectFieldValue); - self.read_string().map(Token::ObjectKey) + self.read_string().map(|s| Token::ObjectKey(EscapedStr(s))) } byte => Err(self.error(UnexpectedToken(byte.into(), "'\"'"))), } } /// Handles the [ObjectFirstKeyOrEnd] state. - fn state_object_first_key_or_end(&mut self) -> Result<Token, Error> { + fn state_object_first_key_or_end(&mut self) -> Result<Token<'a>, Error> { match self.peek_expect()? { b'}' => Ok(self.end_object()), _ => self.object_key(), @@ -465,7 +473,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Handles the [ObjectNextKeyOrEnd] state. - fn state_object_next_key_or_end(&mut self) -> Result<Token, Error> { + fn state_object_next_key_or_end(&mut self) -> Result<Token<'a>, Error> { match self.peek_expect()? { b'}' => Ok(self.end_object()), b',' => { @@ -478,7 +486,7 @@ impl<'a> JsonTokenIterator<'a> { } /// Handles the [ObjectFieldValue] state.
- fn state_object_field_value(&mut self) -> Result<Token, Error> { + fn state_object_field_value(&mut self) -> Result<Token<'a>, Error> { match self.peek_expect()? { b':' => { self.advance(); @@ -491,7 +499,7 @@ impl<'a> JsonTokenIterator<'a> { } impl<'a> Iterator for JsonTokenIterator<'a> { - type Item = Result<Token, Error>; + type Item = Result<Token<'a>, Error>; fn next(&mut self) -> Option<Self::Item> { debug_assert!(self.index <= self.input.len()); @@ -518,8 +526,7 @@ impl<'a> Iterator for JsonTokenIterator<'a> { #[cfg(test)] mod tests { - use crate::deserialize::{json_token_iter, Error, ErrorReason, Token}; - use crate::escape::escape_string; + use crate::deserialize::{json_token_iter, Error, ErrorReason, EscapedStr, Token}; use proptest::prelude::*; use smithy_types::Number; @@ -533,11 +540,11 @@ mod tests { #[test] fn test_empty_string() { let mut iter = json_token_iter(b"\"\""); - assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next()); + assert_eq!(Some(Ok(Token::ValueString(EscapedStr("")))), iter.next()); assert_eq!(None, iter.next()); let mut iter = json_token_iter(b" \r\n\t \"\" "); - assert_eq!(Some(Ok(Token::ValueString("".into()))), iter.next()); + assert_eq!(Some(Ok(Token::ValueString(EscapedStr("")))), iter.next()); assert_eq!(None, iter.next()); } @@ -611,12 +618,10 @@ mod tests { proptest!
{ #[test] fn string_prop_test(input in ".*") { - let json = format!("\"{}\"", escape_string(&input)); - - let serde_string: String = serde_json::from_str(&json).unwrap(); + let json: String = serde_json::to_string(&input).unwrap(); let mut iter = json_token_iter(json.as_bytes()); - assert_eq!(Some(Ok(Token::ValueString(serde_string))), iter.next()); + assert_eq!(Some(Ok(Token::ValueString(EscapedStr(&json[1..(json.len()-1)])))), iter.next()); assert_eq!(None, iter.next()); } @@ -757,7 +762,7 @@ mod tests { Token::EndArray, Token::StartObject, Token::EndObject, - Token::ValueString("test".into()), + Token::ValueString(EscapedStr("test")), Token::EndArray, ], tokens.unwrap() @@ -781,25 +786,25 @@ mod tests { assert_eq!( vec![ Token::StartObject, - Token::ObjectKey("some_int".into()), + Token::ObjectKey(EscapedStr("some_int")), Token::ValueNumber(Number::PosInt(5)), - Token::ObjectKey("some_float".into()), + Token::ObjectKey(EscapedStr("some_float")), Token::ValueNumber(Number::Float(5.2)), - Token::ObjectKey("some_negative".into()), + Token::ObjectKey(EscapedStr("some_negative")), Token::ValueNumber(Number::NegInt(-5)), - Token::ObjectKey("some_negative_float".into()), + Token::ObjectKey(EscapedStr("some_negative_float")), Token::ValueNumber(Number::Float(-2.4)), - Token::ObjectKey("some_string".into()), - Token::ValueString("test".into()), - Token::ObjectKey("some_struct".into()), + Token::ObjectKey(EscapedStr("some_string")), + Token::ValueString(EscapedStr("test")), + Token::ObjectKey(EscapedStr("some_struct")), Token::StartObject, - Token::ObjectKey("nested".into()), - Token::ValueString("asdf".into()), + Token::ObjectKey(EscapedStr("nested")), + Token::ValueString(EscapedStr("asdf")), Token::EndObject, - Token::ObjectKey("some_array".into()), + Token::ObjectKey(EscapedStr("some_array")), Token::StartArray, - Token::ValueString("one".into()), - Token::ValueString("two".into()), + Token::ValueString(EscapedStr("one")), + Token::ValueString(EscapedStr("two")), 
Token::EndArray, Token::EndObject, ], @@ -811,8 +816,11 @@ mod tests { fn test_object_trailing_comma() { let mut iter = json_token_iter(br#" { "test": "trailing", } "#); assert_eq!(Some(Ok(Token::StartObject)), iter.next()); - assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next()); - assert_eq!(Some(Ok(Token::ValueString("trailing".into()))), iter.next()); + assert_eq!(Some(Ok(Token::ObjectKey(EscapedStr("test")))), iter.next()); + assert_eq!( + Some(Ok(Token::ValueString(EscapedStr("trailing")))), + iter.next() + ); assert_eq!( Some(Err(Error { reason: ErrorReason::UnexpectedToken('}', "'\"'"), @@ -827,7 +835,7 @@ mod tests { fn test_object_no_colon() { let mut iter = json_token_iter(br#" {"test" "#); assert_eq!(Some(Ok(Token::StartObject)), iter.next()); - assert_eq!(Some(Ok(Token::ObjectKey("test".into()))), iter.next()); + assert_eq!(Some(Ok(Token::ObjectKey(EscapedStr("test")))), iter.next()); assert_eq!( Some(Err(Error { reason: ErrorReason::UnexpectedEOS, @@ -847,4 +855,11 @@ mod tests { assert!(json_token_iter(b"\"test\ntest\"").next().unwrap().is_err()); assert!(json_token_iter(b"\"test\ttest\"").next().unwrap().is_err()); } + + #[test] + fn escaped_str() { + let escaped = EscapedStr::new("foo\\nbar"); + assert_eq!("foo\\nbar", escaped.as_escaped_str()); + assert_eq!("foo\nbar", escaped.into_unescaped().unwrap()); + } } diff --git a/rust-runtime/smithy-json/src/escape.rs b/rust-runtime/smithy-json/src/escape.rs index 58d4a14040..8f2faf6807 100644 --- a/rust-runtime/smithy-json/src/escape.rs +++ b/rust-runtime/smithy-json/src/escape.rs @@ -4,6 +4,41 @@ */ use std::borrow::Cow; +use std::fmt; + +#[derive(Debug, PartialEq, Eq)] +pub enum Error { + ExpectedSurrogatePair(String), + InvalidEscapeCharacter(char), + InvalidSurrogatePair(u16, u16), + InvalidUnicodeEscape(String), + InvalidUtf8, + UnexpectedEndOfString, +} + +impl std::error::Error for Error {} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 
+ use Error::*; + match self { + ExpectedSurrogatePair(low) => { + write!( + f, + "expected a UTF-16 surrogate pair, but got {} as the low word", + low + ) + } + InvalidEscapeCharacter(chr) => write!(f, "invalid JSON escape: \\{}", chr), + InvalidSurrogatePair(high, low) => { + write!(f, "invalid surrogate pair: \\u{:04X}\\u{:04X}", high, low) + } + InvalidUnicodeEscape(escape) => write!(f, "invalid JSON Unicode escape: \\u{}", escape), + InvalidUtf8 => write!(f, "invalid UTF-8 codepoint in JSON string"), + UnexpectedEndOfString => write!(f, "unexpected end of string"), + } + } +} /// Escapes a string for embedding in a JSON string value. pub fn escape_string(value: &str) -> Cow<str> { @@ -45,9 +80,119 @@ fn escape_string_inner(start: &[u8], rest: &[u8]) -> String { unsafe { String::from_utf8_unchecked(escaped) } } +/// Unescapes a JSON-escaped string. +/// If there are no escape sequences, it directly returns the reference. +pub fn unescape_string(value: &str) -> Result<Cow<str>, Error> { + let bytes = value.as_bytes(); + for (index, byte) in bytes.iter().enumerate() { + if *byte == b'\\' { + return unescape_string_inner(&bytes[0..index], &bytes[index..]).map(Cow::Owned); + } + } + Ok(Cow::Borrowed(value)) +} + +fn unescape_string_inner(start: &[u8], rest: &[u8]) -> Result<String, Error> { + let mut unescaped = Vec::with_capacity(start.len() + rest.len()); + unescaped.extend(start); + + let mut index = 0; + while index < rest.len() { + match rest[index] { + b'\\' => { + index += 1; + if index == rest.len() { + return Err(Error::UnexpectedEndOfString); + } + match rest[index] { + b'u' => { + index -= 1; + index += read_unicode_escapes(&rest[index..], &mut unescaped)?; + } + byte => { + match byte { + b'\\' => unescaped.push(b'\\'), + b'/' => unescaped.push(b'/'), + b'"' => unescaped.push(b'"'), + b'b' => unescaped.push(0x08), + b'f' => unescaped.push(0x0C), + b'n' => unescaped.push(b'\n'), + b'r' => unescaped.push(b'\r'), + b't' => unescaped.push(b'\t'), + _ => return
Err(Error::InvalidEscapeCharacter(byte.into())), + } + index += 1; + } + } + } + byte => { + unescaped.push(byte); + index += 1 + } + } + } + + String::from_utf8(unescaped).map_err(|_| Error::InvalidUtf8) +} + +fn is_utf16_low_surrogate(codepoint: u16) -> bool { + codepoint & 0xFC00 == 0xDC00 +} + +fn is_utf16_high_surrogate(codepoint: u16) -> bool { + codepoint & 0xFC00 == 0xD800 +} + +fn read_codepoint(rest: &[u8]) -> Result<u16, Error> { + if rest.len() < 6 { + return Err(Error::UnexpectedEndOfString); + } + if &rest[0..2] != b"\\u" { + // The first codepoint is always prefixed with "\u" since unescape_string_inner does + // that check, so this error will always be for the low word of a surrogate pair. + return Err(Error::ExpectedSurrogatePair( + String::from_utf8_lossy(&rest[0..6]).into(), + )); + } + + let codepoint_str = std::str::from_utf8(&rest[2..6]).map_err(|_| Error::InvalidUtf8)?; + u16::from_str_radix(codepoint_str, 16) + .map_err(|_| Error::InvalidUnicodeEscape(codepoint_str.into())) +} + +/// Reads JSON Unicode escape sequences (i.e., "\u1234"). Will also read +/// an additional codepoint if the first codepoint is the start of a surrogate pair.
+fn read_unicode_escapes(bytes: &[u8], into: &mut Vec<u8>) -> Result<usize, Error> { + let high = read_codepoint(bytes)?; + let (bytes_read, chr) = if is_utf16_high_surrogate(high) { + let low = read_codepoint(&bytes[6..])?; + if !is_utf16_low_surrogate(low) { + return Err(Error::InvalidSurrogatePair(high, low)); + } + + let codepoint = + char::from_u32(0x10000 + (high - 0xD800) as u32 * 0x400 + (low - 0xDC00) as u32) + .ok_or(Error::InvalidSurrogatePair(high, low))?; + (12, codepoint) + } else { + let codepoint = char::from_u32(high as u32).ok_or_else(|| { + Error::InvalidUnicodeEscape(String::from_utf8_lossy(&bytes[0..6]).into()) + })?; + (6, codepoint) + }; + + match chr.len_utf8() { + 1 => into.push(chr as u8), + _ => into.extend_from_slice(chr.encode_utf8(&mut [0; 4]).as_bytes()), + } + Ok(bytes_read) +} + #[cfg(test)] mod test { use super::escape_string; + use crate::escape::{unescape_string, Error}; + use std::borrow::Cow; #[test] fn escape() { @@ -66,6 +211,49 @@ mod test { assert_eq!("\\u001f", escape_string("\u{1f}").as_ref()); } + #[test] + fn unescape_no_escapes() { + let unescaped = unescape_string("test test").unwrap(); + assert_eq!("test test", unescaped); + assert!(matches!(unescaped, Cow::Borrowed(_))); + } + + #[test] + fn unescape() { + assert_eq!( + "\x08f\x0Co\to\r\n", + unescape_string(r#"\bf\fo\to\r\n"#).unwrap() + ); + assert_eq!("\"test\"", unescape_string(r#"\"test\""#).unwrap()); + assert_eq!("\x00", unescape_string("\\u0000").unwrap()); + assert_eq!("\x1f", unescape_string("\\u001f").unwrap()); + assert_eq!("foo\r\nbar", unescape_string("foo\\r\\nbar").unwrap()); + assert_eq!("foo\r\n", unescape_string("foo\\r\\n").unwrap()); + assert_eq!("\r\nbar", unescape_string("\\r\\nbar").unwrap()); + assert_eq!("\u{10437}", unescape_string("\\uD801\\uDC37").unwrap()); + + assert_eq!(Err(Error::UnexpectedEndOfString), unescape_string("\\")); + assert_eq!(Err(Error::UnexpectedEndOfString), unescape_string("\\u")); + assert_eq!(Err(Error::UnexpectedEndOfString),
unescape_string("\\u00")); + assert_eq!( + Err(Error::InvalidEscapeCharacter('z')), + unescape_string("\\z") + ); + + assert_eq!( + Err(Error::ExpectedSurrogatePair("\\nasdf".into())), + unescape_string("\\uD801\\nasdf") + ); + assert_eq!( + Err(Error::UnexpectedEndOfString), + unescape_string("\\uD801\\u00") + ); + assert_eq!( + Err(Error::InvalidSurrogatePair(0xD801, 0xC501)), + unescape_string("\\uD801\\uC501") + ); + } + use proptest::proptest; proptest! { #[test] @@ -74,6 +262,31 @@ mod test { let serde_escaped = &serde_escaped[1..(serde_escaped.len() - 1)]; assert_eq!(serde_escaped,escape_string(&s)) } + + #[test] + fn round_trip(chr in proptest::char::any()) { + let mut original = String::new(); + original.push(chr); + + let escaped = escape_string(&original); + let unescaped = unescape_string(&escaped).unwrap(); + assert_eq!(original, unescaped); + } + + #[test] + fn unicode_surrogates(chr in proptest::char::range( + char::from_u32(0x10000).unwrap(), + char::from_u32(0x10FFFF).unwrap(), + )) { + let mut codepoints = [0; 2]; + chr.encode_utf16(&mut codepoints); + + let escaped = format!("\\u{:04X}\\u{:04X}", codepoints[0], codepoints[1]); + let unescaped = unescape_string(&escaped).unwrap(); + + let expected = format!("{}", chr); + assert_eq!(expected, unescaped); + } } #[test] From adb5600489acd77e5e9d4314d10c570180de90d2 Mon Sep 17 00:00:00 2001 From: John DiSanti Date: Thu, 3 Jun 2021 12:53:03 -0700 Subject: [PATCH 3/3] Add documentation on how to test against JSONTestSuite --- rust-runtime/smithy-json/TESTING.md | 115 ++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 rust-runtime/smithy-json/TESTING.md diff --git a/rust-runtime/smithy-json/TESTING.md b/rust-runtime/smithy-json/TESTING.md new file mode 100644 index 0000000000..b9ee52667b --- /dev/null +++ b/rust-runtime/smithy-json/TESTING.md @@ -0,0 +1,115 @@ +How to run JSONTestSuite against smithy-json deserialize +======================================================== 
+ +When making changes to the `deserialize` module, it is a good idea +to run the changes against the [JSONTestSuite](https://github.com/nst/JSONTestSuite) +and manually examine the test results. + +### How to set up the JSONTestSuite + +1. Clone the [JSONTestSuite](https://github.com/nst/JSONTestSuite) repository. +2. In `JSONTestSuite/parsers`, create a new Cargo bin project named `test_json-smithy_json`. +3. Add the following dependencies to the `Cargo.toml` (be sure to replace `<path-to-smithy-rs>`): + +``` +smithy-json = { path = "<path-to-smithy-rs>/rust-runtime/smithy-json" } +``` + +4. Replace the code in `main.rs` with: + +```rust +use std::fs::File; +use std::io::Read; +use std::env; + +use smithy_json::deserialize::{json_token_iter, Token, Error}; + +fn main() { + let args: Vec<_> = env::args().collect(); + if args.len() != 2 { + println!("Usage: {} file.json", args[0]); + std::process::exit(1); + } + + let ref path = args[1]; + let mut s = String::new(); + let mut f = File::open(path).expect("Unable to open file"); + match f.read_to_string(&mut s) { + Err(_) => std::process::exit(1), + Ok(_) => println!("{}", s), + } + + let result: Result<Vec<Token>, Error> = json_token_iter(s.as_bytes()).collect(); + match result { + Err(_) => std::process::exit(1), + Ok(value) => if value.is_empty() { + std::process::exit(1) + } else { + // The test suite includes incomplete objects and arrays (e.g., "[null,"). + // These are completely valid for this parser, so we'll just pretend to have + // failed to parse these to satisfy the test suite. + if value.first() == Some(&Token::StartObject) && value.last() != Some(&Token::EndObject) { + std::process::exit(1) + } + if value.first() == Some(&Token::StartArray) && value.last() != Some(&Token::EndArray) { + std::process::exit(1) + } + // Unescape all strings and fail if any of them failed to unescape.
+ for token in value { + if let Token::ValueString(escaped) = token { + if escaped.into_unescaped().is_err() { + std::process::exit(1) + } + } + } + std::process::exit(0) + } + } +} +``` + +5. Compile this program with `cargo build --release`. +6. Modify `JSONTestSuite/run_tests.py` so that the `programs` dictionary only contains this one entry: + +``` +programs = { + "Rust smithy-json": + { + "url":"dontcare", + "commands":[os.path.join(PARSERS_DIR, "test_json-smithy_json/target/release/sj")] + } +} +``` + +7. Run `run_tests.py` and examine the output with a web browser by opening `JSONTestSuite/results/parsing.html`. + +### Examining the results + +When looking at `JSONTestSuite/results/parsing.html`, there is a matrix of test cases against their +results with a legend at the top. + +Any test result marked with blue or light blue is for a test case where correct behavior isn't specified, +so use your best judgement to decide if it should have succeeded or failed. + +The other colors are bad and should be carefully examined. At time of writing, the following test cases +succeed when they should fail, and we intentionally left it that way since we're not currently concerned +about being more lenient in the number parsing: + +``` +n_number_-01.json [-01] +n_number_-2..json [-2.] +n_number_0.e1.json [0.e1] +n_number_2.e+3.json [2.e+3] +n_number_2.e-3.json [2.e-3] +n_number_2.e3.json [2.e3] +n_number_neg_int_starting_with_zero.json [-012] +n_number_neg_real_without_int_part.json [-.123] +n_number_real_without_fractional_part.json [1.] +n_number_with_leading_zero.json [012] +``` + +This test case succeeds with our parser and that's OK since we're +a token streaming parser (multiple values are allowed): +``` +n_structure_double_array.json [][] +```