diff --git a/src/de.rs b/src/de.rs index bfde371a1..bd6f2e50c 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { /// /// The behavior of serde_json is specified to fail on non-UTF-8 strings /// when deserializing into Rust UTF-8 string types such as String, and - /// succeed with non-UTF-8 bytes when deserializing using this method. + /// succeed with the bytes representing the [WTF-8] encoding of code points + /// when deserializing using this method. + /// + /// [WTF-8]: https://simonsapin.github.io/wtf-8 /// /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is /// still checked if the hex number represents a valid Unicode code point. diff --git a/src/read.rs b/src/read.rs index 0eec0b3bb..19f01f058 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1,6 +1,5 @@ use crate::error::{Error, ErrorCode, Result}; use alloc::vec::Vec; -use core::char; use core::cmp; use core::mem; use core::ops::Deref; @@ -877,88 +876,133 @@ fn parse_escape<'de, R: Read<'de>>( b'n' => scratch.push(b'\n'), b'r' => scratch.push(b'\r'), b't' => scratch.push(b'\t'), - b'u' => { - fn encode_surrogate(scratch: &mut Vec, n: u16) { - scratch.extend_from_slice(&[ - (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, - (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, - (n & 0b0011_1111) as u8 | 0b1000_0000, - ]); - } + b'u' => return parse_unicode_escape(read, validate, scratch), + _ => { + return error(read, ErrorCode::InvalidEscape); + } + } - let c = match tri!(read.decode_hex_escape()) { - n @ 0xDC00..=0xDFFF => { - return if validate { - error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) - } else { - encode_surrogate(scratch, n); - Ok(()) - }; - } + Ok(()) +} - // Non-BMP characters are encoded as a sequence of two hex - // escapes, representing UTF-16 surrogates. If deserializing a - // utf-8 string the surrogates are required to be paired, - // whereas deserializing a byte string accepts lone surrogates. - n1 @ 0xD800..=0xDBFF => { - if tri!(peek_or_eof(read)) == b'\\' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - encode_surrogate(scratch, n1); - Ok(()) - }; - } +/// Parses a JSON \u escape and appends it into the scratch space. Assumes \u +/// has just been read. +#[cold] +fn parse_unicode_escape<'de, R: Read<'de>>( + read: &mut R, + validate: bool, + scratch: &mut Vec, +) -> Result<()> { + let mut n = tri!(read.decode_hex_escape()); + + // Non-BMP characters are encoded as a sequence of two hex + // escapes, representing UTF-16 surrogates. If deserializing a + // utf-8 string the surrogates are required to be paired, + // whereas deserializing a byte string accepts lone surrogates. + if validate && n >= 0xDC00 && n <= 0xDFFF { + // XXX: This is actually a trailing surrogate. + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } + + loop { + if n < 0xD800 || n > 0xDBFF { + // Every u16 outside of the surrogate ranges is guaranteed to be a + // legal char. + push_wtf8_codepoint(n as u32, scratch); + return Ok(()); + } - if tri!(peek_or_eof(read)) == b'u' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - encode_surrogate(scratch, n1); - // The \ prior to this byte started an escape sequence, - // so we need to parse that now. This recursive call - // does not blow the stack on malicious input because - // the escape is not \u, so it will be handled by one - // of the easy nonrecursive cases. - parse_escape(read, validate, scratch) - }; - } + // n is a leading surrogate, we now expect a trailing surrogate. + let n1 = n; - let n2 = tri!(read.decode_hex_escape()); + if tri!(peek_or_eof(read)) == b'\\' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + Ok(()) + }; + } - if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } + if tri!(peek_or_eof(read)) == b'u' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. This recursive call + // does not blow the stack on malicious input because + // the escape is not \u, so it will be handled by one + // of the easy nonrecursive cases. + parse_escape(read, validate, scratch) + }; + } - let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; + let n2 = tri!(read.decode_hex_escape()); - match char::from_u32(n) { - Some(c) => c, - None => { - return error(read, ErrorCode::InvalidUnicodeCodePoint); - } - } - } + if n2 < 0xDC00 || n2 > 0xDFFF { + if validate { + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } + push_wtf8_codepoint(n1 as u32, scratch); + // If n2 is a leading surrogate, we need to restart. + n = n2; + continue; + } - // Every u16 outside of the surrogate ranges above is guaranteed - // to be a legal char. - n => char::from_u32(n as u32).unwrap(), - }; + // This value is in range U+10000..=U+10FFFF, which is always a + // valid codepoint. + let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; + push_wtf8_codepoint(n, scratch); + return Ok(()); + } +} - scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); - } - _ => { - return error(read, ErrorCode::InvalidEscape); - } +/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient +/// implementation of String::push. The codepoint may be a surrogate. +#[inline] +fn push_wtf8_codepoint(n: u32, scratch: &mut Vec) { + if n < 0x80 { + scratch.push(n as u8); + return; } - Ok(()) + scratch.reserve(4); + + unsafe { + let ptr = scratch.as_mut_ptr().add(scratch.len()); + + let encoded_len = match n { + 0..=0x7F => unreachable!(), + 0x80..=0x7FF => { + ptr.write((n >> 6 & 0b0001_1111) as u8 | 0b1100_0000); + 2 + } + 0x800..=0xFFFF => { + ptr.write((n >> 12 & 0b0000_1111) as u8 | 0b1110_0000); + ptr.add(1).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000); + 3 + } + 0x1_0000..=0x10_FFFF => { + ptr.write((n >> 18 & 0b0000_0111) as u8 | 0b1111_0000); + ptr.add(1) + .write((n >> 12 & 0b0011_1111) as u8 | 0b1000_0000); + ptr.add(2).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000); + 4 + } + 0x11_0000.. => unreachable!(), + }; + ptr.add(encoded_len - 1) + .write((n & 0b0011_1111) as u8 | 0b1000_0000); + + scratch.set_len(scratch.len() + encoded_len); + } } /// Parses a JSON escape sequence and discards the value. Assumes the previous diff --git a/tests/test.rs b/tests/test.rs index 5576317f9..dc8c3c22f 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1707,7 +1707,7 @@ fn test_byte_buf_de() { } #[test] -fn test_byte_buf_de_lone_surrogate() { +fn test_byte_buf_de_invalid_surrogates() { let bytes = ByteBuf::from(vec![237, 160, 188]); let v: ByteBuf = from_str(r#""\ud83c""#).unwrap(); assert_eq!(v, bytes); @@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() { let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap(); assert_eq!(v, bytes); - let bytes = ByteBuf::from(vec![237, 176, 129]); - let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); - assert_eq!(v, bytes); - let res = from_str::(r#""\ud83c\!""#); assert!(res.is_err()); let res = from_str::(r#""\ud83c\u""#); assert!(res.is_err()); - let res = from_str::(r#""\ud83c\ud83c""#); - assert!(res.is_err()); + // lone trailing surrogate + let bytes = ByteBuf::from(vec![237, 176, 129]); + let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by other leading surrogate + let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]); + let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by "a" (U+0061) in \u encoding + let bytes = ByteBuf::from(vec![237, 160, 188, 97]); + let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by U+0080 + let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]); + let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by U+FFFF + let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]); + let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap(); + assert_eq!(v, bytes); +} + +#[test] +fn test_byte_buf_de_surrogate_pair() { + // leading surrogate followed by trailing surrogate + let bytes = ByteBuf::from(vec![240, 159, 128, 128]); + let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by a surrogate pair + let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]); + let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap(); + assert_eq!(v, bytes); } #[cfg(feature = "raw_value")] #[test] -fn test_raw_de_lone_surrogate() { +fn test_raw_de_invalid_surrogates() { use serde_json::value::RawValue; assert!(from_str::>(r#""\ud83c""#).is_ok()); @@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() { assert!(from_str::>(r#""\udc01\!""#).is_err()); assert!(from_str::>(r#""\udc01\u""#).is_err()); assert!(from_str::>(r#""\ud83c\ud83c""#).is_ok()); + assert!(from_str::>(r#""\ud83c\u0061""#).is_ok()); + assert!(from_str::>(r#""\ud83c\u0080""#).is_ok()); + assert!(from_str::>(r#""\ud83c\uffff""#).is_ok()); +} + +#[cfg(feature = "raw_value")] +#[test] +fn test_raw_de_surrogate_pair() { + use serde_json::value::RawValue; + + assert!(from_str::>(r#""\ud83c\udc00""#).is_ok()); } #[test]