From f50e29656a52466fe61256bf7bd9acf9b650a50a Mon Sep 17 00:00:00 2001 From: Luca Casonato Date: Tue, 12 Apr 2022 02:08:35 +0200 Subject: [PATCH 1/6] Deserialize invalid UTF-8 into byte bufs as WTF-8 Previously #828 added support for deserializing lone leading and trailing surrogates into WTF-8 encoded bytes when deserializing a string as bytes. This commit extends this to cover the case of a leading surrogate followed by code units that are not trailing surrogates. This allows for deserialization of "\ud83c\ud83c" (two leading surrogates), or "\ud83c\u0061" (a leading surrogate followed by "a"). The docs also now make it clear that we are serializing the invalid code points as WTF-8. This reference to WTF-8 signals to the user that they can use a WTF-8 parser on the bytes to construct a valid UTF-8 string. --- src/de.rs | 5 ++++- src/read.rs | 41 +++++++++++++++++++++++++++----------- tests/test.rs | 54 +++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 80 insertions(+), 20 deletions(-) diff --git a/src/de.rs b/src/de.rs index ffd0d48c2..2b6daf46f 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1570,7 +1570,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { /// /// The behavior of serde_json is specified to fail on non-UTF-8 strings /// when deserializing into Rust UTF-8 string types such as String, and - /// succeed with non-UTF-8 bytes when deserializing using this method. + /// succeed with the bytes representing the [WTF-8] encoding of code points + /// when deserializing using this method. + /// + /// [WTF-8]: https://simonsapin.github.io/wtf-8 /// /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is /// still checked if the hex number represents a valid Unicode code point. diff --git a/src/read.rs b/src/read.rs index 1319d89c9..f3774e563 100644 --- a/src/read.rs +++ b/src/read.rs @@ -861,20 +861,33 @@ fn parse_escape<'de, R: Read<'de>>( b'r' => scratch.push(b'\r'), b't' => scratch.push(b'\t'), b'u' => { - fn encode_surrogate(scratch: &mut Vec, n: u16) { - scratch.extend_from_slice(&[ - (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, - (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, - (n & 0b0011_1111) as u8 | 0b1000_0000, - ]); + fn encode_wtf8(scratch: &mut Vec, cp: u16) { + match cp { + 0x0000..=0x007F => { + scratch.extend_from_slice(&[cp as u8]); + } + 0x0080..=0x07FF => { + scratch + .extend_from_slice(&[0xC0 | (cp >> 6) as u8, 0x80 | (cp & 0x3F) as u8]); + } + 0x0800..=0xFFFF => { + scratch.extend_from_slice(&[ + 0xE0 | (cp >> 12) as u8, + 0x80 | ((cp >> 6) & 0x3F) as u8, + 0x80 | (cp & 0x3F) as u8, + ]); + } + } } let c = match tri!(read.decode_hex_escape()) { n @ 0xDC00..=0xDFFF => { return if validate { + // TODO: the error message is wrong, this is a lone + // _trailing_ surrogate error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) } else { - encode_surrogate(scratch, n); + encode_wtf8(scratch, n); Ok(()) }; } @@ -889,9 +902,9 @@ fn parse_escape<'de, R: Read<'de>>( } else { return if validate { read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) + error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) } else { - encode_surrogate(scratch, n1); + encode_wtf8(scratch, n1); Ok(()) }; } @@ -903,7 +916,7 @@ fn parse_escape<'de, R: Read<'de>>( read.discard(); error(read, ErrorCode::UnexpectedEndOfHexEscape) } else { - encode_surrogate(scratch, n1); + encode_wtf8(scratch, n1); // The \ prior to this byte started an escape sequence, // so we need to parse that now. This recursive call // does not blow the stack on malicious input because @@ -916,7 +929,13 @@ fn parse_escape<'de, R: Read<'de>>( let n2 = tri!(read.decode_hex_escape()); if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + return if validate { + error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) + } else { + encode_wtf8(scratch, n1); + encode_wtf8(scratch, n2); + Ok(()) + }; } let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; diff --git a/tests/test.rs b/tests/test.rs index b11635e75..7c5d5a78e 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1713,7 +1713,8 @@ fn test_byte_buf_de() { } #[test] -fn test_byte_buf_de_lone_surrogate() { +fn test_byte_buf_de_invalid_surrogates() { + // lone leading surrogate let bytes = ByteBuf::from(vec![237, 160, 188]); let v: ByteBuf = from_str(r#""\ud83c""#).unwrap(); assert_eq!(v, bytes); @@ -1726,23 +1727,49 @@ fn test_byte_buf_de_lone_surrogate() { let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap(); assert_eq!(v, bytes); - let bytes = ByteBuf::from(vec![237, 176, 129]); - let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); - assert_eq!(v, bytes); - let res = from_str::(r#""\ud83c\!""#); assert!(res.is_err()); let res = from_str::(r#""\ud83c\u""#); assert!(res.is_err()); - let res = from_str::(r#""\ud83c\ud83c""#); - assert!(res.is_err()); + // lone trailing surrogate + let bytes = ByteBuf::from(vec![237, 176, 129]); + let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by other leading surrogate + let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]); + let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by "a" (U+0061) in \u encoding + let bytes = ByteBuf::from(vec![237, 160, 188, 97]); + let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by U+0080 + let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]); + let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by U+FFFF + let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]); + let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap(); + assert_eq!(v, bytes); +} + +#[test] +fn test_byte_buf_de_surrogate_pair() { + // leading surrogate followed by trailing surrogate + let bytes = ByteBuf::from(vec![240, 159, 128, 128]); + let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap(); + assert_eq!(v, bytes); } #[cfg(feature = "raw_value")] #[test] -fn test_raw_de_lone_surrogate() { +fn test_raw_de_invalid_surrogates() { use serde_json::value::RawValue; assert!(from_str::>(r#""\ud83c""#).is_ok()); @@ -1752,6 +1779,17 @@ fn test_raw_de_lone_surrogate() { assert!(from_str::>(r#""\udc01\!""#).is_err()); assert!(from_str::>(r#""\udc01\u""#).is_err()); assert!(from_str::>(r#""\ud83c\ud83c""#).is_ok()); + assert!(from_str::>(r#""\ud83c\u0061""#).is_ok()); + assert!(from_str::>(r#""\ud83c\u0080""#).is_ok()); + assert!(from_str::>(r#""\ud83c\uffff""#).is_ok()); +} + +#[cfg(feature = "raw_value")] +#[test] +fn test_raw_de_surrogate_pair() { + use serde_json::value::RawValue; + + assert!(from_str::>(r#""\ud83c\udc00""#).is_ok()); } #[test] From a38dbf3708724519d7ef4e847bc0aed8606ecad0 Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 12 Aug 2024 19:34:06 +0300 Subject: [PATCH 2/6] Mark \u parsing as cold This counterintuitively speeds up War and Peace 275 -> 290 MB/s (+5%) by enabling inlining of encode_utf8 and extend_from_slice. --- src/read.rs | 152 ++++++++++++++++++++++++++++------------------------ 1 file changed, 81 insertions(+), 71 deletions(-) diff --git a/src/read.rs b/src/read.rs index 6c663768e..2d9c8f865 100644 --- a/src/read.rs +++ b/src/read.rs @@ -882,87 +882,97 @@ fn parse_escape<'de, R: Read<'de>>( b'n' => scratch.push(b'\n'), b'r' => scratch.push(b'\r'), b't' => scratch.push(b'\t'), - b'u' => { - fn encode_surrogate(scratch: &mut Vec, n: u16) { - scratch.extend_from_slice(&[ - (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, - (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, - (n & 0b0011_1111) as u8 | 0b1000_0000, - ]); - } - - let c = match tri!(read.decode_hex_escape()) { - n @ 0xDC00..=0xDFFF => { - return if validate { - error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) - } else { - encode_surrogate(scratch, n); - Ok(()) - }; - } + b'u' => return parse_unicode_escape(read, validate, scratch), + _ => { + return error(read, ErrorCode::InvalidEscape); + } + } - // Non-BMP characters are encoded as a sequence of two hex - // escapes, representing UTF-16 surrogates. If deserializing a - // utf-8 string the surrogates are required to be paired, - // whereas deserializing a byte string accepts lone surrogates. - n1 @ 0xD800..=0xDBFF => { - if tri!(peek_or_eof(read)) == b'\\' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - encode_surrogate(scratch, n1); - Ok(()) - }; - } + Ok(()) +} - if tri!(peek_or_eof(read)) == b'u' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - encode_surrogate(scratch, n1); - // The \ prior to this byte started an escape sequence, - // so we need to parse that now. This recursive call - // does not blow the stack on malicious input because - // the escape is not \u, so it will be handled by one - // of the easy nonrecursive cases. - parse_escape(read, validate, scratch) - }; - } +/// Parses a JSON \u escape and appends it into the scratch space. Assumes \u +/// has just been read. +#[cold] +fn parse_unicode_escape<'de, R: Read<'de>>( + read: &mut R, + validate: bool, + scratch: &mut Vec, +) -> Result<()> { + fn encode_surrogate(scratch: &mut Vec, n: u16) { + scratch.extend_from_slice(&[ + (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, + (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, + (n & 0b0011_1111) as u8 | 0b1000_0000, + ]); + } + + let c = match tri!(read.decode_hex_escape()) { + n @ 0xDC00..=0xDFFF => { + return if validate { + error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) + } else { + encode_surrogate(scratch, n); + Ok(()) + }; + } - let n2 = tri!(read.decode_hex_escape()); + // Non-BMP characters are encoded as a sequence of two hex + // escapes, representing UTF-16 surrogates. If deserializing a + // utf-8 string the surrogates are required to be paired, + // whereas deserializing a byte string accepts lone surrogates. + n1 @ 0xD800..=0xDBFF => { + if tri!(peek_or_eof(read)) == b'\\' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + encode_surrogate(scratch, n1); + Ok(()) + }; + } - if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } + if tri!(peek_or_eof(read)) == b'u' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + encode_surrogate(scratch, n1); + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. This recursive call + // does not blow the stack on malicious input because + // the escape is not \u, so it will be handled by one + // of the easy nonrecursive cases. + parse_escape(read, validate, scratch) + }; + } - let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; + let n2 = tri!(read.decode_hex_escape()); - match char::from_u32(n) { - Some(c) => c, - None => { - return error(read, ErrorCode::InvalidUnicodeCodePoint); - } - } - } + if n2 < 0xDC00 || n2 > 0xDFFF { + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } - // Every u16 outside of the surrogate ranges above is guaranteed - // to be a legal char. - n => char::from_u32(n as u32).unwrap(), - }; + let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; - scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); - } - _ => { - return error(read, ErrorCode::InvalidEscape); + match char::from_u32(n) { + Some(c) => c, + None => { + return error(read, ErrorCode::InvalidUnicodeCodePoint); + } + } } - } + // Every u16 outside of the surrogate ranges above is guaranteed + // to be a legal char. + n => char::from_u32(n as u32).unwrap(), + }; + + scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); Ok(()) } From 0e90b61b8cefdd699adbed9c0d4d8a1538d77183 Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 12 Aug 2024 19:44:29 +0300 Subject: [PATCH 3/6] Format UTF-8 strings manually This speeds up War and Peace 290 MB/s -> 330 MB/s (+15%). --- src/read.rs | 60 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/src/read.rs b/src/read.rs index 2d9c8f865..9f2d91404 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1,6 +1,5 @@ use crate::error::{Error, ErrorCode, Result}; use alloc::vec::Vec; -use core::char; use core::cmp; use core::mem; use core::ops::Deref; @@ -957,25 +956,64 @@ fn parse_unicode_escape<'de, R: Read<'de>>( return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); } - let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; - - match char::from_u32(n) { - Some(c) => c, - None => { - return error(read, ErrorCode::InvalidUnicodeCodePoint); - } - } + // This value is in range U+10000..=U+10FFFF, which is always a + // valid codepoint. + (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000 } // Every u16 outside of the surrogate ranges above is guaranteed // to be a legal char. - n => char::from_u32(n as u32).unwrap(), + n => n as u32, }; - scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); + // SAFETY: c is always a codepoint. + unsafe { + push_utf8_codepoint(c, scratch); + } Ok(()) } +/// Adds a UTF-8 codepoint to the end of the buffer. This is a more efficient +/// implementation of String::push. n must be a valid codepoint. +#[inline] +unsafe fn push_utf8_codepoint(n: u32, scratch: &mut Vec) { + if n < 0x80 { + scratch.push(n as u8); + return; + } + + scratch.reserve(4); + + unsafe { + let ptr = scratch.as_mut_ptr().add(scratch.len()); + + let encoded_len = match n { + 0..=0x7F => unreachable!(), + 0x80..=0x7FF => { + ptr.write((n >> 6 & 0b0001_1111) as u8 | 0b1100_0000); + 2 + } + 0x800..=0xFFFF => { + ptr.write((n >> 12 & 0b0000_1111) as u8 | 0b1110_0000); + ptr.add(1).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000); + 3 + } + 0x1_0000..=0x10_FFFF => { + ptr.write((n >> 18 & 0b0000_0111) as u8 | 0b1111_0000); + ptr.add(1) + .write((n >> 12 & 0b0011_1111) as u8 | 0b1000_0000); + ptr.add(2).write((n >> 6 & 0b0011_1111) as u8 | 0b1000_0000); + 4 + } + 0x11_0000.. => unreachable!(), + }; + ptr.add(encoded_len - 1) + .write((n & 0b0011_1111) as u8 | 0b1000_0000); + + scratch.set_len(scratch.len() + encoded_len); + } +} + /// Parses a JSON escape sequence and discards the value. Assumes the previous /// byte read was a backslash. fn ignore_escape<'de, R>(read: &mut R) -> Result<()> From 2f28d106e68e214cfa19043e65b1bd178b3c2ced Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 12 Aug 2024 19:46:31 +0300 Subject: [PATCH 4/6] Use the same UTF-8/WTF-8 impl for surrogates This does not affect performance. --- src/read.rs | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/read.rs b/src/read.rs index 9f2d91404..e9f3f58e7 100644 --- a/src/read.rs +++ b/src/read.rs @@ -898,20 +898,12 @@ fn parse_unicode_escape<'de, R: Read<'de>>( validate: bool, scratch: &mut Vec, ) -> Result<()> { - fn encode_surrogate(scratch: &mut Vec, n: u16) { - scratch.extend_from_slice(&[ - (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, - (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, - (n & 0b0011_1111) as u8 | 0b1000_0000, - ]); - } - let c = match tri!(read.decode_hex_escape()) { n @ 0xDC00..=0xDFFF => { return if validate { error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) } else { - encode_surrogate(scratch, n); + push_wtf8_codepoint(n as u32, scratch); Ok(()) }; } @@ -928,7 +920,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>( read.discard(); error(read, ErrorCode::UnexpectedEndOfHexEscape) } else { - encode_surrogate(scratch, n1); + push_wtf8_codepoint(n1 as u32, scratch); Ok(()) }; } @@ -940,7 +932,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>( read.discard(); error(read, ErrorCode::UnexpectedEndOfHexEscape) } else { - encode_surrogate(scratch, n1); + push_wtf8_codepoint(n1 as u32, scratch); // The \ prior to this byte started an escape sequence, // so we need to parse that now. This recursive call // does not blow the stack on malicious input because @@ -966,17 +958,14 @@ fn parse_unicode_escape<'de, R: Read<'de>>( n => n as u32, }; - // SAFETY: c is always a codepoint. - unsafe { - push_utf8_codepoint(c, scratch); - } + push_wtf8_codepoint(c, scratch); Ok(()) } -/// Adds a UTF-8 codepoint to the end of the buffer. This is a more efficient -/// implementation of String::push. n must be a valid codepoint. +/// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient +/// implementation of String::push. The codepoint may be a surrogate. #[inline] -unsafe fn push_utf8_codepoint(n: u32, scratch: &mut Vec) { +fn push_wtf8_codepoint(n: u32, scratch: &mut Vec) { if n < 0x80 { scratch.push(n as u8); return; From 236cc8247d32a5cb337850d75f68265fdb4bc14e Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 12 Aug 2024 19:53:51 +0300 Subject: [PATCH 5/6] Simplify unicode escape handling This does not affect performance. --- src/read.rs | 107 ++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/src/read.rs b/src/read.rs index e9f3f58e7..6b5abdc22 100644 --- a/src/read.rs +++ b/src/read.rs @@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>( validate: bool, scratch: &mut Vec, ) -> Result<()> { - let c = match tri!(read.decode_hex_escape()) { - n @ 0xDC00..=0xDFFF => { - return if validate { - error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) - } else { - push_wtf8_codepoint(n as u32, scratch); - Ok(()) - }; - } + let n = tri!(read.decode_hex_escape()); - // Non-BMP characters are encoded as a sequence of two hex - // escapes, representing UTF-16 surrogates. If deserializing a - // utf-8 string the surrogates are required to be paired, - // whereas deserializing a byte string accepts lone surrogates. - n1 @ 0xD800..=0xDBFF => { - if tri!(peek_or_eof(read)) == b'\\' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - push_wtf8_codepoint(n1 as u32, scratch); - Ok(()) - }; - } + // Non-BMP characters are encoded as a sequence of two hex + // escapes, representing UTF-16 surrogates. If deserializing a + // utf-8 string the surrogates are required to be paired, + // whereas deserializing a byte string accepts lone surrogates. + if validate && n >= 0xDC00 && n <= 0xDFFF { + // XXX: This is actually a trailing surrogate. + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } - if tri!(peek_or_eof(read)) == b'u' { - read.discard(); - } else { - return if validate { - read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) - } else { - push_wtf8_codepoint(n1 as u32, scratch); - // The \ prior to this byte started an escape sequence, - // so we need to parse that now. This recursive call - // does not blow the stack on malicious input because - // the escape is not \u, so it will be handled by one - // of the easy nonrecursive cases. - parse_escape(read, validate, scratch) - }; - } + if n < 0xD800 || n > 0xDBFF { + // Every u16 outside of the surrogate ranges is guaranteed to be a + // legal char. + push_wtf8_codepoint(n as u32, scratch); + return Ok(()); + } - let n2 = tri!(read.decode_hex_escape()); + // n is a leading surrogate, we now expect a trailing surrogate. + let n1 = n; - if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } + if tri!(peek_or_eof(read)) == b'\\' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + Ok(()) + }; + } - // This value is in range U+10000..=U+10FFFF, which is always a - // valid codepoint. - (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000 - } + if tri!(peek_or_eof(read)) == b'u' { + read.discard(); + } else { + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. This recursive call + // does not blow the stack on malicious input because + // the escape is not \u, so it will be handled by one + // of the easy nonrecursive cases. + parse_escape(read, validate, scratch) + }; + } - // Every u16 outside of the surrogate ranges above is guaranteed - // to be a legal char. - n => n as u32, - }; + let n2 = tri!(read.decode_hex_escape()); + + if n2 < 0xDC00 || n2 > 0xDFFF { + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } - push_wtf8_codepoint(c, scratch); + // This value is in range U+10000..=U+10FFFF, which is always a + // valid codepoint. + let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; + push_wtf8_codepoint(n, scratch); Ok(()) } From 96ae60445df67f6903b53b2fcfe1d60f3bd8f778 Mon Sep 17 00:00:00 2001 From: Alisa Sireneva Date: Mon, 12 Aug 2024 20:45:44 +0300 Subject: [PATCH 6/6] Correct WTF-8 parsing Closes #877. This is a good time to make ByteBuf parsing more consistent as I'm rewriting it anyway. This commit integrates the changes from #877 and also handles a leading surrogate followed by a surrogate pair correctly. This does not affect performance significantly. Co-authored-by: Luca Casonato --- src/de.rs | 5 ++- src/read.rs | 90 ++++++++++++++++++++++++++++----------------------- tests/test.rs | 58 ++++++++++++++++++++++++++++----- 3 files changed, 103 insertions(+), 50 deletions(-) diff --git a/src/de.rs b/src/de.rs index bfde371a1..bd6f2e50c 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1575,7 +1575,10 @@ impl<'de, 'a, R: Read<'de>> de::Deserializer<'de> for &'a mut Deserializer { /// /// The behavior of serde_json is specified to fail on non-UTF-8 strings /// when deserializing into Rust UTF-8 string types such as String, and - /// succeed with non-UTF-8 bytes when deserializing using this method. + /// succeed with the bytes representing the [WTF-8] encoding of code points + /// when deserializing using this method. + /// + /// [WTF-8]: https://simonsapin.github.io/wtf-8 /// /// Escape sequences are processed as usual, and for `\uXXXX` escapes it is /// still checked if the hex number represents a valid Unicode code point. diff --git a/src/read.rs b/src/read.rs index 6b5abdc22..9e8ffaacf 100644 --- a/src/read.rs +++ b/src/read.rs @@ -898,7 +898,7 @@ fn parse_unicode_escape<'de, R: Read<'de>>( validate: bool, scratch: &mut Vec, ) -> Result<()> { - let n = tri!(read.decode_hex_escape()); + let mut n = tri!(read.decode_hex_escape()); // Non-BMP characters are encoded as a sequence of two hex // escapes, representing UTF-16 surrogates. If deserializing a @@ -909,56 +909,64 @@ fn parse_unicode_escape<'de, R: Read<'de>>( return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); } - if n < 0xD800 || n > 0xDBFF { - // Every u16 outside of the surrogate ranges is guaranteed to be a - // legal char. - push_wtf8_codepoint(n as u32, scratch); - return Ok(()); - } + loop { + if n < 0xD800 || n > 0xDBFF { + // Every u16 outside of the surrogate ranges is guaranteed to be a + // legal char. + push_wtf8_codepoint(n as u32, scratch); + return Ok(()); + } - // n is a leading surrogate, we now expect a trailing surrogate. - let n1 = n; + // n is a leading surrogate, we now expect a trailing surrogate. + let n1 = n; - if tri!(peek_or_eof(read)) == b'\\' { - read.discard(); - } else { - return if validate { + if tri!(peek_or_eof(read)) == b'\\' { read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) } else { - push_wtf8_codepoint(n1 as u32, scratch); - Ok(()) - }; - } + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + Ok(()) + }; + } - if tri!(peek_or_eof(read)) == b'u' { - read.discard(); - } else { - return if validate { + if tri!(peek_or_eof(read)) == b'u' { read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) } else { - push_wtf8_codepoint(n1 as u32, scratch); - // The \ prior to this byte started an escape sequence, - // so we need to parse that now. This recursive call - // does not blow the stack on malicious input because - // the escape is not \u, so it will be handled by one - // of the easy nonrecursive cases. - parse_escape(read, validate, scratch) - }; - } + return if validate { + read.discard(); + error(read, ErrorCode::UnexpectedEndOfHexEscape) + } else { + push_wtf8_codepoint(n1 as u32, scratch); + // The \ prior to this byte started an escape sequence, + // so we need to parse that now. This recursive call + // does not blow the stack on malicious input because + // the escape is not \u, so it will be handled by one + // of the easy nonrecursive cases. + parse_escape(read, validate, scratch) + }; + } - let n2 = tri!(read.decode_hex_escape()); + let n2 = tri!(read.decode_hex_escape()); - if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); - } + if n2 < 0xDC00 || n2 > 0xDFFF { + if validate { + return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + } + push_wtf8_codepoint(n1 as u32, scratch); + // If n2 is a leading surrogate, we need to restart. + n = n2; + continue; + } - // This value is in range U+10000..=U+10FFFF, which is always a - // valid codepoint. - let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; - push_wtf8_codepoint(n, scratch); - Ok(()) + // This value is in range U+10000..=U+10FFFF, which is always a + // valid codepoint. + let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; + push_wtf8_codepoint(n, scratch); + return Ok(()); + } } /// Adds a WTF-8 codepoint to the end of the buffer. This is a more efficient diff --git a/tests/test.rs b/tests/test.rs index 9f2159513..4290cb3ef 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1707,7 +1707,7 @@ fn test_byte_buf_de() { } #[test] -fn test_byte_buf_de_lone_surrogate() { +fn test_byte_buf_de_invalid_surrogates() { let bytes = ByteBuf::from(vec![237, 160, 188]); let v: ByteBuf = from_str(r#""\ud83c""#).unwrap(); assert_eq!(v, bytes); @@ -1720,23 +1720,54 @@ fn test_byte_buf_de_lone_surrogate() { let v: ByteBuf = from_str(r#""\ud83c ""#).unwrap(); assert_eq!(v, bytes); - let bytes = ByteBuf::from(vec![237, 176, 129]); - let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); - assert_eq!(v, bytes); - let res = from_str::(r#""\ud83c\!""#); assert!(res.is_err()); let res = from_str::(r#""\ud83c\u""#); assert!(res.is_err()); - let res = from_str::(r#""\ud83c\ud83c""#); - assert!(res.is_err()); + // lone trailing surrogate + let bytes = ByteBuf::from(vec![237, 176, 129]); + let v: ByteBuf = from_str(r#""\udc01""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by other leading surrogate + let bytes = ByteBuf::from(vec![237, 160, 188, 237, 160, 188]); + let v: ByteBuf = from_str(r#""\ud83c\ud83c""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by "a" (U+0061) in \u encoding + let bytes = ByteBuf::from(vec![237, 160, 188, 97]); + let v: ByteBuf = from_str(r#""\ud83c\u0061""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by U+0080 + let bytes = ByteBuf::from(vec![237, 160, 188, 194, 128]); + let v: ByteBuf = from_str(r#""\ud83c\u0080""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by U+FFFF + let bytes = ByteBuf::from(vec![237, 160, 188, 239, 191, 191]); + let v: ByteBuf = from_str(r#""\ud83c\uffff""#).unwrap(); + assert_eq!(v, bytes); +} + +#[test] +fn test_byte_buf_de_surrogate_pair() { + // leading surrogate followed by trailing surrogate + let bytes = ByteBuf::from(vec![240, 159, 128, 128]); + let v: ByteBuf = from_str(r#""\ud83c\udc00""#).unwrap(); + assert_eq!(v, bytes); + + // leading surrogate followed by a surrogate pair + let bytes = ByteBuf::from(vec![237, 160, 188, 240, 159, 128, 128]); + let v: ByteBuf = from_str(r#""\ud83c\ud83c\udc00""#).unwrap(); + assert_eq!(v, bytes); } #[cfg(feature = "raw_value")] #[test] -fn test_raw_de_lone_surrogate() { +fn test_raw_de_invalid_surrogates() { use serde_json::value::RawValue; assert!(from_str::>(r#""\ud83c""#).is_ok()); @@ -1746,6 +1777,17 @@ fn test_raw_de_lone_surrogate() { assert!(from_str::>(r#""\udc01\!""#).is_err()); assert!(from_str::>(r#""\udc01\u""#).is_err()); assert!(from_str::>(r#""\ud83c\ud83c""#).is_ok()); + assert!(from_str::>(r#""\ud83c\u0061""#).is_ok()); + assert!(from_str::>(r#""\ud83c\u0080""#).is_ok()); + assert!(from_str::>(r#""\ud83c\uffff""#).is_ok()); +} + +#[cfg(feature = "raw_value")] +#[test] +fn test_raw_de_surrogate_pair() { + use serde_json::value::RawValue; + + assert!(from_str::>(r#""\ud83c\udc00""#).is_ok()); } #[test]