Simplify unicode escape handling

This does not affect performance.
serde-rs · Aug 12, 2024 · 236cc82 · 236cc82
1 parent 2f28d10
commit 236cc82
Showing 1 changed file with 53 additions and 54 deletions.
diff --git a/src/read.rs b/src/read.rs
@@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
     validate: bool,
     scratch: &mut Vec<u8>,
 ) -> Result<()> {
-    let c = match tri!(read.decode_hex_escape()) {
-        n @ 0xDC00..=0xDFFF => {
-            return if validate {
-                error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
-            } else {
-                push_wtf8_codepoint(n as u32, scratch);
-                Ok(())
-            };
-        }
+    let n = tri!(read.decode_hex_escape());
 
-        // Non-BMP characters are encoded as a sequence of two hex
-        // escapes, representing UTF-16 surrogates. If deserializing a
-        // utf-8 string the surrogates are required to be paired,
-        // whereas deserializing a byte string accepts lone surrogates.
-        n1 @ 0xD800..=0xDBFF => {
-            if tri!(peek_or_eof(read)) == b'\\' {
-                read.discard();
-            } else {
-                return if validate {
-                    read.discard();
-                    error(read, ErrorCode::UnexpectedEndOfHexEscape)
-                } else {
-                    push_wtf8_codepoint(n1 as u32, scratch);
-                    Ok(())
-                };
-            }
+    // Non-BMP characters are encoded as a sequence of two hex
+    // escapes, representing UTF-16 surrogates. If deserializing a
+    // utf-8 string the surrogates are required to be paired,
+    // whereas deserializing a byte string accepts lone surrogates.
+    if validate && n >= 0xDC00 && n <= 0xDFFF {
+        // XXX: This is actually a trailing surrogate.
+        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+    }
 
-            if tri!(peek_or_eof(read)) == b'u' {
-                read.discard();
-            } else {
-                return if validate {
-                    read.discard();
-                    error(read, ErrorCode::UnexpectedEndOfHexEscape)
-                } else {
-                    push_wtf8_codepoint(n1 as u32, scratch);
-                    // The \ prior to this byte started an escape sequence,
-                    // so we need to parse that now. This recursive call
-                    // does not blow the stack on malicious input because
-                    // the escape is not \u, so it will be handled by one
-                    // of the easy nonrecursive cases.
-                    parse_escape(read, validate, scratch)
-                };
-            }
+    if n < 0xD800 || n > 0xDBFF {
+        // Every u16 outside of the surrogate ranges is guaranteed to be a
+        // legal char.
+        push_wtf8_codepoint(n as u32, scratch);
+        return Ok(());
+    }
 
-            let n2 = tri!(read.decode_hex_escape());
+    // n is a leading surrogate, we now expect a trailing surrogate.
+    let n1 = n;
 
-            if n2 < 0xDC00 || n2 > 0xDFFF {
-                return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
-            }
+    if tri!(peek_or_eof(read)) == b'\\' {
+        read.discard();
+    } else {
+        return if validate {
+            read.discard();
+            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+        } else {
+            push_wtf8_codepoint(n1 as u32, scratch);
+            Ok(())
+        };
+    }
 
-            // This value is in range U+10000..=U+10FFFF, which is always a
-            // valid codepoint.
-            (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000
-        }
+    if tri!(peek_or_eof(read)) == b'u' {
+        read.discard();
+    } else {
+        return if validate {
+            read.discard();
+            error(read, ErrorCode::UnexpectedEndOfHexEscape)
+        } else {
+            push_wtf8_codepoint(n1 as u32, scratch);
+            // The \ prior to this byte started an escape sequence,
+            // so we need to parse that now. This recursive call
+            // does not blow the stack on malicious input because
+            // the escape is not \u, so it will be handled by one
+            // of the easy nonrecursive cases.
+            parse_escape(read, validate, scratch)
+        };
+    }
 
-        // Every u16 outside of the surrogate ranges above is guaranteed
-        // to be a legal char.
-        n => n as u32,
-    };
+    let n2 = tri!(read.decode_hex_escape());
+
+    if n2 < 0xDC00 || n2 > 0xDFFF {
+        return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
+    }
 
-    push_wtf8_codepoint(c, scratch);
+    // This value is in range U+10000..=U+10FFFF, which is always a
+    // valid codepoint.
+    let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
+    push_wtf8_codepoint(n, scratch);
     Ok(())
 }