Skip to content

Commit

Permalink
Simplify unicode escape handling
Browse files (browse the repository at this point in the history)
This does not affect performance.
  • Loading branch information
purplesyringa committed Aug 12, 2024
1 parent 2f28d10 commit 236cc82
Showing 1 changed file with 53 additions and 54 deletions.
107 changes: 53 additions & 54 deletions src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -898,67 +898,66 @@ fn parse_unicode_escape<'de, R: Read<'de>>(
validate: bool,
scratch: &mut Vec<u8>,
) -> Result<()> {
let c = match tri!(read.decode_hex_escape()) {
n @ 0xDC00..=0xDFFF => {
return if validate {
error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
} else {
push_wtf8_codepoint(n as u32, scratch);
Ok(())
};
}
let n = tri!(read.decode_hex_escape());

// Non-BMP characters are encoded as a sequence of two hex
// escapes, representing UTF-16 surrogates. If deserializing a
// utf-8 string the surrogates are required to be paired,
// whereas deserializing a byte string accepts lone surrogates.
n1 @ 0xD800..=0xDBFF => {
if tri!(peek_or_eof(read)) == b'\\' {
read.discard();
} else {
return if validate {
read.discard();
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
push_wtf8_codepoint(n1 as u32, scratch);
Ok(())
};
}
// Non-BMP characters are encoded as a sequence of two hex
// escapes, representing UTF-16 surrogates. If deserializing a
// utf-8 string the surrogates are required to be paired,
// whereas deserializing a byte string accepts lone surrogates.
if validate && n >= 0xDC00 && n <= 0xDFFF {
// XXX: This is actually a trailing surrogate.
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}

if tri!(peek_or_eof(read)) == b'u' {
read.discard();
} else {
return if validate {
read.discard();
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
push_wtf8_codepoint(n1 as u32, scratch);
// The \ prior to this byte started an escape sequence,
// so we need to parse that now. This recursive call
// does not blow the stack on malicious input because
// the escape is not \u, so it will be handled by one
// of the easy nonrecursive cases.
parse_escape(read, validate, scratch)
};
}
if n < 0xD800 || n > 0xDBFF {
// Every u16 outside of the surrogate ranges is guaranteed to be a
// legal char.
push_wtf8_codepoint(n as u32, scratch);
return Ok(());
}

let n2 = tri!(read.decode_hex_escape());
// n is a leading surrogate, we now expect a trailing surrogate.
let n1 = n;

if n2 < 0xDC00 || n2 > 0xDFFF {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}
if tri!(peek_or_eof(read)) == b'\\' {
read.discard();
} else {
return if validate {
read.discard();
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
push_wtf8_codepoint(n1 as u32, scratch);
Ok(())
};
}

// This value is in range U+10000..=U+10FFFF, which is always a
// valid codepoint.
(((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000
}
if tri!(peek_or_eof(read)) == b'u' {
read.discard();
} else {
return if validate {
read.discard();
error(read, ErrorCode::UnexpectedEndOfHexEscape)
} else {
push_wtf8_codepoint(n1 as u32, scratch);
// The \ prior to this byte started an escape sequence,
// so we need to parse that now. This recursive call
// does not blow the stack on malicious input because
// the escape is not \u, so it will be handled by one
// of the easy nonrecursive cases.
parse_escape(read, validate, scratch)
};
}

// Every u16 outside of the surrogate ranges above is guaranteed
// to be a legal char.
n => n as u32,
};
let n2 = tri!(read.decode_hex_escape());

if n2 < 0xDC00 || n2 > 0xDFFF {
return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
}

push_wtf8_codepoint(c, scratch);
// This value is in range U+10000..=U+10FFFF, which is always a
// valid codepoint.
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
push_wtf8_codepoint(n, scratch);
Ok(())
}

Expand Down

0 comments on commit 236cc82

Please sign in to comment.