Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Utf8Error::error_len, to help incremental and/or lossy decoding. #40212

Merged
merged 3 commits into from
Mar 15, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/libcollectionstest/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#![feature(test)]
#![feature(unboxed_closures)]
#![feature(unicode)]
#![feature(utf8_error_error_len)]

extern crate collections;
extern crate test;
Expand Down
30 changes: 30 additions & 0 deletions src/libcollectionstest/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,36 @@ fn from_utf8_mostly_ascii() {
}
}

#[test]
fn from_utf8_error() {
    // Decodes `$bytes` (which must be rejected) and checks both how much of
    // the input was accepted and the reported length of the bad sequence.
    macro_rules! check {
        ($bytes:expr, $valid_up_to:expr, $error_len:expr) => {{
            let err = from_utf8($bytes).unwrap_err();
            assert_eq!(err.valid_up_to(), $valid_up_to);
            assert_eq!(err.error_len(), $error_len);
        }};
    }

    // Every input starts with the four valid bytes "A\xC3\xA9 ", so the
    // error is always reported at index 4.

    // A byte that cannot start a sequence is rejected on its own.
    check!(b"A\xC3\xA9 \xFF ", 4, Some(1));
    check!(b"A\xC3\xA9 \x80 ", 4, Some(1));
    check!(b"A\xC3\xA9 \xC1 ", 4, Some(1));
    check!(b"A\xC3\xA9 \xC1", 4, Some(1));

    // Two-byte lead: truncated input gives `None`, a bad second byte gives 1.
    check!(b"A\xC3\xA9 \xC2", 4, None);
    check!(b"A\xC3\xA9 \xC2 ", 4, Some(1));
    check!(b"A\xC3\xA9 \xC2\xC0", 4, Some(1));

    // Three-byte leads: the second byte must fall in the range allowed for
    // that lead, and the third must be a continuation byte.
    check!(b"A\xC3\xA9 \xE0", 4, None);
    check!(b"A\xC3\xA9 \xE0\x9F", 4, Some(1));
    check!(b"A\xC3\xA9 \xE0\xA0", 4, None);
    check!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2));
    check!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2));
    check!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1));

    // Four-byte lead: the error length counts the bytes accepted before the
    // first unexpected one; running out of input at any point gives `None`.
    check!(b"A\xC3\xA9 \xF1", 4, None);
    check!(b"A\xC3\xA9 \xF1\x80", 4, None);
    check!(b"A\xC3\xA9 \xF1\x80\x80", 4, None);
    check!(b"A\xC3\xA9 \xF1 ", 4, Some(1));
    check!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2));
    check!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3));
}

#[test]
fn test_as_bytes() {
// no null
Expand Down
80 changes: 58 additions & 22 deletions src/libcore/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,14 @@ Section: Creating a string
#[stable(feature = "rust1", since = "1.0.0")]
pub struct Utf8Error {
// Index up to which the input was verified to be valid UTF-8.
valid_up_to: usize,
// Length of the invalid byte sequence starting at `valid_up_to`, or `None`
// when the end of the input was reached unexpectedly (see `error_len()`).
error_len: Option<u8>,
}

impl Utf8Error {
/// Returns the index in the given string up to which valid UTF-8 was
/// verified.
///
/// It is the maximum index such that `from_utf8(input[..index])`
/// It is the maximum index such that `from_utf8(&input[..index])`
/// would return `Ok(_)`.
///
/// # Examples
Expand All @@ -152,6 +153,23 @@ impl Utf8Error {
/// ```
#[stable(feature = "utf8_error", since = "1.5.0")]
pub fn valid_up_to(&self) -> usize { self.valid_up_to }

/// Provides more information about the failure:
///
/// * `None`: the end of the input was reached unexpectedly.
///   `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
///   When a byte stream (such as a file or a network socket) is decoded
///   incrementally, this could be a valid `char` whose UTF-8 byte sequence
///   spans multiple chunks.
///
/// * `Some(len)`: an unexpected byte was encountered.
///   `len` is the length of the invalid byte sequence that starts at the
///   index given by `valid_up_to()`. For lossy decoding, insert a
///   U+FFFD REPLACEMENT CHARACTER and resume decoding after that sequence.
#[unstable(feature = "utf8_error_error_len", reason ="new", issue = "40494")]
pub fn error_len(&self) -> Option<usize> {
    // The length is stored as a `u8`; widening it to `usize` is lossless.
    self.error_len.map(usize::from)
}
}

/// Converts a slice of bytes to a string slice.
Expand Down Expand Up @@ -300,7 +318,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
#[stable(feature = "rust1", since = "1.0.0")]
impl fmt::Display for Utf8Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to)
if let Some(error_len) = self.error_len {
write!(f, "invalid utf-8 sequence of {} bytes from index {}",
error_len, self.valid_up_to)
} else {
write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to)
}
}
}

Expand Down Expand Up @@ -1241,25 +1264,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {

while index < len {
let old_offset = index;
macro_rules! err { () => {{
return Err(Utf8Error {
valid_up_to: old_offset
})
}}}
macro_rules! err {
($error_len: expr) => {
return Err(Utf8Error {
valid_up_to: old_offset,
error_len: $error_len,
})
}
}

macro_rules! next { () => {{
index += 1;
// we needed data, but there was none: error!
if index >= len {
err!()
err!(None)
}
v[index]
}}}

let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
let second = next!();
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
Expand All @@ -1279,25 +1304,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
match w {
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(1))
},
3 => {
match (first, second, next!() & !CONT_MASK) {
(0xE0 , 0xA0 ... 0xBF, TAG_CONT_U8) |
(0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
(0xED , 0x80 ... 0x9F, TAG_CONT_U8) |
(0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
_ => err!()
match (first, next!()) {
(0xE0 , 0xA0 ... 0xBF) |
(0xE1 ... 0xEC, 0x80 ... 0xBF) |
(0xED , 0x80 ... 0x9F) |
(0xEE ... 0xEF, 0x80 ... 0xBF) => {}
_ => err!(Some(1))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
}
4 => {
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
(0xF0 , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
(0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
(0xF4 , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
_ => err!()
match (first, next!()) {
(0xF0 , 0x90 ... 0xBF) |
(0xF1 ... 0xF3, 0x80 ... 0xBF) |
(0xF4 , 0x80 ... 0x8F) => {}
_ => err!(Some(1))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(2))
}
if next!() & !CONT_MASK != TAG_CONT_U8 {
err!(Some(3))
}
}
_ => err!()
_ => err!(Some(1))
}
index += 1;
} else {
Expand Down