Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new function str::from_utf8_lossy() #12062

Merged
merged 2 commits into from
Feb 7, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 2 additions & 25 deletions src/libstd/path/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,10 +547,10 @@ impl<'a, P: GenericPath> ToStr for Display<'a, P> {
if self.filename {
match self.path.filename() {
None => ~"",
Some(v) => from_utf8_with_replacement(v)
Some(v) => str::from_utf8_lossy(v)
}
} else {
from_utf8_with_replacement(self.path.as_vec())
str::from_utf8_lossy(self.path.as_vec())
}
}
}
Expand Down Expand Up @@ -635,29 +635,6 @@ fn contains_nul(v: &[u8]) -> bool {
v.iter().any(|&x| x == 0)
}

/// Decodes `v` into an owned string, substituting U+FFFD for every byte that
/// does not start a valid UTF-8 sequence.
///
/// The slice is consumed from the front: a valid sequence is copied whole and
/// skipped, while an invalid or truncated one costs exactly one input byte and
/// one replacement character before decoding resumes at the next byte.
#[inline(always)]
fn from_utf8_with_replacement(mut v: &[u8]) -> ~str {
// FIXME (#9516): Don't decode utf-8 manually here once we have a good way to do it in str
// This is a truly horrifically bad implementation, done as a functionality stopgap until
// we have a proper utf-8 decoder. I don't really want to write one here.
static REPLACEMENT_CHAR: char = '\uFFFD';

// Reserve one output byte per input byte up front.
let mut s = str::with_capacity(v.len());
while !v.is_empty() {
// Width of the sequence announced by the leading byte.
let w = str::utf8_char_width(v[0]);
if w == 0u {
// A zero width means the byte cannot begin a sequence: replace it and move on.
s.push_char(REPLACEMENT_CHAR);
v = v.slice_from(1);
} else if v.len() < w || !str::is_utf8(v.slice_to(w)) {
// Either the input ends before the sequence is complete, or the `w` bytes are
// not valid UTF-8. Only the leading byte is dropped; the rest is re-examined.
s.push_char(REPLACEMENT_CHAR);
v = v.slice_from(1);
} else {
// This branch is only reached when `str::is_utf8` accepted exactly these `w`
// bytes, which is what justifies reinterpreting them as a string slice.
s.push_str(unsafe { ::cast::transmute(v.slice_to(w)) });
v = v.slice_from(w);
}
}
s
}
#[cfg(test)]
mod tests {
use prelude::*;
Expand Down
185 changes: 179 additions & 6 deletions src/libstd/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -900,16 +900,122 @@ pub struct CharRange {
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
// for width 3, and 3 bits for width 4
macro_rules! utf8_first_byte(
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as uint)
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
)

// return the value of $ch updated with continuation byte $byte
macro_rules! utf8_acc_cont_byte(
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as uint)
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
)

static TAG_CONT_U8: u8 = 128u8;

/// Converts a vector of bytes to a new utf-8 string.
/// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have an example or two?

///
/// # Example
///
/// ```rust
/// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
/// let output = std::str::from_utf8_lossy(input);
/// assert_eq!(output, ~"Hello \uFFFDWorld");
/// ```
pub fn from_utf8_lossy(v: &[u8]) -> ~str {
// Strategy: scan `v` once, validating as we go. Valid bytes are not copied one at
// a time; instead `lastgood` remembers where the current run of valid bytes began,
// and the run is flushed to `res` in bulk whenever an error is hit (and once more at
// the end of the input).
static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
// `i` is the scan cursor: index of the next byte to examine.
let mut i = 0u;
// Start of the run of validated bytes that has not yet been copied into `res`.
let mut lastgood = 0u;
let total = v.len();
// Reads `xs[i]` through `unsafe_ref`; callers must guarantee `i` is in bounds.
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
unsafe { *xs.unsafe_ref(i) }
}
// Like `unsafe_get`, but yields 0 when `i` is at or past `total`. A zero byte is
// neither a continuation byte nor inside any of the second-byte ranges matched
// below, so running off the end of the input is handled as an invalid sequence.
fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
if i >= total {
0
} else {
unsafe_get(xs, i)
}
}
let mut res = with_capacity(total);

while i < total {
// Index of the byte that starts the sequence examined in this iteration.
let i_ = i;
// In bounds: the loop condition guarantees `i < total`.
let byte = unsafe_get(v, i);
i += 1;

// Error handling for the sequence starting at `i_`: flush the pending run of
// valid bytes, emit one replacement character, and restart the run at the
// cursor. Bytes in `i_..i` (those already accepted for the broken sequence) are
// dropped, so a broken sequence yields a single U+FFFD regardless of how many of
// its bytes were consumed.
macro_rules! error(() => {
unsafe {
if lastgood != i_ {
raw::push_bytes(&mut res, v.slice(lastgood, i_));
}
lastgood = i;
raw::push_bytes(&mut res, REPLACEMENT);
}
})

// In every failing check below, `continue` runs before the cursor moves past the
// offending byte, so that byte is re-examined as the start of a new sequence.
if byte < 128u8 {
// lastgood handles this
} else {
let w = utf8_char_width(byte);

match w {
2 => {
// Two-byte sequence: needs exactly one continuation byte (10xxxxxx).
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
error!();
continue;
}
i += 1;
}
3 => {
// Three-byte sequence: the allowed range of the second byte depends on
// the first. After 0xE0 it must be >= 0xA0 (smaller values would be an
// overlong encoding); after 0xED it must be <= 0x9F (larger values would
// encode a surrogate).
match (byte, safe_get(v, i, total)) {
(0xE0 , 0xA0 .. 0xBF) => (),
(0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
(0xED , 0x80 .. 0x9F) => (),
(0xEE .. 0xEF, 0x80 .. 0xBF) => (),
_ => {
error!();
continue;
}
}
i += 1;
// Third byte: a plain continuation byte.
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
error!();
continue;
}
i += 1;
}
4 => {
// Four-byte sequence: after 0xF0 the second byte must be >= 0x90
// (smaller values would be overlong); after 0xF4 it must be <= 0x8F
// (larger values would exceed U+10FFFF).
match (byte, safe_get(v, i, total)) {
(0xF0 , 0x90 .. 0xBF) => (),
(0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
(0xF4 , 0x80 .. 0x8F) => (),
_ => {
error!();
continue;
}
}
i += 1;
// Third and fourth bytes: plain continuation bytes.
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
error!();
continue;
}
i += 1;
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
error!();
continue;
}
i += 1;
}
_ => {
// Any other width for a non-ASCII byte: it cannot start a sequence, so
// it is replaced on its own.
error!();
continue;
}
}
}
}
// Flush the final run of valid bytes (empty if the input ended with an error).
unsafe { raw::push_bytes(&mut res, v.slice(lastgood, total)) };
res
}

/// Unsafe operations
pub mod raw {
use cast;
Expand Down Expand Up @@ -2211,7 +2317,7 @@ impl<'a> StrSlice<'a> for &'a str {

// Multibyte case is a fn to allow char_range_at to inline cleanly
fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
let mut val = s[i] as uint;
let mut val = s[i] as u32;
let w = UTF8_CHAR_WIDTH[val] as uint;
assert!((w != 0));

Expand All @@ -2220,7 +2326,7 @@ impl<'a> StrSlice<'a> for &'a str {
if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }

return CharRange {ch: unsafe { transmute(val as u32) }, next: i + w};
return CharRange {ch: unsafe { transmute(val) }, next: i + w};
}

return multibyte_char_range_at(*self, i);
Expand All @@ -2243,7 +2349,7 @@ impl<'a> StrSlice<'a> for &'a str {
i -= 1u;
}

let mut val = s[i] as uint;
let mut val = s[i] as u32;
let w = UTF8_CHAR_WIDTH[val] as uint;
assert!((w != 0));

Expand All @@ -2252,7 +2358,7 @@ impl<'a> StrSlice<'a> for &'a str {
if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }

return CharRange {ch: unsafe { transmute(val as u32) }, next: i};
return CharRange {ch: unsafe { transmute(val) }, next: i};
}

return multibyte_char_range_at_reverse(*self, prev);
Expand Down Expand Up @@ -3834,6 +3940,37 @@ mod tests {
assert_eq!(from_utf8_owned(xs), None);
}

// Pins the exact replacement behaviour of `from_utf8_lossy`, including how many
// U+FFFD characters each kind of malformed sequence produces.
#[test]
fn test_str_from_utf8_lossy() {
// Pure ASCII passes through unchanged.
let xs = bytes!("hello");
assert_eq!(from_utf8_lossy(xs), ~"hello");

// Valid multibyte text passes through unchanged.
let xs = bytes!("ศไทย中华Việt Nam");
assert_eq!(from_utf8_lossy(xs), ~"ศไทย中华Việt Nam");

// A two-byte lead (0xC2) followed by a non-continuation byte, and a lone 0xFF:
// one replacement each, with the following text preserved.
let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD There\uFFFD Goodbye");

// 0xC0 0x80 produces two replacements; the truncated three-byte sequence
// 0xE6 0x83 collapses into a single replacement.
let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
assert_eq!(from_utf8_lossy(xs), ~"Hello\uFFFD\uFFFD There\uFFFD Goodbye");

// 0xF5 is replaced on its own, and the 0x80 after it is replaced separately.
let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFD\uFFFDbar");

// Four-byte sequences cut short after 1, 2 and 3 bytes each become one replacement.
let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFDbaz");

// After 0xF4 the second byte 0x80 is accepted (so the truncated pair is one
// replacement), whereas 0xBF is rejected and replaced separately from the lead.
let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
assert_eq!(from_utf8_lossy(xs), ~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz");

// 0xF0 0x80 0x80 0x80 is rejected byte by byte (four replacements), while
// 0xF0 0x90 0x80 0x80 decodes to U+10000.
let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar");

// surrogates
// Both ends of the encoded-surrogate range are rejected, one replacement per byte.
let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
assert_eq!(from_utf8_lossy(xs), ~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar");
}

#[test]
fn test_to_send_str() {
assert_eq!("abcde".to_send_str(), SendStrStatic("abcde"));
Expand Down Expand Up @@ -3992,6 +4129,42 @@ mod bench {
});
}

// Benchmarks `from_utf8_lossy` on an all-ASCII input of exactly 100 bytes.
#[bench]
fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) {
let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
Lorem ipsum dolor sit amet, consectetur. ");

// Guard the input size so the benchmark name stays truthful.
assert_eq!(100, s.len());
bh.iter(|| {
let _ = from_utf8_lossy(s);
});
}

// Benchmarks `from_utf8_lossy` on a 100-byte input made of non-ASCII text.
#[bench]
fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) {
let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guard the input size (in bytes, not characters) so the benchmark name stays truthful.
assert_eq!(100, s.len());
bh.iter(|| {
let _ = from_utf8_lossy(s);
});
}

// Benchmarks `from_utf8_lossy` on a short input that mixes ASCII text with two
// malformed sequences (0xC0 0x80 and a truncated 0xE6 0x83).
#[bench]
fn from_utf8_lossy_invalid(bh: &mut BenchHarness) {
let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
bh.iter(|| {
let _ = from_utf8_lossy(s);
});
}

// Benchmarks `from_utf8_lossy` on 100 bytes that are all 0xF5, i.e. an input with
// no valid run at all.
#[bench]
fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) {
let s = ::vec::from_elem(100, 0xF5u8);
bh.iter(|| {
let _ = from_utf8_lossy(s);
});
}

#[bench]
fn bench_with_capacity(bh: &mut BenchHarness) {
bh.iter(|| {
Expand Down