Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Wtf8Buf::into_string for the case where it contains UTF-8. #96869

Merged
merged 5 commits into from
Aug 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions library/std/src/sys/windows/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,7 @@ impl Slice {
}

pub fn to_owned(&self) -> Buf {
let mut buf = Wtf8Buf::with_capacity(self.inner.len());
buf.push_wtf8(&self.inner);
Buf { inner: buf }
Buf { inner: self.inner.to_owned() }
}

pub fn clone_into(&self, buf: &mut Buf) {
Expand Down
92 changes: 75 additions & 17 deletions library/std/src/sys_common/wtf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,24 @@ impl CodePoint {
self.value
}

/// Returns the code point's value as a `u16` when it lies in the leading
/// surrogate range (U+D800 to U+DBFF), and `None` for any other code point.
#[inline]
pub fn to_lead_surrogate(&self) -> Option<u16> {
    let value = self.value;
    if (0xD800..=0xDBFF).contains(&value) {
        // The range check above guarantees the value fits in 16 bits.
        Some(value as u16)
    } else {
        None
    }
}

/// Returns the code point's value as a `u16` when it lies in the trailing
/// surrogate range (U+DC00 to U+DFFF), and `None` for any other code point.
#[inline]
pub fn to_trail_surrogate(&self) -> Option<u16> {
    let value = self.value;
    if (0xDC00..=0xDFFF).contains(&value) {
        // The range check above guarantees the value fits in 16 bits.
        Some(value as u16)
    } else {
        None
    }
}

/// Optionally returns a Unicode scalar value for the code point.
///
/// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
Expand Down Expand Up @@ -117,6 +135,14 @@ impl CodePoint {
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
pub struct Wtf8Buf {
bytes: Vec<u8>,

/// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
/// know this if we're constructed from a `String` or `&str`.
///
/// It is possible for `bytes` to have valid UTF-8 without this being
/// set, such as when we're concatenating `&Wtf8`'s and surrogates become
/// paired, as we don't bother to rescan the entire string.
is_known_utf8: bool,
Copy link
Member

@RalfJung RalfJung Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding a field here introduced a subtle bug in PathBuf: #124409.

Privacy-breaking transmutes are "fun". ;)

}

impl ops::Deref for Wtf8Buf {
Expand Down Expand Up @@ -147,13 +173,13 @@ impl Wtf8Buf {
/// Creates a new, empty WTF-8 string.
#[inline]
pub fn new() -> Wtf8Buf {
Wtf8Buf { bytes: Vec::new() }
Wtf8Buf { bytes: Vec::new(), is_known_utf8: true }
}

/// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
#[inline]
pub fn with_capacity(capacity: usize) -> Wtf8Buf {
Wtf8Buf { bytes: Vec::with_capacity(capacity) }
Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
}

/// Creates a WTF-8 string from a UTF-8 `String`.
Expand All @@ -163,7 +189,7 @@ impl Wtf8Buf {
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
#[inline]
pub fn from_string(string: String) -> Wtf8Buf {
Wtf8Buf { bytes: string.into_bytes() }
Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true }
}

/// Creates a WTF-8 string from a UTF-8 `&str` slice.
Expand All @@ -173,11 +199,12 @@ impl Wtf8Buf {
/// Since WTF-8 is a superset of UTF-8, this always succeeds.
#[inline]
pub fn from_str(str: &str) -> Wtf8Buf {
Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: true }
}

pub fn clear(&mut self) {
self.bytes.clear()
self.bytes.clear();
self.is_known_utf8 = true;
}

/// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
Expand All @@ -193,17 +220,19 @@ impl Wtf8Buf {
let surrogate = surrogate.unpaired_surrogate();
// Surrogates are known to be in the code point range.
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
// The string will now contain an unpaired surrogate.
string.is_known_utf8 = false;
// Skip the WTF-8 concatenation check,
// surrogate pairs are already decoded by decode_utf16
string.push_code_point_unchecked(code_point)
string.push_code_point_unchecked(code_point);
}
}
}
string
}

/// Copied from String::push
/// This does **not** include the WTF-8 concatenation check.
/// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check.
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
let mut bytes = [0; 4];
let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
Expand All @@ -217,6 +246,9 @@ impl Wtf8Buf {

#[inline]
pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
// Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
// cause them to change from well-formed UTF-8 to ill-formed UTF-8,
// which would break the assumptions of the `is_known_utf8` field.
unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
}

Expand Down Expand Up @@ -313,7 +345,15 @@ impl Wtf8Buf {
self.push_char(decode_surrogate_pair(lead, trail));
self.bytes.extend_from_slice(other_without_trail_surrogate);
}
_ => self.bytes.extend_from_slice(&other.bytes),
_ => {
// If we'll be pushing a string containing a surrogate, we may
// no longer have UTF-8.
if other.next_surrogate(0).is_some() {
self.is_known_utf8 = false;
sunfishcode marked this conversation as resolved.
Show resolved Hide resolved
}

self.bytes.extend_from_slice(&other.bytes);
}
}
}

Expand All @@ -330,13 +370,19 @@ impl Wtf8Buf {
/// like concatenating ill-formed UTF-16 strings effectively would.
#[inline]
pub fn push(&mut self, code_point: CodePoint) {
if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
if let Some(trail) = code_point.to_trail_surrogate() {
if let Some(lead) = (&*self).final_lead_surrogate() {
let len_without_lead_surrogate = self.len() - 3;
self.bytes.truncate(len_without_lead_surrogate);
self.push_char(decode_surrogate_pair(lead, trail as u16));
self.push_char(decode_surrogate_pair(lead, trail));
return;
}

// We're pushing a trailing surrogate.
self.is_known_utf8 = false;
} else if code_point.to_lead_surrogate().is_some() {
// We're pushing a leading surrogate.
self.is_known_utf8 = false;
}

// No newly paired surrogates at the boundary.
Expand All @@ -363,9 +409,10 @@ impl Wtf8Buf {
/// (that is, if the string contains surrogates),
/// the original WTF-8 string is returned instead.
pub fn into_string(self) -> Result<String, Wtf8Buf> {
match self.next_surrogate(0) {
None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
Some(_) => Err(self),
if self.is_known_utf8 || self.next_surrogate(0).is_none() {
Ok(unsafe { String::from_utf8_unchecked(self.bytes) })
} else {
Err(self)
}
}

Expand All @@ -375,6 +422,11 @@ impl Wtf8Buf {
///
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
pub fn into_string_lossy(mut self) -> String {
// Fast path: If we already have UTF-8, we can return it immediately.
if self.is_known_utf8 {
return unsafe { String::from_utf8_unchecked(self.bytes) };
}

let mut pos = 0;
loop {
match self.next_surrogate(pos) {
Expand All @@ -397,7 +449,7 @@ impl Wtf8Buf {
/// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
Wtf8Buf { bytes: bytes.into_vec() }
Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false }
}
}

Expand Down Expand Up @@ -575,6 +627,11 @@ impl Wtf8 {
}
}

/// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
pub fn to_owned(&self) -> Wtf8Buf {
Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false }
sunfishcode marked this conversation as resolved.
Show resolved Hide resolved
}

/// Lossily converts the string to UTF-8.
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
///
Expand Down Expand Up @@ -664,7 +721,8 @@ impl Wtf8 {
}

pub fn clone_into(&self, buf: &mut Wtf8Buf) {
self.bytes.clone_into(&mut buf.bytes)
buf.is_known_utf8 = false;
self.bytes.clone_into(&mut buf.bytes);
}

/// Boxes this `Wtf8`.
Expand Down Expand Up @@ -704,12 +762,12 @@ impl Wtf8 {

#[inline]
pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false }
}

#[inline]
pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false }
}

#[inline]
Expand Down
Loading