From caf8bcceff301b4fa3414e3f21813581a8d758a3 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Sat, 7 May 2022 09:34:57 -0700 Subject: [PATCH 1/5] Optimize `Wtf8Buf::into_string` for the case where it contains UTF-8. Add a `is_known_utf8` flag to `Wtf8Buf`, which tracks whether the string is known to contain UTF-8. This is efficiently computed in many common situations, such as when a `Wtf8Buf` is constructed from a `String` or `&str`, or with `Wtf8Buf::from_wide` which is already doing UTF-16 decoding and already checking for surrogates. This makes `OsString::into_string` O(1) rather than O(N) on Windows in common cases. And, it eliminates the need to scan through the string for surrogates in `Args::next` and `Vars::next`, because the strings are already being translated with `Wtf8Buf::from_wide`. Many things on Windows construct `OsString`s with `Wtf8Buf::from_wide`, such as `DirEntry::file_name` and `fs::read_link`, so with this patch, users of those functions can subsequently call `.into_string()` without paying for an extra scan through the string for surrogates. --- library/std/src/sys/windows/os_str.rs | 4 +- library/std/src/sys_common/wtf8.rs | 98 ++++++-- library/std/src/sys_common/wtf8/tests.rs | 303 +++++++++++++++++++++-- 3 files changed, 366 insertions(+), 39 deletions(-) diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 11883f15022f6..4bdd8c505ff25 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -164,9 +164,7 @@ impl Slice { } pub fn to_owned(&self) -> Buf { - let mut buf = Wtf8Buf::with_capacity(self.inner.len()); - buf.push_wtf8(&self.inner); - Buf { inner: buf } + Buf { inner: self.inner.to_owned() } } pub fn clone_into(&self, buf: &mut Buf) { diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index 57fa4989358a4..a674ced88bbb3 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -89,6 +89,24 @@ impl CodePoint { self.value } + /// Returns the numeric value of the code point if it is a leading surrogate. + #[inline] + pub fn to_lead_surrogate(&self) -> Option { + match self.value { + lead @ 0xD800..=0xDBFF => Some(lead as u16), + _ => None, + } + } + + /// Returns the numeric value of the code point if it is a trailing surrogate. + #[inline] + pub fn to_trail_surrogate(&self) -> Option { + match self.value { + trail @ 0xDC00..=0xDFFF => Some(trail as u16), + _ => None, + } + } + /// Optionally returns a Unicode scalar value for the code point. /// /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). @@ -117,6 +135,14 @@ impl CodePoint { #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] pub struct Wtf8Buf { bytes: Vec, + + /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily + /// know this if we're constructed from a `String` or `&str`. + /// + /// It is possible for `bytes` to have valid UTF-8 without this being + /// set, such as when we're concatenating `&Wtf8`'s and surrogates become + /// paired, as we don't bother to rescan the entire string. + is_known_utf8: bool, } impl ops::Deref for Wtf8Buf { @@ -147,13 +173,13 @@ impl Wtf8Buf { /// Creates a new, empty WTF-8 string. #[inline] pub fn new() -> Wtf8Buf { - Wtf8Buf { bytes: Vec::new() } + Wtf8Buf { bytes: Vec::new(), is_known_utf8: true } } /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes. #[inline] pub fn with_capacity(capacity: usize) -> Wtf8Buf { - Wtf8Buf { bytes: Vec::with_capacity(capacity) } + Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true } } /// Creates a WTF-8 string from a UTF-8 `String`. @@ -163,7 +189,7 @@ impl Wtf8Buf { /// Since WTF-8 is a superset of UTF-8, this always succeeds. #[inline] pub fn from_string(string: String) -> Wtf8Buf { - Wtf8Buf { bytes: string.into_bytes() } + Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true } } /// Creates a WTF-8 string from a UTF-8 `&str` slice. @@ -173,11 +199,12 @@ impl Wtf8Buf { /// Since WTF-8 is a superset of UTF-8, this always succeeds. #[inline] pub fn from_str(str: &str) -> Wtf8Buf { - Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) } + Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: true } } pub fn clear(&mut self) { - self.bytes.clear() + self.bytes.clear(); + self.is_known_utf8 = true; } /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. @@ -195,7 +222,9 @@ impl Wtf8Buf { let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; // Skip the WTF-8 concatenation check, // surrogate pairs are already decoded by decode_utf16 - string.push_code_point_unchecked(code_point) + string.push_code_point_unchecked(code_point); + // The string now contains an unpaired surrogate. + string.is_known_utf8 = false; } } } @@ -203,7 +232,7 @@ impl Wtf8Buf { } /// Copied from String::push - /// This does **not** include the WTF-8 concatenation check. + /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check. fn push_code_point_unchecked(&mut self, code_point: CodePoint) { let mut bytes = [0; 4]; let bytes = char::encode_utf8_raw(code_point.value, &mut bytes); @@ -217,6 +246,9 @@ impl Wtf8Buf { #[inline] pub fn as_mut_slice(&mut self) -> &mut Wtf8 { + // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would + // cause them to change from well-formed UTF-8 to ill-formed UTF-8, + // which would break the assumptions of the `is_known_utf8` field. unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } } @@ -313,7 +345,15 @@ impl Wtf8Buf { self.push_char(decode_surrogate_pair(lead, trail)); self.bytes.extend_from_slice(other_without_trail_surrogate); } - _ => self.bytes.extend_from_slice(&other.bytes), + _ => { + self.bytes.extend_from_slice(&other.bytes); + + // If we're pushing a string containing a surrogate, we may no + // longer have UTF-8. + if other.next_surrogate(0).is_some() { + self.is_known_utf8 = false; + } + } } } @@ -330,13 +370,19 @@ impl Wtf8Buf { /// like concatenating ill-formed UTF-16 strings effectively would. #[inline] pub fn push(&mut self, code_point: CodePoint) { - if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() { + if let Some(trail) = code_point.to_trail_surrogate() { if let Some(lead) = (&*self).final_lead_surrogate() { let len_without_lead_surrogate = self.len() - 3; self.bytes.truncate(len_without_lead_surrogate); - self.push_char(decode_surrogate_pair(lead, trail as u16)); + self.push_char(decode_surrogate_pair(lead, trail)); return; } + + // We're pushing a trailing surrogate. + self.is_known_utf8 = false; + } else if code_point.to_lead_surrogate().is_some() { + // We're pushing a leading surrogate. + self.is_known_utf8 = false; } // No newly paired surrogates at the boundary. @@ -363,9 +409,10 @@ impl Wtf8Buf { /// (that is, if the string contains surrogates), /// the original WTF-8 string is returned instead. pub fn into_string(self) -> Result { - match self.next_surrogate(0) { - None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), - Some(_) => Err(self), + if self.is_known_utf8 || self.next_surrogate(0).is_none() { + Ok(unsafe { String::from_utf8_unchecked(self.bytes) }) + } else { + Err(self) } } @@ -375,6 +422,11 @@ impl Wtf8Buf { /// /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character ā€œļæ½ā€) pub fn into_string_lossy(mut self) -> String { + // Fast path: If we already have UTF-8, we can return it immediately. + if self.is_known_utf8 { + return unsafe { String::from_utf8_unchecked(self.bytes) }; + } + let mut pos = 0; loop { match self.next_surrogate(pos) { @@ -397,7 +449,7 @@ impl Wtf8Buf { /// Converts a `Box` into a `Wtf8Buf`. pub fn from_box(boxed: Box) -> Wtf8Buf { let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; - Wtf8Buf { bytes: bytes.into_vec() } + Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false } } } @@ -575,6 +627,11 @@ impl Wtf8 { } } + /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. + pub fn to_owned(&self) -> Wtf8Buf { + Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false } + } + /// Lossily converts the string to UTF-8. /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. /// @@ -664,7 +721,8 @@ impl Wtf8 { } pub fn clone_into(&self, buf: &mut Wtf8Buf) { - self.bytes.clone_into(&mut buf.bytes) + self.bytes.clone_into(&mut buf.bytes); + buf.is_known_utf8 = false; } /// Boxes this `Wtf8`. @@ -704,12 +762,18 @@ impl Wtf8 { #[inline] pub fn to_ascii_lowercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() } + Wtf8Buf { + bytes: self.bytes.to_ascii_lowercase(), + is_known_utf8: self.next_surrogate(0).is_none(), + } } #[inline] pub fn to_ascii_uppercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() } + Wtf8Buf { + bytes: self.bytes.to_ascii_uppercase(), + is_known_utf8: self.next_surrogate(0).is_none(), + } } #[inline] diff --git a/library/std/src/sys_common/wtf8/tests.rs b/library/std/src/sys_common/wtf8/tests.rs index 931996791fbe5..8360e3da215aa 100644 --- a/library/std/src/sys_common/wtf8/tests.rs +++ b/library/std/src/sys_common/wtf8/tests.rs @@ -19,6 +19,36 @@ fn code_point_to_u32() { assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF); } +#[test] +fn code_point_to_lead_surrogate() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0).to_lead_surrogate(), None); + assert_eq!(c(0xE9).to_lead_surrogate(), None); + assert_eq!(c(0xD800).to_lead_surrogate(), Some(0xD800)); + assert_eq!(c(0xDBFF).to_lead_surrogate(), Some(0xDBFF)); + assert_eq!(c(0xDC00).to_lead_surrogate(), None); + assert_eq!(c(0xDFFF).to_lead_surrogate(), None); + assert_eq!(c(0x1F4A9).to_lead_surrogate(), None); + assert_eq!(c(0x10FFFF).to_lead_surrogate(), None); +} + +#[test] +fn code_point_to_trail_surrogate() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0).to_trail_surrogate(), None); + assert_eq!(c(0xE9).to_trail_surrogate(), None); + assert_eq!(c(0xD800).to_trail_surrogate(), None); + assert_eq!(c(0xDBFF).to_trail_surrogate(), None); + assert_eq!(c(0xDC00).to_trail_surrogate(), Some(0xDC00)); + assert_eq!(c(0xDFFF).to_trail_surrogate(), Some(0xDFFF)); + assert_eq!(c(0x1F4A9).to_trail_surrogate(), None); + assert_eq!(c(0x10FFFF).to_trail_surrogate(), None); +} + #[test] fn code_point_from_char() { assert_eq!(CodePoint::from_char('a').to_u32(), 0x61); @@ -70,35 +100,66 @@ fn wtf8buf_from_string() { #[test] fn wtf8buf_from_wide() { - assert_eq!(Wtf8Buf::from_wide(&[]).bytes, b""); - assert_eq!( - Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes, - b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9" - ); + let buf = Wtf8Buf::from_wide(&[]); + assert_eq!(buf.bytes, b""); + assert!(buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xDCA9]); + assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]); + assert_eq!(buf.bytes, b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xD800]); + assert_eq!(buf.bytes, b"\xED\xA0\x80"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xDBFF]); + assert_eq!(buf.bytes, b"\xED\xAF\xBF"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xDC00]); + assert_eq!(buf.bytes, b"\xED\xB0\x80"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xDFFF]); + assert_eq!(buf.bytes, b"\xED\xBF\xBF"); + assert!(!buf.is_known_utf8); } #[test] fn wtf8buf_push_str() { let mut string = Wtf8Buf::new(); assert_eq!(string.bytes, b""); + assert!(string.is_known_utf8); + string.push_str("aĆ© šŸ’©"); assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); } #[test] fn wtf8buf_push_char() { let mut string = Wtf8Buf::from_str("aĆ© "); assert_eq!(string.bytes, b"a\xC3\xA9 "); + assert!(string.is_known_utf8); + string.push_char('šŸ’©'); assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); } #[test] fn wtf8buf_push() { let mut string = Wtf8Buf::from_str("aĆ© "); assert_eq!(string.bytes, b"a\xC3\xA9 "); + assert!(string.is_known_utf8); + string.push(CodePoint::from_char('šŸ’©')); assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() @@ -106,37 +167,46 @@ fn wtf8buf_push() { let mut string = Wtf8Buf::new(); string.push(c(0xD83D)); // lead + assert!(!string.is_known_utf8); string.push(c(0xDCA9)); // trail assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! let mut string = Wtf8Buf::new(); string.push(c(0xD83D)); // lead + assert!(!string.is_known_utf8); string.push(c(0x20)); // not surrogate string.push(c(0xDCA9)); // trail assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); let mut string = Wtf8Buf::new(); string.push(c(0xD800)); // lead + assert!(!string.is_known_utf8); string.push(c(0xDBFF)); // lead assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); let mut string = Wtf8Buf::new(); string.push(c(0xD800)); // lead + assert!(!string.is_known_utf8); string.push(c(0xE000)); // not surrogate assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); let mut string = Wtf8Buf::new(); string.push(c(0xD7FF)); // not surrogate + assert!(string.is_known_utf8); string.push(c(0xDC00)); // trail + assert!(!string.is_known_utf8); assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); let mut string = Wtf8Buf::new(); string.push(c(0x61)); // not surrogate, < 3 bytes + assert!(string.is_known_utf8); string.push(c(0xDC00)); // trail + assert!(!string.is_known_utf8); assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); let mut string = Wtf8Buf::new(); string.push(c(0xDC00)); // trail + assert!(!string.is_known_utf8); assert_eq!(string.bytes, b"\xED\xB0\x80"); } @@ -146,6 +216,7 @@ fn wtf8buf_push_wtf8() { assert_eq!(string.bytes, b"a\xC3\xA9"); string.push_wtf8(Wtf8::from_str(" šŸ’©")); assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); fn w(v: &[u8]) -> &Wtf8 { unsafe { Wtf8::from_bytes_unchecked(v) } @@ -161,37 +232,68 @@ fn wtf8buf_push_wtf8() { string.push_wtf8(w(b" ")); // not surrogate string.push_wtf8(w(b"\xED\xB2\xA9")); // trail assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); + assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xA0\x80")); // lead string.push_wtf8(w(b"\xED\xAF\xBF")); // lead assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xA0\x80")); // lead string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); + assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate string.push_wtf8(w(b"\xED\xB0\x80")); // trail assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes string.push_wtf8(w(b"\xED\xB0\x80")); // trail assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); + assert!(!string.is_known_utf8); let mut string = Wtf8Buf::new(); string.push_wtf8(w(b"\xED\xB0\x80")); // trail assert_eq!(string.bytes, b"\xED\xB0\x80"); + assert!(!string.is_known_utf8); } #[test] fn wtf8buf_truncate() { let mut string = Wtf8Buf::from_str("aĆ©"); + assert!(string.is_known_utf8); + + string.truncate(3); + assert_eq!(string.bytes, b"a\xC3\xA9"); + assert!(string.is_known_utf8); + string.truncate(1); assert_eq!(string.bytes, b"a"); + assert!(string.is_known_utf8); + + string.truncate(0); + assert_eq!(string.bytes, b""); + assert!(string.is_known_utf8); +} + +#[test] +fn wtf8buf_truncate_around_non_bmp() { + let mut string = Wtf8Buf::from_str("šŸ’©"); + assert!(string.is_known_utf8); + + string.truncate(4); + assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); + + string.truncate(0); + assert_eq!(string.bytes, b""); + assert!(string.is_known_utf8); } #[test] @@ -208,11 +310,37 @@ fn wtf8buf_truncate_fail_longer() { string.truncate(4); } +#[test] +#[should_panic] +fn wtf8buf_truncate_splitting_non_bmp3() { + let mut string = Wtf8Buf::from_str("šŸ’©"); + assert!(string.is_known_utf8); + string.truncate(3); +} + +#[test] +#[should_panic] +fn wtf8buf_truncate_splitting_non_bmp2() { + let mut string = Wtf8Buf::from_str("šŸ’©"); + assert!(string.is_known_utf8); + string.truncate(2); +} + +#[test] +#[should_panic] +fn wtf8buf_truncate_splitting_non_bmp1() { + let mut string = Wtf8Buf::from_str("šŸ’©"); + assert!(string.is_known_utf8); + string.truncate(1); +} + #[test] fn wtf8buf_into_string() { let mut string = Wtf8Buf::from_str("aĆ© šŸ’©"); + assert!(string.is_known_utf8); assert_eq!(string.clone().into_string(), Ok(String::from("aĆ© šŸ’©"))); string.push(CodePoint::from_u32(0xD800).unwrap()); + assert!(!string.is_known_utf8); assert_eq!(string.clone().into_string(), Err(string)); } @@ -229,15 +357,33 @@ fn wtf8buf_from_iterator() { fn f(values: &[u32]) -> Wtf8Buf { values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::() } - assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!( + f(&[0x61, 0xE9, 0x20, 0x1F4A9]), + Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true } + ); assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! - assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); - assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); - assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); - assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); - assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80"); - assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80"); + assert_eq!( + f(&[0xD83D, 0x20, 0xDCA9]), + Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0xD800, 0xDBFF]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xED\xAF\xBF".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0xD800, 0xE000]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xEE\x80\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0xD7FF, 0xDC00]), + Wtf8Buf { bytes: b"\xED\x9F\xBF\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0x61, 0xDC00]), + Wtf8Buf { bytes: b"\x61\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!(f(&[0xDC00]), Wtf8Buf { bytes: b"\xED\xB0\x80".to_vec(), is_known_utf8: false }); } #[test] @@ -251,15 +397,36 @@ fn wtf8buf_extend() { string } - assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert_eq!( + e(&[0x61, 0xE9], &[0x20, 0x1F4A9]), + Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true } + ); assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! - assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); - assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); - assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); - assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); - assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80"); - assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80"); + assert_eq!( + e(&[0xD83D, 0x20], &[0xDCA9]), + Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0xD800], &[0xDBFF]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xED\xAF\xBF".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0xD800], &[0xE000]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xEE\x80\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0xD7FF], &[0xDC00]), + Wtf8Buf { bytes: b"\xED\x9F\xBF\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0x61], &[0xDC00]), + Wtf8Buf { bytes: b"\x61\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[], &[0xDC00]), + Wtf8Buf { bytes: b"\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); } #[test] @@ -407,3 +574,101 @@ fn wtf8_encode_wide_size_hint() { assert_eq!((0, Some(0)), iter.size_hint()); assert!(iter.next().is_none()); } + +#[test] +fn wtf8_clone_into() { + let mut string = Wtf8Buf::new(); + Wtf8::from_str("green").clone_into(&mut string); + assert_eq!(string.bytes, b"green"); + + let mut string = Wtf8Buf::from_str("green"); + Wtf8::from_str("").clone_into(&mut string); + assert_eq!(string.bytes, b""); + + let mut string = Wtf8Buf::from_str("red"); + Wtf8::from_str("green").clone_into(&mut string); + assert_eq!(string.bytes, b"green"); + + let mut string = Wtf8Buf::from_str("green"); + Wtf8::from_str("red").clone_into(&mut string); + assert_eq!(string.bytes, b"red"); + + let mut string = Wtf8Buf::from_str("green"); + assert!(string.is_known_utf8); + unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").clone_into(&mut string) }; + assert_eq!(string.bytes, b"\xED\xA0\x80"); + assert!(!string.is_known_utf8); +} + +#[test] +fn wtf8_to_ascii_lowercase() { + let lowercase = Wtf8::from_str("").to_ascii_lowercase(); + assert_eq!(lowercase.bytes, b""); + assert!(lowercase.is_known_utf8); + + let lowercase = Wtf8::from_str("GrEeN gRaPeS! šŸ‡").to_ascii_lowercase(); + assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87"); + assert!(lowercase.is_known_utf8); + + let lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_lowercase() }; + assert_eq!(lowercase.bytes, b"\xED\xA0\x80"); + assert!(!lowercase.is_known_utf8); +} + +#[test] +fn wtf8_to_ascii_uppercase() { + let uppercase = Wtf8::from_str("").to_ascii_uppercase(); + assert_eq!(uppercase.bytes, b""); + assert!(uppercase.is_known_utf8); + + let uppercase = Wtf8::from_str("GrEeN gRaPeS! šŸ‡").to_ascii_uppercase(); + assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); + assert!(uppercase.is_known_utf8); + + let uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_uppercase() }; + assert_eq!(uppercase.bytes, b"\xED\xA0\x80"); + assert!(!uppercase.is_known_utf8); +} + +#[test] +fn wtf8_make_ascii_lowercase() { + let mut lowercase = Wtf8Buf::from_str(""); + lowercase.make_ascii_lowercase(); + assert_eq!(lowercase.bytes, b""); + assert!(lowercase.is_known_utf8); + + let mut lowercase = Wtf8Buf::from_str("GrEeN gRaPeS! šŸ‡"); + lowercase.make_ascii_lowercase(); + assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87"); + assert!(lowercase.is_known_utf8); + + let mut lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + lowercase.make_ascii_lowercase(); + assert_eq!(lowercase.bytes, b"\xED\xA0\x80"); + assert!(!lowercase.is_known_utf8); +} + +#[test] +fn wtf8_make_ascii_uppercase() { + let mut uppercase = Wtf8Buf::from_str(""); + uppercase.make_ascii_uppercase(); + assert_eq!(uppercase.bytes, b""); + assert!(uppercase.is_known_utf8); + + let mut uppercase = Wtf8Buf::from_str("GrEeN gRaPeS! šŸ‡"); + uppercase.make_ascii_uppercase(); + assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); + assert!(uppercase.is_known_utf8); + + let mut uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + uppercase.make_ascii_uppercase(); + assert_eq!(uppercase.bytes, b"\xED\xA0\x80"); + assert!(!uppercase.is_known_utf8); +} + +#[test] +fn wtf8_to_owned() { + let string = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + assert_eq!(string.bytes, b"\xED\xA0\x80"); + assert!(!string.is_known_utf8); +} From d3a585e593f441405fd30d082b8eb70e0b4c0d62 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Mon, 9 May 2022 08:55:06 -0700 Subject: [PATCH 2/5] Panic safety. --- library/std/src/sys_common/wtf8.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index a674ced88bbb3..718308d1e2a40 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -220,11 +220,11 @@ impl Wtf8Buf { let surrogate = surrogate.unpaired_surrogate(); // Surrogates are known to be in the code point range. let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; + // The string will now contain an unpaired surrogate. + string.is_known_utf8 = false; // Skip the WTF-8 concatenation check, // surrogate pairs are already decoded by decode_utf16 string.push_code_point_unchecked(code_point); - // The string now contains an unpaired surrogate. - string.is_known_utf8 = false; } } } @@ -346,13 +346,13 @@ impl Wtf8Buf { self.bytes.extend_from_slice(other_without_trail_surrogate); } _ => { - self.bytes.extend_from_slice(&other.bytes); - - // If we're pushing a string containing a surrogate, we may no - // longer have UTF-8. + // If we'll be pushing a string containing a surrogate, we may + // no longer have UTF-8. if other.next_surrogate(0).is_some() { self.is_known_utf8 = false; } + + self.bytes.extend_from_slice(&other.bytes); } } } @@ -721,8 +721,8 @@ impl Wtf8 { } pub fn clone_into(&self, buf: &mut Wtf8Buf) { - self.bytes.clone_into(&mut buf.bytes); buf.is_known_utf8 = false; + self.bytes.clone_into(&mut buf.bytes); } /// Boxes this `Wtf8`. From a7d57c61c2422b7942575936faf9a62d9e5e1599 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Mon, 9 May 2022 09:06:27 -0700 Subject: [PATCH 3/5] Don't eagerly scan for `is_known_utf8` in `to_ascii_lowercase`/`uppercase`. --- library/std/src/sys_common/wtf8.rs | 10 ++-------- library/std/src/sys_common/wtf8/tests.rs | 4 ---- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs index 718308d1e2a40..86d173d8fcde0 100644 --- a/library/std/src/sys_common/wtf8.rs +++ b/library/std/src/sys_common/wtf8.rs @@ -762,18 +762,12 @@ impl Wtf8 { #[inline] pub fn to_ascii_lowercase(&self) -> Wtf8Buf { - Wtf8Buf { - bytes: self.bytes.to_ascii_lowercase(), - is_known_utf8: self.next_surrogate(0).is_none(), - } + Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false } } #[inline] pub fn to_ascii_uppercase(&self) -> Wtf8Buf { - Wtf8Buf { - bytes: self.bytes.to_ascii_uppercase(), - is_known_utf8: self.next_surrogate(0).is_none(), - } + Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false } } #[inline] diff --git a/library/std/src/sys_common/wtf8/tests.rs b/library/std/src/sys_common/wtf8/tests.rs index 8360e3da215aa..ba2bdc76e3b44 100644 --- a/library/std/src/sys_common/wtf8/tests.rs +++ b/library/std/src/sys_common/wtf8/tests.rs @@ -635,12 +635,10 @@ fn wtf8_make_ascii_lowercase() { let mut lowercase = Wtf8Buf::from_str(""); lowercase.make_ascii_lowercase(); assert_eq!(lowercase.bytes, b""); - assert!(lowercase.is_known_utf8); let mut lowercase = Wtf8Buf::from_str("GrEeN gRaPeS! šŸ‡"); lowercase.make_ascii_lowercase(); assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87"); - assert!(lowercase.is_known_utf8); let mut lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; lowercase.make_ascii_lowercase(); @@ -653,12 +651,10 @@ fn wtf8_make_ascii_uppercase() { let mut uppercase = Wtf8Buf::from_str(""); uppercase.make_ascii_uppercase(); assert_eq!(uppercase.bytes, b""); - assert!(uppercase.is_known_utf8); let mut uppercase = Wtf8Buf::from_str("GrEeN gRaPeS! šŸ‡"); uppercase.make_ascii_uppercase(); assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); - assert!(uppercase.is_known_utf8); let mut uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; uppercase.make_ascii_uppercase(); From 516a67aad9a2d962917724138654e8f14a0fac4c Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Mon, 9 May 2022 09:32:09 -0700 Subject: [PATCH 4/5] Remove `is_known_utf8` checks from more tests where it's no longer set. --- library/std/src/sys_common/wtf8/tests.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/library/std/src/sys_common/wtf8/tests.rs b/library/std/src/sys_common/wtf8/tests.rs index ba2bdc76e3b44..1a302d646941b 100644 --- a/library/std/src/sys_common/wtf8/tests.rs +++ b/library/std/src/sys_common/wtf8/tests.rs @@ -604,11 +604,9 @@ fn wtf8_clone_into() { fn wtf8_to_ascii_lowercase() { let lowercase = Wtf8::from_str("").to_ascii_lowercase(); assert_eq!(lowercase.bytes, b""); - assert!(lowercase.is_known_utf8); let lowercase = Wtf8::from_str("GrEeN gRaPeS! šŸ‡").to_ascii_lowercase(); assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87"); - assert!(lowercase.is_known_utf8); let lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_lowercase() }; assert_eq!(lowercase.bytes, b"\xED\xA0\x80"); @@ -619,11 +617,9 @@ fn wtf8_to_ascii_lowercase() { fn wtf8_to_ascii_uppercase() { let uppercase = Wtf8::from_str("").to_ascii_uppercase(); assert_eq!(uppercase.bytes, b""); - assert!(uppercase.is_known_utf8); let uppercase = Wtf8::from_str("GrEeN gRaPeS! šŸ‡").to_ascii_uppercase(); assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); - assert!(uppercase.is_known_utf8); let uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_uppercase() }; assert_eq!(uppercase.bytes, b"\xED\xA0\x80"); From ddeb936c6d797d47ec8652c885c8a2cceba8386f Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 17 May 2022 18:23:43 -0700 Subject: [PATCH 5/5] Don't test the rustdoc rendering context size on Windows. This assert is just making sure the size of `Context` doens't grow unexpectedly, and it's already not being checked on every platform. `PathBuf` now has a different size on Windows, so adjust this to avoid checking the size on Windows. --- src/librustdoc/html/render/context.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/librustdoc/html/render/context.rs b/src/librustdoc/html/render/context.rs index bfdc44c7e4534..367ffd4e883e7 100644 --- a/src/librustdoc/html/render/context.rs +++ b/src/librustdoc/html/render/context.rs @@ -71,7 +71,7 @@ pub(crate) struct Context<'tcx> { } // `Context` is cloned a lot, so we don't want the size to grow unexpectedly. -#[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))] +#[cfg(all(not(windows), target_arch = "x86_64", target_pointer_width = "64"))] rustc_data_structures::static_assert_size!(Context<'_>, 128); /// Shared mutable state used in [`Context`] and elsewhere.