From 8047f8fb519bb5d9bb3c29652bb77b30369de7b5 Mon Sep 17 00:00:00 2001 From: CAD97 Date: Tue, 12 Apr 2022 02:43:35 -0500 Subject: [PATCH 1/4] Add feature(str_from_utf16_endian) --- library/alloc/src/string.rs | 150 ++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index ed43244ebda19..e29f5fb70ab57 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -714,6 +714,156 @@ impl String { .collect() } + /// Decode a UTF-16LE–encoded slice `v` into a `String`, returning [`Err`] + /// if `v` contains any invalid data. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(str_from_utf16_endian)] + /// // 𝄞music + /// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00, + /// 0x73, 0x00, 0x69, 0x00, 0x63, 0x00]; + /// assert_eq!(String::from("𝄞music"), + /// String::from_utf16le(v).unwrap()); + /// + /// // 𝄞mu<invalid>ic + /// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00, + /// 0x00, 0xD8, 0x69, 0x00, 0x63, 0x00]; + /// assert!(String::from_utf16le(v).is_err()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + pub fn from_utf16le(v: &[u8]) -> Result { + if v.len() % 2 != 0 { + return Err(FromUtf16Error(())); + } + match (cfg!(target_endian = "little"), unsafe { v.align_to::() }) { + (true, (&[], v, &[])) => Self::from_utf16(v), + _ => decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes)) + .collect::>() + .map_err(|_| FromUtf16Error(())), + } + } + + /// Decode a UTF-16LE–encoded slice `v` into a `String`, replacing + /// invalid data with [the replacement character (`U+FFFD`)][U+FFFD]. + /// + /// Unlike [`from_utf8_lossy`] which returns a [`Cow<'a, str>`], + /// `from_utf16le_lossy` returns a `String` since the UTF-16 to UTF-8 + /// conversion requires a memory allocation. 
+ /// + /// [`from_utf8_lossy`]: String::from_utf8_lossy + /// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow" + /// [U+FFFD]: core::char::REPLACEMENT_CHARACTER + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(str_from_utf16_endian)] + /// // 𝄞mus<invalid>ic<invalid> + /// let v = &[0x34, 0xD8, 0x1E, 0xDD, 0x6d, 0x00, 0x75, 0x00, + /// 0x73, 0x00, 0x1E, 0xDD, 0x69, 0x00, 0x63, 0x00, + /// 0x34, 0xD8]; + /// + /// assert_eq!(String::from("𝄞mus\u{FFFD}ic\u{FFFD}"), + /// String::from_utf16le_lossy(v)); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + pub fn from_utf16le_lossy(v: &[u8]) -> String { + match (cfg!(target_endian = "little"), unsafe { v.align_to::() }) { + (true, (&[], v, &[])) => Self::from_utf16_lossy(v), + (true, (&[], v, &[_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", + _ => { + let mut iter = v.array_chunks::<2>(); + let string = decode_utf16(iter.by_ref().copied().map(u16::from_le_bytes)) + .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) + .collect(); + if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" } + } + } + } + + /// Decode a UTF-16BE–encoded slice `v` into a `String`, returning [`Err`] + /// if `v` contains any invalid data. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(str_from_utf16_endian)] + /// // 𝄞music + /// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75, + /// 0x00, 0x73, 0x00, 0x69, 0x00, 0x63]; + /// assert_eq!(String::from("𝄞music"), + /// String::from_utf16be(v).unwrap()); + /// + /// // 𝄞mu<invalid>ic + /// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75, + /// 0xD8, 0x00, 0x00, 0x69, 0x00, 0x63]; + /// assert!(String::from_utf16be(v).is_err()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + pub fn from_utf16be(v: &[u8]) -> Result { + if v.len() % 2 != 0 { + return Err(FromUtf16Error(())); + } + match (cfg!(target_endian = "big"), unsafe { v.align_to::() }) { + (true, (&[], v, &[])) => Self::from_utf16(v), + _ => decode_utf16(v.array_chunks::<2>().copied().map(u16::from_be_bytes)) + .collect::>() + .map_err(|_| FromUtf16Error(())), + } + } + + /// Decode a UTF-16BE–encoded slice `v` into a `String`, replacing + /// invalid data with [the replacement character (`U+FFFD`)][U+FFFD]. + /// + /// Unlike [`from_utf8_lossy`] which returns a [`Cow<'a, str>`], + /// `from_utf16be_lossy` returns a `String` since the UTF-16 to UTF-8 + /// conversion requires a memory allocation. 
+ /// + /// [`from_utf8_lossy`]: String::from_utf8_lossy + /// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow" + /// [U+FFFD]: core::char::REPLACEMENT_CHARACTER + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// #![feature(str_from_utf16_endian)] + /// // 𝄞mus<invalid>ic<invalid> + /// let v = &[0xD8, 0x34, 0xDD, 0x1E, 0x00, 0x6d, 0x00, 0x75, + /// 0x00, 0x73, 0xDD, 0x1E, 0x00, 0x69, 0x00, 0x63, + /// 0xD8, 0x34]; + /// + /// assert_eq!(String::from("𝄞mus\u{FFFD}ic\u{FFFD}"), + /// String::from_utf16be_lossy(v)); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + pub fn from_utf16be_lossy(v: &[u8]) -> String { + match (cfg!(target_endian = "big"), unsafe { v.align_to::() }) { + (true, (&[], v, &[])) => Self::from_utf16_lossy(v), + (true, (&[], v, &[_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", + _ => { + let mut iter = v.array_chunks::<2>(); + let string = decode_utf16(iter.by_ref().copied().map(u16::from_be_bytes)) + .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) + .collect(); + if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" } + } + } + } + /// Decomposes a `String` into its raw components. 
/// /// Returns the raw pointer to the underlying data, the length of From 3d448bd067234283c32792bf2a96d63d65fb5e91 Mon Sep 17 00:00:00 2001 From: Christopher Durham Date: Thu, 28 Sep 2023 23:18:55 -0400 Subject: [PATCH 2/4] style nits Co-authored-by: David Tolnay --- library/alloc/src/string.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index e29f5fb70ab57..9a7d8213f4279 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -741,7 +741,7 @@ impl String { return Err(FromUtf16Error(())); } match (cfg!(target_endian = "little"), unsafe { v.align_to::() }) { - (true, (&[], v, &[])) => Self::from_utf16(v), + (true, ([], v, [])) => Self::from_utf16(v), _ => decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes)) .collect::>() .map_err(|_| FromUtf16Error(())), @@ -777,8 +777,8 @@ impl String { #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] pub fn from_utf16le_lossy(v: &[u8]) -> String { match (cfg!(target_endian = "little"), unsafe { v.align_to::() }) { - (true, (&[], v, &[])) => Self::from_utf16_lossy(v), - (true, (&[], v, &[_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", + (true, ([], v, [])) => Self::from_utf16_lossy(v), + (true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", _ => { let mut iter = v.array_chunks::<2>(); let string = decode_utf16(iter.by_ref().copied().map(u16::from_le_bytes)) @@ -816,7 +816,7 @@ impl String { return Err(FromUtf16Error(())); } match (cfg!(target_endian = "big"), unsafe { v.align_to::() }) { - (true, (&[], v, &[])) => Self::from_utf16(v), + (true, ([], v, [])) => Self::from_utf16(v), _ => decode_utf16(v.array_chunks::<2>().copied().map(u16::from_be_bytes)) .collect::>() .map_err(|_| FromUtf16Error(())), @@ -852,8 +852,8 @@ impl String { #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] pub fn 
from_utf16be_lossy(v: &[u8]) -> String { match (cfg!(target_endian = "big"), unsafe { v.align_to::() }) { - (true, (&[], v, &[])) => Self::from_utf16_lossy(v), - (true, (&[], v, &[_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", + (true, ([], v, [])) => Self::from_utf16_lossy(v), + (true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", _ => { let mut iter = v.array_chunks::<2>(); let string = decode_utf16(iter.by_ref().copied().map(u16::from_be_bytes)) From 1efea3138554e37874c098e497f59e8f955d2d3a Mon Sep 17 00:00:00 2001 From: Christopher Durham Date: Thu, 28 Sep 2023 23:44:39 -0400 Subject: [PATCH 3/4] add str_from_utf16_endian tracking issue --- library/alloc/src/string.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 9a7d8213f4279..4e4c358200e35 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -735,7 +735,7 @@ impl String { /// assert!(String::from_utf16le(v).is_err()); /// ``` #[cfg(not(no_global_oom_handling))] - #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + #[unstable(feature = "str_from_utf16_endian", issue = "116258")] pub fn from_utf16le(v: &[u8]) -> Result { if v.len() % 2 != 0 { return Err(FromUtf16Error(())); @@ -774,7 +774,7 @@ impl String { /// String::from_utf16le_lossy(v)); /// ``` #[cfg(not(no_global_oom_handling))] - #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + #[unstable(feature = "str_from_utf16_endian", issue = "116258")] pub fn from_utf16le_lossy(v: &[u8]) -> String { match (cfg!(target_endian = "little"), unsafe { v.align_to::() }) { (true, ([], v, [])) => Self::from_utf16_lossy(v), @@ -810,7 +810,7 @@ impl String { /// assert!(String::from_utf16be(v).is_err()); /// ``` #[cfg(not(no_global_oom_handling))] - #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + 
#[unstable(feature = "str_from_utf16_endian", issue = "116258")] pub fn from_utf16be(v: &[u8]) -> Result { if v.len() % 2 != 0 { return Err(FromUtf16Error(())); @@ -849,7 +849,7 @@ impl String { /// String::from_utf16be_lossy(v)); /// ``` #[cfg(not(no_global_oom_handling))] - #[unstable(feature = "str_from_utf16_endian", issue = "none", reason = "recently added")] + #[unstable(feature = "str_from_utf16_endian", issue = "116258")] pub fn from_utf16be_lossy(v: &[u8]) -> String { match (cfg!(target_endian = "big"), unsafe { v.align_to::() }) { (true, ([], v, [])) => Self::from_utf16_lossy(v), From 5facc32e22e8843a8c276305fff4ec84d718e1c0 Mon Sep 17 00:00:00 2001 From: Christopher Durham Date: Fri, 29 Sep 2023 00:04:57 -0400 Subject: [PATCH 4/4] fix char imports --- library/alloc/src/string.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 4e4c358200e35..844441ba3a764 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -742,7 +742,7 @@ impl String { } match (cfg!(target_endian = "little"), unsafe { v.align_to::() }) { (true, ([], v, [])) => Self::from_utf16(v), - _ => decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes)) + _ => char::decode_utf16(v.array_chunks::<2>().copied().map(u16::from_le_bytes)) .collect::>() .map_err(|_| FromUtf16Error(())), } @@ -781,8 +781,8 @@ impl String { (true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", _ => { let mut iter = v.array_chunks::<2>(); - let string = decode_utf16(iter.by_ref().copied().map(u16::from_le_bytes)) - .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) + let string = char::decode_utf16(iter.by_ref().copied().map(u16::from_le_bytes)) + .map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER)) .collect(); if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" } } @@ -817,7 +817,7 @@ impl String { } match (cfg!(target_endian = "big"), unsafe { v.align_to::() }) { 
(true, ([], v, [])) => Self::from_utf16(v), - _ => decode_utf16(v.array_chunks::<2>().copied().map(u16::from_be_bytes)) + _ => char::decode_utf16(v.array_chunks::<2>().copied().map(u16::from_be_bytes)) .collect::>() .map_err(|_| FromUtf16Error(())), } @@ -856,8 +856,8 @@ impl String { (true, ([], v, [_remainder])) => Self::from_utf16_lossy(v) + "\u{FFFD}", _ => { let mut iter = v.array_chunks::<2>(); - let string = decode_utf16(iter.by_ref().copied().map(u16::from_be_bytes)) - .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) + let string = char::decode_utf16(iter.by_ref().copied().map(u16::from_be_bytes)) + .map(|r| r.unwrap_or(char::REPLACEMENT_CHARACTER)) .collect(); if iter.remainder().is_empty() { string } else { string + "\u{FFFD}" } }