From 28449daa2241410d4c384c5608b87fe7c85ffae6 Mon Sep 17 00:00:00 2001 From: Scott McMurray Date: Fri, 12 May 2023 19:37:02 -0700 Subject: [PATCH] `ascii::Char`-ify the escaping code This means that `EscapeIterInner::as_str` no longer needs unsafe code, because the type system ensures the internal buffer is only ASCII, and thus valid UTF-8. --- library/core/src/ascii.rs | 2 +- library/core/src/char/methods.rs | 24 ++++++------ library/core/src/char/mod.rs | 23 +++++------ library/core/src/escape.rs | 65 +++++++++++++++++++------------- library/core/src/lib.rs | 1 + 5 files changed, 65 insertions(+), 50 deletions(-) diff --git a/library/core/src/ascii.rs b/library/core/src/ascii.rs index 7fd14a7e1eae8..ef8e4d098ed95 100644 --- a/library/core/src/ascii.rs +++ b/library/core/src/ascii.rs @@ -91,7 +91,7 @@ pub struct EscapeDefault(escape::EscapeIterInner<4>); /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn escape_default(c: u8) -> EscapeDefault { - let mut data = [0; 4]; + let mut data = [Char::Null; 4]; let range = escape::escape_ascii_into(&mut data, c); EscapeDefault(escape::EscapeIterInner::new(data, range)) } diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 1dfa9c34db14c..515b8d20ead86 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -392,13 +392,13 @@ impl char { #[inline] pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug { match self { - '\0' => EscapeDebug::backslash(b'0'), - '\t' => EscapeDebug::backslash(b't'), - '\r' => EscapeDebug::backslash(b'r'), - '\n' => EscapeDebug::backslash(b'n'), - '\\' => EscapeDebug::backslash(b'\\'), - '"' if args.escape_double_quote => EscapeDebug::backslash(b'"'), - '\'' if args.escape_single_quote => EscapeDebug::backslash(b'\''), + '\0' => EscapeDebug::backslash(ascii::Char::Digit0), + '\t' => EscapeDebug::backslash(ascii::Char::SmallT), + '\r' => EscapeDebug::backslash(ascii::Char::SmallR), + '\n' => EscapeDebug::backslash(ascii::Char::SmallN), + '\\' => EscapeDebug::backslash(ascii::Char::ReverseSolidus), + '\"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark), + '\'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe), _ if args.escape_grapheme_extended && self.is_grapheme_extended() => { EscapeDebug::from_unicode(self.escape_unicode()) } @@ -503,11 +503,11 @@ impl char { #[inline] pub fn escape_default(self) -> EscapeDefault { match self { - '\t' => EscapeDefault::backslash(b't'), - '\r' => EscapeDefault::backslash(b'r'), - '\n' => EscapeDefault::backslash(b'n'), - '\\' | '\'' | '"' => EscapeDefault::backslash(self as u8), - '\x20'..='\x7e' => EscapeDefault::printable(self as u8), + '\t' => EscapeDefault::backslash(ascii::Char::SmallT), + '\r' => EscapeDefault::backslash(ascii::Char::SmallR), + '\n' => EscapeDefault::backslash(ascii::Char::SmallN), + '\\' | '\'' | '"' => EscapeDefault::backslash(self.as_ascii().unwrap()), + '\x20'..='\x7e' => EscapeDefault::printable(self.as_ascii().unwrap()), _ => EscapeDefault::from_unicode(self.escape_unicode()), } } diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index e186db7052cd0..5c42912874c66 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -38,6 +38,7 @@ pub use self::methods::encode_utf16_raw; #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] pub use self::methods::encode_utf8_raw; +use crate::ascii; use crate::error::Error; use crate::escape; use crate::fmt::{self, Write}; @@ -152,7 +153,7 @@ pub struct EscapeUnicode(escape::EscapeIterInner<10>); impl EscapeUnicode { fn new(chr: char) -> Self { - let mut data = [0; 10]; + let mut data = [ascii::Char::Null; 10]; let range = escape::escape_unicode_into(&mut data, chr); Self(escape::EscapeIterInner::new(data, range)) } @@ -218,14 +219,14 @@ impl fmt::Display for EscapeUnicode { pub struct EscapeDefault(escape::EscapeIterInner<10>); impl EscapeDefault { - fn printable(chr: u8) -> Self { - let data = [chr, 0, 0, 0, 0, 0, 0, 0, 0, 0]; - Self(escape::EscapeIterInner::new(data, 0..1)) + fn printable(chr: ascii::Char) -> Self { + let data = [chr]; + Self(escape::EscapeIterInner::from_array(data)) } - fn backslash(chr: u8) -> Self { - let data = [b'\\', chr, 0, 0, 0, 0, 0, 0, 0, 0]; - Self(escape::EscapeIterInner::new(data, 0..2)) + fn backslash(chr: ascii::Char) -> Self { + let data = [ascii::Char::ReverseSolidus, chr]; + Self(escape::EscapeIterInner::from_array(data)) } fn from_unicode(esc: EscapeUnicode) -> Self { @@ -307,9 +308,9 @@ impl EscapeDebug { Self(EscapeDebugInner::Char(chr)) } - fn backslash(chr: u8) -> Self { - let data = [b'\\', chr, 0, 0, 0, 0, 0, 0, 0, 0]; - let iter = escape::EscapeIterInner::new(data, 0..2); + fn backslash(chr: ascii::Char) -> Self { + let data = [ascii::Char::ReverseSolidus, chr]; + let iter = escape::EscapeIterInner::from_array(data); Self(EscapeDebugInner::Bytes(iter)) } @@ -318,7 +319,7 @@ impl EscapeDebug { } fn clear(&mut self) { - let bytes = escape::EscapeIterInner::new([0; 10], 0..0); + let bytes = escape::EscapeIterInner::from_array([]); self.0 = EscapeDebugInner::Bytes(bytes); } } diff --git a/library/core/src/escape.rs b/library/core/src/escape.rs index 20ac3cf027f87..3d471419bb8f1 100644 --- a/library/core/src/escape.rs +++ b/library/core/src/escape.rs @@ -1,34 +1,41 @@ //! Helper code for character escaping. +use crate::ascii; use crate::num::NonZeroUsize; use crate::ops::Range; -const HEX_DIGITS: [u8; 16] = *b"0123456789abcdef"; +const HEX_DIGITS: [ascii::Char; 16] = *b"0123456789abcdef".as_ascii().unwrap(); /// Escapes a byte into provided buffer; returns length of escaped /// representation. -pub(crate) fn escape_ascii_into(output: &mut [u8; 4], byte: u8) -> Range { +pub(crate) fn escape_ascii_into(output: &mut [ascii::Char; 4], byte: u8) -> Range { + #[inline] + fn backslash(a: ascii::Char) -> ([ascii::Char; 4], u8) { + ([ascii::Char::ReverseSolidus, a, ascii::Char::Null, ascii::Char::Null], 2) + } + let (data, len) = match byte { - b'\t' => ([b'\\', b't', 0, 0], 2), - b'\r' => ([b'\\', b'r', 0, 0], 2), - b'\n' => ([b'\\', b'n', 0, 0], 2), - b'\\' => ([b'\\', b'\\', 0, 0], 2), - b'\'' => ([b'\\', b'\'', 0, 0], 2), - b'"' => ([b'\\', b'"', 0, 0], 2), - b'\x20'..=b'\x7e' => ([byte, 0, 0, 0], 1), - _ => { + b'\t' => backslash(ascii::Char::SmallT), + b'\r' => backslash(ascii::Char::SmallR), + b'\n' => backslash(ascii::Char::SmallN), + b'\\' => backslash(ascii::Char::ReverseSolidus), + b'\'' => backslash(ascii::Char::Apostrophe), + b'\"' => backslash(ascii::Char::QuotationMark), + _ => if let Some(a) = byte.as_ascii() && !byte.is_ascii_control() { + ([a, ascii::Char::Null, ascii::Char::Null, ascii::Char::Null], 1) + } else { let hi = HEX_DIGITS[usize::from(byte >> 4)]; let lo = HEX_DIGITS[usize::from(byte & 0xf)]; - ([b'\\', b'x', hi, lo], 4) + ([ascii::Char::ReverseSolidus, ascii::Char::SmallX, hi, lo], 4) } }; *output = data; - 0..(len as u8) + 0..len } /// Escapes a character into provided buffer using `\u{NNNN}` representation. -pub(crate) fn escape_unicode_into(output: &mut [u8; 10], ch: char) -> Range { - output[9] = b'}'; +pub(crate) fn escape_unicode_into(output: &mut [ascii::Char; 10], ch: char) -> Range { + output[9] = ascii::Char::RightCurlyBracket; let ch = ch as u32; output[3] = HEX_DIGITS[((ch >> 20) & 15) as usize]; @@ -41,7 +48,8 @@ pub(crate) fn escape_unicode_into(output: &mut [u8; 10], ch: char) -> Range // or-ing 1 ensures that for ch==0 the code computes that one digit should // be printed. let start = (ch | 1).leading_zeros() as usize / 4 - 2; - output[start..start + 3].copy_from_slice(b"\\u{"); + const UNICODE_ESCAPE_PREFIX: &[ascii::Char; 3] = b"\\u{".as_ascii().unwrap(); + output[start..][..3].copy_from_slice(UNICODE_ESCAPE_PREFIX); (start as u8)..10 } @@ -52,29 +60,34 @@ pub(crate) fn escape_unicode_into(output: &mut [u8; 10], ch: char) -> Range /// limited to u8 to reduce size of the structure. #[derive(Clone, Debug)] pub(crate) struct EscapeIterInner { - // Invariant: data[alive] is all ASCII. - pub(crate) data: [u8; N], + // The element type ensures this is always ASCII, and thus also valid UTF-8. + pub(crate) data: [ascii::Char; N], // Invariant: alive.start <= alive.end <= N. pub(crate) alive: Range, } impl EscapeIterInner { - pub fn new(data: [u8; N], alive: Range) -> Self { + pub fn new(data: [ascii::Char; N], alive: Range) -> Self { const { assert!(N < 256) }; debug_assert!(alive.start <= alive.end && usize::from(alive.end) <= N, "{alive:?}"); - let this = Self { data, alive }; - debug_assert!(this.as_bytes().is_ascii(), "Expected ASCII, got {:?}", this.as_bytes()); - this + Self { data, alive } + } + + pub fn from_array(array: [ascii::Char; M]) -> Self { + const { assert!(M <= N) }; + + let mut data = [ascii::Char::Null; N]; + data[..M].copy_from_slice(&array); + Self::new(data, 0..M as u8) } - fn as_bytes(&self) -> &[u8] { + pub fn as_ascii(&self) -> &[ascii::Char] { &self.data[usize::from(self.alive.start)..usize::from(self.alive.end)] } pub fn as_str(&self) -> &str { - // SAFETY: self.data[self.alive] is all ASCII characters. - unsafe { crate::str::from_utf8_unchecked(self.as_bytes()) } + self.as_ascii().as_str() } pub fn len(&self) -> usize { @@ -82,11 +95,11 @@ impl EscapeIterInner { } pub fn next(&mut self) -> Option { - self.alive.next().map(|i| self.data[usize::from(i)]) + self.alive.next().map(|i| self.data[usize::from(i)].as_u8()) } pub fn next_back(&mut self) -> Option { - self.alive.next_back().map(|i| self.data[usize::from(i)]) + self.alive.next_back().map(|i| self.data[usize::from(i)].as_u8()) } pub fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 26c51e8403522..aac172988f3b9 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -215,6 +215,7 @@ #![feature(intra_doc_pointers)] #![feature(intrinsics)] #![feature(lang_items)] +#![feature(let_chains)] #![feature(link_llvm_intrinsics)] #![feature(macro_metavar_expr)] #![feature(min_specialization)]