Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize character escaping. #124307

Merged
merged 3 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions library/core/src/ascii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,17 +91,21 @@ pub struct EscapeDefault(escape::EscapeIterInner<4>);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn escape_default(c: u8) -> EscapeDefault {
let mut data = [Char::Null; 4];
let range = escape::escape_ascii_into(&mut data, c);
EscapeDefault(escape::EscapeIterInner::new(data, range))
EscapeDefault::new(c)
}

impl EscapeDefault {
#[inline]
pub(crate) const fn new(c: u8) -> Self {
Self(escape::EscapeIterInner::ascii(c))
}

#[inline]
pub(crate) fn empty() -> Self {
let data = [Char::Null; 4];
EscapeDefault(escape::EscapeIterInner::new(data, 0..0))
Self(escape::EscapeIterInner::empty())
}

#[inline]
pub(crate) fn as_str(&self) -> &str {
self.0.as_str()
}
Expand Down
8 changes: 4 additions & 4 deletions library/core/src/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -449,10 +449,10 @@ impl char {
'\"' if args.escape_double_quote => EscapeDebug::backslash(ascii::Char::QuotationMark),
'\'' if args.escape_single_quote => EscapeDebug::backslash(ascii::Char::Apostrophe),
_ if args.escape_grapheme_extended && self.is_grapheme_extended() => {
EscapeDebug::from_unicode(self.escape_unicode())
EscapeDebug::unicode(self)
}
_ if is_printable(self) => EscapeDebug::printable(self),
_ => EscapeDebug::from_unicode(self.escape_unicode()),
_ => EscapeDebug::unicode(self),
}
}

Expand Down Expand Up @@ -555,9 +555,9 @@ impl char {
'\t' => EscapeDefault::backslash(ascii::Char::SmallT),
'\r' => EscapeDefault::backslash(ascii::Char::SmallR),
'\n' => EscapeDefault::backslash(ascii::Char::SmallN),
'\\' | '\'' | '"' => EscapeDefault::backslash(self.as_ascii().unwrap()),
'\\' | '\'' | '\"' => EscapeDefault::backslash(self.as_ascii().unwrap()),
'\x20'..='\x7e' => EscapeDefault::printable(self.as_ascii().unwrap()),
_ => EscapeDefault::from_unicode(self.escape_unicode()),
_ => EscapeDefault::unicode(self),
}
}

Expand Down
44 changes: 23 additions & 21 deletions library/core/src/char/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,9 @@ pub const fn from_digit(num: u32, radix: u32) -> Option<char> {
pub struct EscapeUnicode(escape::EscapeIterInner<10>);

impl EscapeUnicode {
fn new(chr: char) -> Self {
let mut data = [ascii::Char::Null; 10];
let range = escape::escape_unicode_into(&mut data, chr);
Self(escape::EscapeIterInner::new(data, range))
#[inline]
const fn new(c: char) -> Self {
Self(escape::EscapeIterInner::unicode(c))
}
}

Expand Down Expand Up @@ -219,18 +218,19 @@ impl fmt::Display for EscapeUnicode {
pub struct EscapeDefault(escape::EscapeIterInner<10>);

impl EscapeDefault {
fn printable(chr: ascii::Char) -> Self {
let data = [chr];
Self(escape::EscapeIterInner::from_array(data))
#[inline]
const fn printable(c: ascii::Char) -> Self {
Self(escape::EscapeIterInner::ascii(c.to_u8()))
}

fn backslash(chr: ascii::Char) -> Self {
let data = [ascii::Char::ReverseSolidus, chr];
Self(escape::EscapeIterInner::from_array(data))
#[inline]
const fn backslash(c: ascii::Char) -> Self {
Self(escape::EscapeIterInner::backslash(c))
}

fn from_unicode(esc: EscapeUnicode) -> Self {
Self(esc.0)
#[inline]
const fn unicode(c: char) -> Self {
Self(escape::EscapeIterInner::unicode(c))
}
}

Expand Down Expand Up @@ -304,23 +304,24 @@ enum EscapeDebugInner {
}

impl EscapeDebug {
fn printable(chr: char) -> Self {
#[inline]
const fn printable(chr: char) -> Self {
Self(EscapeDebugInner::Char(chr))
}

fn backslash(chr: ascii::Char) -> Self {
let data = [ascii::Char::ReverseSolidus, chr];
let iter = escape::EscapeIterInner::from_array(data);
Self(EscapeDebugInner::Bytes(iter))
#[inline]
const fn backslash(c: ascii::Char) -> Self {
Self(EscapeDebugInner::Bytes(escape::EscapeIterInner::backslash(c)))
}

fn from_unicode(esc: EscapeUnicode) -> Self {
Self(EscapeDebugInner::Bytes(esc.0))
#[inline]
const fn unicode(c: char) -> Self {
Self(EscapeDebugInner::Bytes(escape::EscapeIterInner::unicode(c)))
}

#[inline]
fn clear(&mut self) {
let bytes = escape::EscapeIterInner::from_array([]);
self.0 = EscapeDebugInner::Bytes(bytes);
self.0 = EscapeDebugInner::Bytes(escape::EscapeIterInner::empty());
}
}

Expand All @@ -339,6 +340,7 @@ impl Iterator for EscapeDebug {
}
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let n = self.len();
(n, Some(n))
Expand Down
139 changes: 90 additions & 49 deletions library/core/src/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,56 +6,79 @@ use crate::ops::Range;

const HEX_DIGITS: [ascii::Char; 16] = *b"0123456789abcdef".as_ascii().unwrap();

/// Escapes a byte into provided buffer; returns length of escaped
/// representation.
pub(crate) fn escape_ascii_into(output: &mut [ascii::Char; 4], byte: u8) -> Range<u8> {
#[inline]
fn backslash(a: ascii::Char) -> ([ascii::Char; 4], u8) {
([ascii::Char::ReverseSolidus, a, ascii::Char::Null, ascii::Char::Null], 2)
}
#[inline]
const fn backslash<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) {
const { assert!(N >= 2) };

let mut output = [ascii::Char::Null; N];

output[0] = ascii::Char::ReverseSolidus;
output[1] = a;

(output, 0..2)
}

let (data, len) = match byte {
/// Escapes an ASCII character.
///
/// Returns a buffer and the range within it that holds the escaped representation.
const fn escape_ascii<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) {
const { assert!(N >= 4) };

match byte {
b'\t' => backslash(ascii::Char::SmallT),
b'\r' => backslash(ascii::Char::SmallR),
b'\n' => backslash(ascii::Char::SmallN),
b'\\' => backslash(ascii::Char::ReverseSolidus),
b'\'' => backslash(ascii::Char::Apostrophe),
b'\"' => backslash(ascii::Char::QuotationMark),
_ => {
if let Some(a) = byte.as_ascii()
byte => {
let mut output = [ascii::Char::Null; N];

if let Some(c) = byte.as_ascii()
&& !byte.is_ascii_control()
{
([a, ascii::Char::Null, ascii::Char::Null, ascii::Char::Null], 1)
output[0] = c;
(output, 0..1)
} else {
let hi = HEX_DIGITS[usize::from(byte >> 4)];
let lo = HEX_DIGITS[usize::from(byte & 0xf)];
([ascii::Char::ReverseSolidus, ascii::Char::SmallX, hi, lo], 4)
let hi = HEX_DIGITS[(byte >> 4) as usize];
let lo = HEX_DIGITS[(byte & 0xf) as usize];

output[0] = ascii::Char::ReverseSolidus;
output[1] = ascii::Char::SmallX;
output[2] = hi;
output[3] = lo;

(output, 0..4)
}
}
};
*output = data;
0..len
}
}

/// Escapes a character into provided buffer using `\u{NNNN}` representation.
pub(crate) fn escape_unicode_into(output: &mut [ascii::Char; 10], ch: char) -> Range<u8> {
/// Escapes a character using its `\u{NNNN}` representation.
///
/// Returns a buffer and the range within it that holds the escaped representation.
const fn escape_unicode<const N: usize>(c: char) -> ([ascii::Char; N], Range<u8>) {
const { assert!(N >= 10 && N < u8::MAX as usize) };

let c = u32::from(c);

// OR-ing `1` ensures that for `c == 0` the code computes that
// one digit should be printed.
let start = (c | 1).leading_zeros() as usize / 4 - 2;

let mut output = [ascii::Char::Null; N];
output[3] = HEX_DIGITS[((c >> 20) & 15) as usize];
output[4] = HEX_DIGITS[((c >> 16) & 15) as usize];
output[5] = HEX_DIGITS[((c >> 12) & 15) as usize];
output[6] = HEX_DIGITS[((c >> 8) & 15) as usize];
output[7] = HEX_DIGITS[((c >> 4) & 15) as usize];
output[8] = HEX_DIGITS[((c >> 0) & 15) as usize];
output[9] = ascii::Char::RightCurlyBracket;
output[start + 0] = ascii::Char::ReverseSolidus;
output[start + 1] = ascii::Char::SmallU;
output[start + 2] = ascii::Char::LeftCurlyBracket;

let ch = ch as u32;
output[3] = HEX_DIGITS[((ch >> 20) & 15) as usize];
output[4] = HEX_DIGITS[((ch >> 16) & 15) as usize];
output[5] = HEX_DIGITS[((ch >> 12) & 15) as usize];
output[6] = HEX_DIGITS[((ch >> 8) & 15) as usize];
output[7] = HEX_DIGITS[((ch >> 4) & 15) as usize];
output[8] = HEX_DIGITS[((ch >> 0) & 15) as usize];

// or-ing 1 ensures that for ch==0 the code computes that one digit should
// be printed.
let start = (ch | 1).leading_zeros() as usize / 4 - 2;
const UNICODE_ESCAPE_PREFIX: &[ascii::Char; 3] = b"\\u{".as_ascii().unwrap();
output[start..][..3].copy_from_slice(UNICODE_ESCAPE_PREFIX);

(start as u8)..10
(output, (start as u8)..(N as u8))
}

/// An iterator over a fixed-size array.
Expand All @@ -65,45 +88,63 @@ pub(crate) fn escape_unicode_into(output: &mut [ascii::Char; 10], ch: char) -> R
#[derive(Clone, Debug)]
pub(crate) struct EscapeIterInner<const N: usize> {
// The element type ensures this is always ASCII, and thus also valid UTF-8.
pub(crate) data: [ascii::Char; N],
data: [ascii::Char; N],

// Invariant: alive.start <= alive.end <= N.
pub(crate) alive: Range<u8>,
// Invariant: `alive.start <= alive.end <= N`
alive: Range<u8>,
}

impl<const N: usize> EscapeIterInner<N> {
pub fn new(data: [ascii::Char; N], alive: Range<u8>) -> Self {
const { assert!(N < 256) };
debug_assert!(alive.start <= alive.end && usize::from(alive.end) <= N, "{alive:?}");
Self { data, alive }
pub const fn backslash(c: ascii::Char) -> Self {
let (data, range) = backslash(c);
Self { data, alive: range }
}

pub const fn ascii(c: u8) -> Self {
let (data, range) = escape_ascii(c);
Self { data, alive: range }
}

pub fn from_array<const M: usize>(array: [ascii::Char; M]) -> Self {
const { assert!(M <= N) };
pub const fn unicode(c: char) -> Self {
let (data, range) = escape_unicode(c);
Self { data, alive: range }
}

let mut data = [ascii::Char::Null; N];
data[..M].copy_from_slice(&array);
Self::new(data, 0..M as u8)
#[inline]
pub const fn empty() -> Self {
Self { data: [ascii::Char::Null; N], alive: 0..0 }
}

#[inline]
pub fn as_ascii(&self) -> &[ascii::Char] {
&self.data[usize::from(self.alive.start)..usize::from(self.alive.end)]
// SAFETY: `self.alive` is guaranteed to be a valid range for indexing `self.data`.
unsafe {
self.data.get_unchecked(usize::from(self.alive.start)..usize::from(self.alive.end))
}
}

#[inline]
pub fn as_str(&self) -> &str {
self.as_ascii().as_str()
}

#[inline]
pub fn len(&self) -> usize {
usize::from(self.alive.end - self.alive.start)
}

pub fn next(&mut self) -> Option<u8> {
self.alive.next().map(|i| self.data[usize::from(i)].to_u8())
let i = self.alive.next()?;

// SAFETY: `i` is guaranteed to be a valid index for `self.data`.
unsafe { Some(self.data.get_unchecked(usize::from(i)).to_u8()) }
}

pub fn next_back(&mut self) -> Option<u8> {
self.alive.next_back().map(|i| self.data[usize::from(i)].to_u8())
let i = self.alive.next_back()?;

// SAFETY: `i` is guaranteed to be a valid index for `self.data`.
unsafe { Some(self.data.get_unchecked(usize::from(i)).to_u8()) }
}

pub fn advance_by(&mut self, n: usize) -> Result<(), NonZero<usize>> {
Expand Down
Loading