From 8c781b0906209e81f3540d1495becddae9894a25 Mon Sep 17 00:00:00 2001 From: Scott McMurray Date: Sat, 29 Apr 2023 14:45:36 -0700 Subject: [PATCH] Add the basic `ascii::Char` type --- library/alloc/src/lib.rs | 1 + library/alloc/src/string.rs | 9 + library/core/src/array/ascii.rs | 34 ++ library/core/src/array/mod.rs | 1 + library/core/src/ascii.rs | 4 + library/core/src/ascii/ascii_char.rs | 565 +++++++++++++++++++++++++++ library/core/src/char/methods.rs | 19 + library/core/src/num/mod.rs | 11 +- library/core/src/slice/ascii.rs | 30 ++ library/core/src/str/mod.rs | 11 + library/std/src/ascii.rs | 3 + tests/codegen/ascii-char.rs | 37 ++ 12 files changed, 724 insertions(+), 1 deletion(-) create mode 100644 library/core/src/array/ascii.rs create mode 100644 library/core/src/ascii/ascii_char.rs create mode 100644 tests/codegen/ascii-char.rs diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index a002421aeef3a..18f25aec5feee 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -101,6 +101,7 @@ #![feature(array_into_iter_constructors)] #![feature(array_methods)] #![feature(array_windows)] +#![feature(ascii_char)] #![feature(assert_matches)] #![feature(async_iterator)] #![feature(coerce_unsized)] diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index cf16a3424a092..b9ef76c109abf 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -2526,6 +2526,15 @@ impl ToString for T { } } +#[cfg(not(no_global_oom_handling))] +#[unstable(feature = "ascii_char", issue = "110998")] +impl ToString for core::ascii::Char { + #[inline] + fn to_string(&self) -> String { + self.as_str().to_owned() + } +} + #[cfg(not(no_global_oom_handling))] #[stable(feature = "char_to_string_specialization", since = "1.46.0")] impl ToString for char { diff --git a/library/core/src/array/ascii.rs b/library/core/src/array/ascii.rs new file mode 100644 index 0000000000000..6750d7c071144 --- /dev/null +++ b/library/core/src/array/ascii.rs @@ -0,0 +1,34 @@ +use crate::ascii; + +#[cfg(not(test))] +impl [u8; N] { + /// Converts this array of bytes into a array of ASCII characters, + /// or returns `None` if any of the characters is non-ASCII. + #[unstable(feature = "ascii_char", issue = "110998")] + #[must_use] + #[inline] + pub fn as_ascii(&self) -> Option<&[ascii::Char; N]> { + if self.is_ascii() { + // SAFETY: Just checked that it's ASCII + Some(unsafe { self.as_ascii_unchecked() }) + } else { + None + } + } + + /// Converts this array of bytes into a array of ASCII characters, + /// without checking whether they're valid. + /// + /// # Safety + /// + /// Every byte in the array must be in `0..=127`, or else this is UB. + #[unstable(feature = "ascii_char", issue = "110998")] + #[must_use] + #[inline] + pub const unsafe fn as_ascii_unchecked(&self) -> &[ascii::Char; N] { + let byte_ptr: *const [u8; N] = self; + let ascii_ptr = byte_ptr as *const [ascii::Char; N]; + // SAFETY: The caller promised all the bytes are ASCII + unsafe { &*ascii_ptr } + } +} diff --git a/library/core/src/array/mod.rs b/library/core/src/array/mod.rs index 940558974e69b..bdb4c975909e0 100644 --- a/library/core/src/array/mod.rs +++ b/library/core/src/array/mod.rs @@ -17,6 +17,7 @@ use crate::ops::{ }; use crate::slice::{Iter, IterMut}; +mod ascii; mod drain; mod equality; mod iter; diff --git a/library/core/src/ascii.rs b/library/core/src/ascii.rs index 8a4cb78cc7f92..6034c8afce915 100644 --- a/library/core/src/ascii.rs +++ b/library/core/src/ascii.rs @@ -14,6 +14,10 @@ use crate::iter::FusedIterator; use crate::ops::Range; use crate::str::from_utf8_unchecked; +mod ascii_char; +#[unstable(feature = "ascii_char", issue = "110998")] +pub use ascii_char::AsciiChar as Char; + /// An iterator over the escaped version of a byte. /// /// This `struct` is created by the [`escape_default`] function. See its diff --git a/library/core/src/ascii/ascii_char.rs b/library/core/src/ascii/ascii_char.rs new file mode 100644 index 0000000000000..f093a0990d1a9 --- /dev/null +++ b/library/core/src/ascii/ascii_char.rs @@ -0,0 +1,565 @@ +//! This uses the name `AsciiChar`, even though it's not exposed that way right now, +//! because it avoids a whole bunch of "are you sure you didn't mean `char`?" +//! suggestions from rustc if you get anything slightly wrong in here, and overall +//! helps with clarity as we're also referring to `char` intentionally in here. + +use crate::fmt; +use crate::mem::transmute; + +/// One of the 128 Unicode characters from U+0000 through U+007F, +/// often known as the [ASCII] subset. +/// +/// Officially, this is the first [block] in Unicode, _Basic Latin_. +/// For details, see the [*C0 Controls and Basic Latin*][chart] code chart. +/// +/// This block was based on older 7-bit character code standards such as +/// ANSI X3.4-1977, ISO 646-1973, and [NIST FIPS 1-2]. +/// +/// # When to use this +/// +/// The main advantage of this subset is that it's always valid UTF-8. As such, +/// the `&[ascii::Char]` -> `&str` conversion function (as well as other related +/// ones) are O(1): *no* runtime checks are needed. +/// +/// If you're consuming strings, you should usually handle Unicode and thus +/// accept `str`s, not limit yourself to `ascii::Char`s. +/// +/// However, certain formats are intentionally designed to produce ASCII-only +/// output in order to be 8-bit-clean. In those cases, it can be simpler and +/// faster to generate `ascii::Char`s instead of dealing with the variable width +/// properties of general UTF-8 encoded strings, while still allowing the result +/// to be used freely with other Rust things that deal in general `str`s. +/// +/// For example, a UUID library might offer a way to produce the string +/// representation of a UUID as an `[ascii::Char; 36]` to avoid memory +/// allocation yet still allow it to be used as UTF-8 via `as_str` without +/// paying for validation (or needing `unsafe` code) the way it would if it +/// were provided as a `[u8; 36]`. +/// +/// # Layout +/// +/// This type is guaranteed to have a size and alignment of 1 byte. +/// +/// # Names +/// +/// The variants on this type are [Unicode names][NamesList] of the characters +/// in upper camel case, with a few tweaks: +/// - For `` characters, the primary alias name is used. +/// - `LATIN` is dropped, as this block has no non-latin letters. +/// - `LETTER` is dropped, as `CAPITAL`/`SMALL` suffices in this block. +/// - `DIGIT`s use a single digit rather than writing out `ZERO`, `ONE`, etc. +/// +/// [ASCII]: https://www.unicode.org/glossary/index.html#ASCII +/// [block]: https://www.unicode.org/glossary/index.html#block +/// [chart]: https://www.unicode.org/charts/PDF/U0000.pdf +/// [NIST FIPS 1-2]: https://nvlpubs.nist.gov/nistpubs/Legacy/FIPS/fipspub1-2-1977.pdf +/// [NamesList]: https://www.unicode.org/Public/15.0.0/ucd/NamesList.txt +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +#[unstable(feature = "ascii_char", issue = "110998")] +#[repr(u8)] +pub enum AsciiChar { + /// U+0000 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Null = 0, + /// U+0001 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + StartOfHeading = 1, + /// U+0002 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + StartOfText = 2, + /// U+0003 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + EndOfText = 3, + /// U+0004 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + EndOfTransmission = 4, + /// U+0005 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Enquiry = 5, + /// U+0006 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Acknowledge = 6, + /// U+0007 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Bell = 7, + /// U+0008 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Backspace = 8, + /// U+0009 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CharacterTabulation = 9, + /// U+000A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + LineFeed = 10, + /// U+000B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + LineTabulation = 11, + /// U+000C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + FormFeed = 12, + /// U+000D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CarriageReturn = 13, + /// U+000E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + ShiftOut = 14, + /// U+000F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + ShiftIn = 15, + /// U+0010 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + DataLinkEscape = 16, + /// U+0011 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + DeviceControlOne = 17, + /// U+0012 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + DeviceControlTwo = 18, + /// U+0013 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + DeviceControlThree = 19, + /// U+0014 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + DeviceControlFour = 20, + /// U+0015 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + NegativeAcknowledge = 21, + /// U+0016 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SynchronousIdle = 22, + /// U+0017 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + EndOfTransmissionBlock = 23, + /// U+0018 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Cancel = 24, + /// U+0019 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + EndOfMedium = 25, + /// U+001A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Substitute = 26, + /// U+001B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Escape = 27, + /// U+001C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + InformationSeparatorFour = 28, + /// U+001D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + InformationSeparatorThree = 29, + /// U+001E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + InformationSeparatorTwo = 30, + /// U+001F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + InformationSeparatorOne = 31, + /// U+0020 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Space = 32, + /// U+0021 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + ExclamationMark = 33, + /// U+0022 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + QuotationMark = 34, + /// U+0023 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + NumberSign = 35, + /// U+0024 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + DollarSign = 36, + /// U+0025 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + PercentSign = 37, + /// U+0026 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Ampersand = 38, + /// U+0027 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Apostrophe = 39, + /// U+0028 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + LeftParenthesis = 40, + /// U+0029 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + RightParenthesis = 41, + /// U+002A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Asterisk = 42, + /// U+002B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + PlusSign = 43, + /// U+002C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Comma = 44, + /// U+002D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + HyphenMinus = 45, + /// U+002E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + FullStop = 46, + /// U+002F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Solidus = 47, + /// U+0030 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit0 = 48, + /// U+0031 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit1 = 49, + /// U+0032 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit2 = 50, + /// U+0033 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit3 = 51, + /// U+0034 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit4 = 52, + /// U+0035 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit5 = 53, + /// U+0036 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit6 = 54, + /// U+0037 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit7 = 55, + /// U+0038 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit8 = 56, + /// U+0039 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Digit9 = 57, + /// U+003A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Colon = 58, + /// U+003B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Semicolon = 59, + /// U+003C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + LessThanSign = 60, + /// U+003D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + EqualsSign = 61, + /// U+003E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + GreaterThanSign = 62, + /// U+003F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + QuestionMark = 63, + /// U+0040 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CommercialAt = 64, + /// U+0041 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalA = 65, + /// U+0042 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalB = 66, + /// U+0043 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalC = 67, + /// U+0044 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalD = 68, + /// U+0045 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalE = 69, + /// U+0046 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalF = 70, + /// U+0047 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalG = 71, + /// U+0048 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalH = 72, + /// U+0049 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalI = 73, + /// U+004A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalJ = 74, + /// U+004B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalK = 75, + /// U+004C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalL = 76, + /// U+004D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalM = 77, + /// U+004E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalN = 78, + /// U+004F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalO = 79, + /// U+0050 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalP = 80, + /// U+0051 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalQ = 81, + /// U+0052 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalR = 82, + /// U+0053 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalS = 83, + /// U+0054 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalT = 84, + /// U+0055 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalU = 85, + /// U+0056 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalV = 86, + /// U+0057 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalW = 87, + /// U+0058 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalX = 88, + /// U+0059 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalY = 89, + /// U+005A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CapitalZ = 90, + /// U+005B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + LeftSquareBracket = 91, + /// U+005C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + ReverseSolidus = 92, + /// U+005D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + RightSquareBracket = 93, + /// U+005E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + CircumflexAccent = 94, + /// U+005F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + LowLine = 95, + /// U+0060 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + GraveAccent = 96, + /// U+0061 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallA = 97, + /// U+0062 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallB = 98, + /// U+0063 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallC = 99, + /// U+0064 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallD = 100, + /// U+0065 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallE = 101, + /// U+0066 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallF = 102, + /// U+0067 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallG = 103, + /// U+0068 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallH = 104, + /// U+0069 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallI = 105, + /// U+006A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallJ = 106, + /// U+006B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallK = 107, + /// U+006C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallL = 108, + /// U+006D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallM = 109, + /// U+006E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallN = 110, + /// U+006F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallO = 111, + /// U+0070 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallP = 112, + /// U+0071 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallQ = 113, + /// U+0072 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallR = 114, + /// U+0073 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallS = 115, + /// U+0074 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallT = 116, + /// U+0075 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallU = 117, + /// U+0076 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallV = 118, + /// U+0077 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallW = 119, + /// U+0078 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallX = 120, + /// U+0079 + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallY = 121, + /// U+007A + #[unstable(feature = "ascii_char_variants", issue = "110998")] + SmallZ = 122, + /// U+007B + #[unstable(feature = "ascii_char_variants", issue = "110998")] + LeftCurlyBracket = 123, + /// U+007C + #[unstable(feature = "ascii_char_variants", issue = "110998")] + VerticalLine = 124, + /// U+007D + #[unstable(feature = "ascii_char_variants", issue = "110998")] + RightCurlyBracket = 125, + /// U+007E + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Tilde = 126, + /// U+007F + #[unstable(feature = "ascii_char_variants", issue = "110998")] + Delete = 127, +} + +impl AsciiChar { + /// Creates an ascii character from the byte `b`, + /// or returns `None` if it's too large. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn from_u8(b: u8) -> Option { + if b <= 127 { + // SAFETY: Just checked that `b` is in-range + Some(unsafe { Self::from_u8_unchecked(b) }) + } else { + None + } + } + + /// Creates an ASCII character from the byte `b`, + /// without checking whether it's valid. + /// + /// # Safety + /// + /// `b` must be in `0..=127`, or else this is UB. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const unsafe fn from_u8_unchecked(b: u8) -> Self { + // SAFETY: Our safety precondition is that `b` is in-range. + unsafe { transmute(b) } + } + + /// When passed the *number* `0`, `1`, …, `9`, returns the *character* + /// `'0'`, `'1'`, …, `'9'` respectively. + /// + /// If `d >= 10`, returns `None`. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn digit(d: u8) -> Option { + if d < 10 { + // SAFETY: Just checked it's in-range. + Some(unsafe { Self::digit_unchecked(d) }) + } else { + None + } + } + + /// When passed the *number* `0`, `1`, …, `9`, returns the *character* + /// `'0'`, `'1'`, …, `'9'` respectively, without checking that it's in-range. + /// + /// # Safety + /// + /// This is immediate UB if called with `d > 64`. + /// + /// If `d >= 10` and `d <= 64`, this is allowed to return any value or panic. + /// Notably, it should not be expected to return hex digits, or any other + /// reasonable extension of the decimal digits. + /// + /// (This lose safety condition is intended to simplify soundness proofs + /// when writing code using this method, since the implementation doesn't + /// need something really specific, not to make those other arguments do + /// something useful. It might be tightened before stabilization.) + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const unsafe fn digit_unchecked(d: u8) -> Self { + debug_assert!(d < 10); + + // SAFETY: `'0'` through `'9'` are U+00030 through U+0039, + // so because `d` must be 64 or less the addition can return at most + // 112 (0x70), which doesn't overflow and is within the ASCII range. + unsafe { + let byte = b'0'.unchecked_add(d); + Self::from_u8_unchecked(byte) + } + } + + /// Gets this ASCII character as a byte. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn as_u8(self) -> u8 { + self as u8 + } + + /// Gets this ASCII character as a `char` Unicode Scalar Value. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn as_char(self) -> char { + self as u8 as char + } + + /// Views this ASCII character as a one-code-unit UTF-8 `str`. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn as_str(&self) -> &str { + crate::slice::from_ref(self).as_str() + } +} + +impl [AsciiChar] { + /// Views this slice of ASCII characters as a UTF-8 `str`. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn as_str(&self) -> &str { + let ascii_ptr: *const Self = self; + let str_ptr = ascii_ptr as *const str; + // SAFETY: Each ASCII codepoint in UTF-8 is encoded as one single-byte + // code unit having the same value as the ASCII byte. + unsafe { &*str_ptr } + } + + /// Views this slice of ASCII characters as a slice of `u8` bytes. + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn as_bytes(&self) -> &[u8] { + self.as_str().as_bytes() + } +} + +#[unstable(feature = "ascii_char", issue = "110998")] +impl fmt::Display for AsciiChar { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ::fmt(self.as_str(), f) + } +} diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 9bc97ea0bff18..27e51054203a4 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -1,5 +1,6 @@ //! impl char {} +use crate::ascii; use crate::slice; use crate::str::from_utf8_unchecked_mut; use crate::unicode::printable::is_printable; @@ -1116,6 +1117,24 @@ impl char { *self as u32 <= 0x7F } + /// Returns `Some` if the value is within the ASCII range, + /// or `None` if it's not. + /// + /// This is preferred to [`Self::is_ascii`] when you're passing the value + /// along to something else that can take [`ascii::Char`] rather than + /// needing to check again for itself whether the value is in ASCII. + #[must_use] + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn as_ascii(&self) -> Option { + if self.is_ascii() { + // SAFETY: Just checked that this is ASCII. + Some(unsafe { ascii::Char::from_u8_unchecked(*self as u8) }) + } else { + None + } + } + /// Makes a copy of the value in its ASCII upper case equivalent. /// /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs index fdd7be625ed93..08444421dca08 100644 --- a/library/core/src/num/mod.rs +++ b/library/core/src/num/mod.rs @@ -472,7 +472,16 @@ impl u8 { #[rustc_const_stable(feature = "const_u8_is_ascii", since = "1.43.0")] #[inline] pub const fn is_ascii(&self) -> bool { - *self & 128 == 0 + *self <= 127 + } + + /// If the value of this byte is within the ASCII range, returns it as an + /// [ASCII character](ascii::Char). Otherwise, returns `None`. + #[must_use] + #[unstable(feature = "ascii_char", issue = "110998")] + #[inline] + pub const fn as_ascii(&self) -> Option { + ascii::Char::from_u8(*self) } /// Makes a copy of the value in its ASCII upper case equivalent. diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 5e5399acc1b0f..7bae6692ad4ed 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -16,6 +16,36 @@ impl [u8] { is_ascii(self) } + /// If this slice [`is_ascii`](Self::is_ascii), returns it as a slice of + /// [ASCII characters](`ascii::Char`), otherwise returns `None`. + #[unstable(feature = "ascii_char", issue = "110998")] + #[must_use] + #[inline] + pub fn as_ascii(&self) -> Option<&[ascii::Char]> { + if self.is_ascii() { + // SAFETY: Just checked that it's ASCII + Some(unsafe { self.as_ascii_unchecked() }) + } else { + None + } + } + + /// Converts this slice of bytes into a slice of ASCII characters, + /// without checking whether they're valid. + /// + /// # Safety + /// + /// Every byte in the slice must be in `0..=127`, or else this is UB. + #[unstable(feature = "ascii_char", issue = "110998")] + #[must_use] + #[inline] + pub const unsafe fn as_ascii_unchecked(&self) -> &[ascii::Char] { + let byte_ptr: *const [u8] = self; + let ascii_ptr = byte_ptr as *const [ascii::Char]; + // SAFETY: The caller promised all the bytes are ASCII + unsafe { &*ascii_ptr } + } + /// Checks that two slices are an ASCII case-insensitive match. /// /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`, diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index a13107fd0de09..66fa9cf6f64c0 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -16,6 +16,7 @@ mod validations; use self::pattern::Pattern; use self::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; +use crate::ascii; use crate::char::{self, EscapeDebugExtArgs}; use crate::mem; use crate::slice::{self, SliceIndex}; @@ -2366,6 +2367,16 @@ impl str { self.as_bytes().is_ascii() } + /// If this string slice [`is_ascii`](Self::is_ascii), returns it as a slice + /// of [ASCII characters](`ascii::Char`), otherwise returns `None`. + #[unstable(feature = "ascii_char", issue = "110998")] + #[must_use] + #[inline] + pub fn as_ascii(&self) -> Option<&[ascii::Char]> { + // Like in `is_ascii`, we can work on the bytes directly. + self.as_bytes().as_ascii() + } + /// Checks that two strings are an ASCII case-insensitive match. /// /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`, diff --git a/library/std/src/ascii.rs b/library/std/src/ascii.rs index c29f015777f1a..b18ab50de123e 100644 --- a/library/std/src/ascii.rs +++ b/library/std/src/ascii.rs @@ -16,6 +16,9 @@ #[stable(feature = "rust1", since = "1.0.0")] pub use core::ascii::{escape_default, EscapeDefault}; +#[unstable(feature = "ascii_char", issue = "110998")] +pub use core::ascii::Char; + /// Extension methods for ASCII-subset only operations. /// /// Be aware that operations on seemingly non-ASCII characters can sometimes diff --git a/tests/codegen/ascii-char.rs b/tests/codegen/ascii-char.rs new file mode 100644 index 0000000000000..4167becf5e9a9 --- /dev/null +++ b/tests/codegen/ascii-char.rs @@ -0,0 +1,37 @@ +// compile-flags: -C opt-level=1 +// ignore-debug (the extra assertions get in the way) + +#![crate_type = "lib"] +#![feature(ascii_char)] + +use std::ascii::Char as AsciiChar; + +// CHECK-LABEL: i8 @unwrap_digit_from_remainder(i32 +#[no_mangle] +pub fn unwrap_digit_from_remainder(v: u32) -> AsciiChar { + // CHECK-NOT: icmp + // CHECK-NOT: panic + + // CHECK: %[[R:.+]] = urem i32 %v, 10 + // CHECK-NEXT: %[[T:.+]] = trunc i32 %[[R]] to i8 + // CHECK-NEXT: %[[D:.+]] = or i8 %[[T]], 48 + // CHECK-NEXT: ret i8 %[[D]] + + // CHECK-NOT: icmp + // CHECK-NOT: panic + AsciiChar::digit((v % 10) as u8).unwrap() +} + +// CHECK-LABEL: i8 @unwrap_from_masked(i8 +#[no_mangle] +pub fn unwrap_from_masked(b: u8) -> AsciiChar { + // CHECK-NOT: icmp + // CHECK-NOT: panic + + // CHECK: %[[M:.+]] = and i8 %b, 127 + // CHECK-NEXT: ret i8 %[[M]] + + // CHECK-NOT: icmp + // CHECK-NOT: panic + AsciiChar::from_u8(b & 0x7f).unwrap() +}