diff --git a/src/Cargo.lock b/src/Cargo.lock index a60679e417ada..6e7c4b67acf24 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -13,7 +13,6 @@ dependencies = [ "compiler_builtins 0.0.0", "core 0.0.0", "rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", - "std_unicode 0.0.0", ] [[package]] diff --git a/src/ci/docker/wasm32-unknown/Dockerfile b/src/ci/docker/wasm32-unknown/Dockerfile index 6c0ec1ad9d4e1..853923ad947cd 100644 --- a/src/ci/docker/wasm32-unknown/Dockerfile +++ b/src/ci/docker/wasm32-unknown/Dockerfile @@ -34,4 +34,3 @@ ENV SCRIPT python2.7 /checkout/x.py test --target $TARGETS \ src/test/mir-opt \ src/test/codegen-units \ src/libcore \ - src/libstd_unicode/ \ diff --git a/src/doc/unstable-book/src/language-features/lang-items.md b/src/doc/unstable-book/src/language-features/lang-items.md index c51674186146b..6a7aea7f1c27e 100644 --- a/src/doc/unstable-book/src/language-features/lang-items.md +++ b/src/doc/unstable-book/src/language-features/lang-items.md @@ -243,7 +243,7 @@ the source code. - `usize`: `libcore/num/mod.rs` - `f32`: `libstd/f32.rs` - `f64`: `libstd/f64.rs` - - `char`: `libstd_unicode/char.rs` + - `char`: `libcore/char.rs` - `slice`: `liballoc/slice.rs` - `str`: `liballoc/str.rs` - `const_ptr`: `libcore/ptr.rs` diff --git a/src/liballoc/Cargo.toml b/src/liballoc/Cargo.toml index 2eb8ea1260446..6383bd1e941ed 100644 --- a/src/liballoc/Cargo.toml +++ b/src/liballoc/Cargo.toml @@ -9,7 +9,6 @@ path = "lib.rs" [dependencies] core = { path = "../libcore" } -std_unicode = { path = "../libstd_unicode" } compiler_builtins = { path = "../rustc/compiler_builtins_shim" } [dev-dependencies] diff --git a/src/liballoc/lib.rs b/src/liballoc/lib.rs index b08bd66b47c59..69fc007ab7c16 100644 --- a/src/liballoc/lib.rs +++ b/src/liballoc/lib.rs @@ -113,7 +113,7 @@ #![feature(trusted_len)] #![feature(try_reserve)] #![feature(unboxed_closures)] -#![feature(unicode)] +#![feature(unicode_internals)] #![feature(unsize)] #![feature(allocator_internals)] #![feature(on_unimplemented)] @@ -135,8 +135,6 @@ extern crate test; #[cfg(test)] extern crate rand; -extern crate std_unicode; - // Module with internal macros used by other modules (needs to be included before other modules). #[macro_use] mod macros; diff --git a/src/liballoc/str.rs b/src/liballoc/str.rs index d5ef41df0d850..65df93bd3bb54 100644 --- a/src/liballoc/str.rs +++ b/src/liballoc/str.rs @@ -45,12 +45,10 @@ use core::str::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher}; use core::mem; use core::ptr; use core::iter::FusedIterator; -use std_unicode::str::{UnicodeStr, Utf16Encoder}; use vec_deque::VecDeque; use borrow::{Borrow, ToOwned}; use string::String; -use std_unicode; use vec::Vec; use slice::{SliceConcatExt, SliceIndex}; use boxed::Box; @@ -75,7 +73,7 @@ pub use core::str::{from_utf8, from_utf8_mut, Chars, CharIndices, Bytes}; #[stable(feature = "rust1", since = "1.0.0")] pub use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut, ParseBoolError}; #[stable(feature = "rust1", since = "1.0.0")] -pub use std_unicode::str::SplitWhitespace; +pub use core::str::SplitWhitespace; #[stable(feature = "rust1", since = "1.0.0")] pub use core::str::pattern; @@ -147,7 +145,8 @@ impl> SliceConcatExt for [S] { #[derive(Clone)] #[stable(feature = "encode_utf16", since = "1.8.0")] pub struct EncodeUtf16<'a> { - encoder: Utf16Encoder>, + chars: Chars<'a>, + extra: u16, } #[stable(feature = "collection_debug", since = "1.17.0")] @@ -163,12 +162,29 @@ impl<'a> Iterator for EncodeUtf16<'a> { #[inline] fn next(&mut self) -> Option { - self.encoder.next() + if self.extra != 0 { + let tmp = self.extra; + self.extra = 0; + return Some(tmp); + } + + let mut buf = [0; 2]; + self.chars.next().map(|ch| { + let n = ch.encode_utf16(&mut buf).len(); + if n == 2 { + self.extra = buf[1]; + } + buf[0] + }) } #[inline] fn size_hint(&self) -> (usize, Option) { - self.encoder.size_hint() + let (low, high) = self.chars.size_hint(); + // every char gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. + (low, high.and_then(|n| n.checked_mul(2))) } } @@ -801,7 +817,7 @@ impl str { #[stable(feature = "split_whitespace", since = "1.1.0")] #[inline] pub fn split_whitespace(&self) -> SplitWhitespace { - UnicodeStr::split_whitespace(self) + StrExt::split_whitespace(self) } /// An iterator over the lines of a string, as string slices. @@ -871,7 +887,7 @@ impl str { /// ``` #[stable(feature = "encode_utf16", since = "1.8.0")] pub fn encode_utf16(&self) -> EncodeUtf16 { - EncodeUtf16 { encoder: Utf16Encoder::new(self[..].chars()) } + EncodeUtf16 { chars: self[..].chars(), extra: 0 } } /// Returns `true` if the given pattern matches a sub-slice of @@ -1571,7 +1587,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn trim(&self) -> &str { - UnicodeStr::trim(self) + StrExt::trim(self) } /// Returns a string slice with leading whitespace removed. @@ -1607,7 +1623,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn trim_left(&self) -> &str { - UnicodeStr::trim_left(self) + StrExt::trim_left(self) } /// Returns a string slice with trailing whitespace removed. @@ -1643,7 +1659,7 @@ impl str { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn trim_right(&self) -> &str { - UnicodeStr::trim_right(self) + StrExt::trim_right(self) } /// Returns a string slice with all prefixes and suffixes that match a @@ -1960,7 +1976,7 @@ impl str { } fn case_ignoreable_then_cased>(iter: I) -> bool { - use std_unicode::derived_property::{Cased, Case_Ignorable}; + use core::unicode::derived_property::{Cased, Case_Ignorable}; match iter.skip_while(|&c| Case_Ignorable(c)).next() { Some(c) => Cased(c), None => false, @@ -2142,7 +2158,7 @@ impl str { #[stable(feature = "unicode_methods_on_intrinsics", since = "1.27.0")] #[inline] pub fn is_whitespace(&self) -> bool { - UnicodeStr::is_whitespace(self) + StrExt::is_whitespace(self) } /// Returns true if this `str` is entirely alphanumeric, and false otherwise. @@ -2161,7 +2177,7 @@ impl str { #[stable(feature = "unicode_methods_on_intrinsics", since = "1.27.0")] #[inline] pub fn is_alphanumeric(&self) -> bool { - UnicodeStr::is_alphanumeric(self) + StrExt::is_alphanumeric(self) } /// Checks if all characters in this string are within the ASCII range. diff --git a/src/liballoc/string.rs b/src/liballoc/string.rs index b95aae02894ed..29d759b1f0007 100644 --- a/src/liballoc/string.rs +++ b/src/liballoc/string.rs @@ -56,6 +56,7 @@ #![stable(feature = "rust1", since = "1.0.0")] +use core::char::{decode_utf16, REPLACEMENT_CHARACTER}; use core::fmt; use core::hash; use core::iter::{FromIterator, FusedIterator}; @@ -63,8 +64,7 @@ use core::ops::Bound::{Excluded, Included, Unbounded}; use core::ops::{self, Add, AddAssign, Index, IndexMut, RangeBounds}; use core::ptr; use core::str::pattern::Pattern; -use std_unicode::lossy; -use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER}; +use core::str::lossy; use borrow::{Cow, ToOwned}; use str::{self, from_boxed_utf8_unchecked, FromStr, Utf8Error, Chars}; diff --git a/src/liballoc/tests/lib.rs b/src/liballoc/tests/lib.rs index 17f1d0464a5c2..3227216930700 100644 --- a/src/liballoc/tests/lib.rs +++ b/src/liballoc/tests/lib.rs @@ -24,12 +24,11 @@ #![feature(string_retain)] #![feature(try_reserve)] #![feature(unboxed_closures)] -#![feature(unicode)] #![feature(exact_chunks)] #![feature(inclusive_range_fields)] extern crate alloc_system; -extern crate std_unicode; +extern crate core; extern crate rand; use std::hash::{Hash, Hasher}; diff --git a/src/liballoc/tests/str.rs b/src/liballoc/tests/str.rs index a14a5d32738b3..a3f4c385fe23b 100644 --- a/src/liballoc/tests/str.rs +++ b/src/liballoc/tests/str.rs @@ -1204,8 +1204,7 @@ fn test_rev_split_char_iterator_no_trailing() { #[test] fn test_utf16_code_units() { - use std_unicode::str::Utf16Encoder; - assert_eq!(Utf16Encoder::new(vec!['é', '\u{1F4A9}'].into_iter()).collect::>(), + assert_eq!("é\u{1F4A9}".encode_utf16().collect::>(), [0xE9, 0xD83D, 0xDCA9]) } diff --git a/src/liballoc/tests/string.rs b/src/liballoc/tests/string.rs index cb4a17a22d8a4..17d53e4cf3e09 100644 --- a/src/liballoc/tests/string.rs +++ b/src/liballoc/tests/string.rs @@ -132,7 +132,7 @@ fn test_from_utf16() { let s_as_utf16 = s.encode_utf16().collect::>(); let u_as_string = String::from_utf16(&u).unwrap(); - assert!(::std_unicode::char::decode_utf16(u.iter().cloned()).all(|r| r.is_ok())); + assert!(::core::char::decode_utf16(u.iter().cloned()).all(|r| r.is_ok())); assert_eq!(s_as_utf16, u); assert_eq!(u_as_string, s); diff --git a/src/libcore/char.rs b/src/libcore/char.rs deleted file mode 100644 index 718c6b893edf2..0000000000000 --- a/src/libcore/char.rs +++ /dev/null @@ -1,918 +0,0 @@ -// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! Character manipulation. -//! -//! For more details, see ::std_unicode::char (a.k.a. std::char) - -#![allow(non_snake_case)] -#![stable(feature = "core_char", since = "1.2.0")] - -use char_private::is_printable; -use convert::TryFrom; -use fmt::{self, Write}; -use slice; -use str::{from_utf8_unchecked_mut, FromStr}; -use iter::FusedIterator; -use mem::transmute; - -// UTF-8 ranges and tags for encoding characters -const TAG_CONT: u8 = 0b1000_0000; -const TAG_TWO_B: u8 = 0b1100_0000; -const TAG_THREE_B: u8 = 0b1110_0000; -const TAG_FOUR_B: u8 = 0b1111_0000; -const MAX_ONE_B: u32 = 0x80; -const MAX_TWO_B: u32 = 0x800; -const MAX_THREE_B: u32 = 0x10000; - -/* - Lu Uppercase_Letter an uppercase letter - Ll Lowercase_Letter a lowercase letter - Lt Titlecase_Letter a digraphic character, with first part uppercase - Lm Modifier_Letter a modifier letter - Lo Other_Letter other letters, including syllables and ideographs - Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) - Mc Spacing_Mark a spacing combining mark (positive advance width) - Me Enclosing_Mark an enclosing combining mark - Nd Decimal_Number a decimal digit - Nl Letter_Number a letterlike numeric character - No Other_Number a numeric character of other type - Pc Connector_Punctuation a connecting punctuation mark, like a tie - Pd Dash_Punctuation a dash or hyphen punctuation mark - Ps Open_Punctuation an opening punctuation mark (of a pair) - Pe Close_Punctuation a closing punctuation mark (of a pair) - Pi Initial_Punctuation an initial quotation mark - Pf Final_Punctuation a final quotation mark - Po Other_Punctuation a punctuation mark of other type - Sm Math_Symbol a symbol of primarily mathematical use - Sc Currency_Symbol a currency sign - Sk Modifier_Symbol a non-letterlike modifier symbol - So Other_Symbol a symbol of other type - Zs Space_Separator a space character (of various non-zero widths) - Zl Line_Separator U+2028 LINE SEPARATOR only - Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only - Cc Control a C0 or C1 control code - Cf Format a format control character - Cs Surrogate a surrogate code point - Co Private_Use a private-use character - Cn Unassigned a reserved unassigned code point or a noncharacter -*/ - -/// The highest valid code point a `char` can have. -/// -/// A [`char`] is a [Unicode Scalar Value], which means that it is a [Code -/// Point], but only ones within a certain range. `MAX` is the highest valid -/// code point that's a valid [Unicode Scalar Value]. -/// -/// [`char`]: ../../std/primitive.char.html -/// [Unicode Scalar Value]: http://www.unicode.org/glossary/#unicode_scalar_value -/// [Code Point]: http://www.unicode.org/glossary/#code_point -#[stable(feature = "rust1", since = "1.0.0")] -pub const MAX: char = '\u{10ffff}'; - -/// Converts a `u32` to a `char`. -/// -/// Note that all [`char`]s are valid [`u32`]s, and can be cast to one with -/// [`as`]: -/// -/// ``` -/// let c = '💯'; -/// let i = c as u32; -/// -/// assert_eq!(128175, i); -/// ``` -/// -/// However, the reverse is not true: not all valid [`u32`]s are valid -/// [`char`]s. `from_u32()` will return `None` if the input is not a valid value -/// for a [`char`]. -/// -/// [`char`]: ../../std/primitive.char.html -/// [`u32`]: ../../std/primitive.u32.html -/// [`as`]: ../../book/first-edition/casting-between-types.html#as -/// -/// For an unsafe version of this function which ignores these checks, see -/// [`from_u32_unchecked`]. -/// -/// [`from_u32_unchecked`]: fn.from_u32_unchecked.html -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::char; -/// -/// let c = char::from_u32(0x2764); -/// -/// assert_eq!(Some('❤'), c); -/// ``` -/// -/// Returning `None` when the input is not a valid [`char`]: -/// -/// ``` -/// use std::char; -/// -/// let c = char::from_u32(0x110000); -/// -/// assert_eq!(None, c); -/// ``` -#[inline] -#[stable(feature = "rust1", since = "1.0.0")] -pub fn from_u32(i: u32) -> Option { - char::try_from(i).ok() -} - -/// Converts a `u32` to a `char`, ignoring validity. -/// -/// Note that all [`char`]s are valid [`u32`]s, and can be cast to one with -/// [`as`]: -/// -/// ``` -/// let c = '💯'; -/// let i = c as u32; -/// -/// assert_eq!(128175, i); -/// ``` -/// -/// However, the reverse is not true: not all valid [`u32`]s are valid -/// [`char`]s. `from_u32_unchecked()` will ignore this, and blindly cast to -/// [`char`], possibly creating an invalid one. -/// -/// [`char`]: ../../std/primitive.char.html -/// [`u32`]: ../../std/primitive.u32.html -/// [`as`]: ../../book/first-edition/casting-between-types.html#as -/// -/// # Safety -/// -/// This function is unsafe, as it may construct invalid `char` values. -/// -/// For a safe version of this function, see the [`from_u32`] function. -/// -/// [`from_u32`]: fn.from_u32.html -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::char; -/// -/// let c = unsafe { char::from_u32_unchecked(0x2764) }; -/// -/// assert_eq!('❤', c); -/// ``` -#[inline] -#[stable(feature = "char_from_unchecked", since = "1.5.0")] -pub unsafe fn from_u32_unchecked(i: u32) -> char { - transmute(i) -} - -#[stable(feature = "char_convert", since = "1.13.0")] -impl From for u32 { - #[inline] - fn from(c: char) -> Self { - c as u32 - } -} - -/// Maps a byte in 0x00...0xFF to a `char` whose code point has the same value, in U+0000 to U+00FF. -/// -/// Unicode is designed such that this effectively decodes bytes -/// with the character encoding that IANA calls ISO-8859-1. -/// This encoding is compatible with ASCII. -/// -/// Note that this is different from ISO/IEC 8859-1 a.k.a. ISO 8859-1 (with one less hyphen), -/// which leaves some "blanks", byte values that are not assigned to any character. -/// ISO-8859-1 (the IANA one) assigns them to the C0 and C1 control codes. -/// -/// Note that this is *also* different from Windows-1252 a.k.a. code page 1252, -/// which is a superset ISO/IEC 8859-1 that assigns some (not all!) blanks -/// to punctuation and various Latin characters. -/// -/// To confuse things further, [on the Web](https://encoding.spec.whatwg.org/) -/// `ascii`, `iso-8859-1`, and `windows-1252` are all aliases -/// for a superset of Windows-1252 that fills the remaining blanks with corresponding -/// C0 and C1 control codes. -#[stable(feature = "char_convert", since = "1.13.0")] -impl From for char { - #[inline] - fn from(i: u8) -> Self { - i as char - } -} - - -/// An error which can be returned when parsing a char. -#[stable(feature = "char_from_str", since = "1.20.0")] -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct ParseCharError { - kind: CharErrorKind, -} - -impl ParseCharError { - #[unstable(feature = "char_error_internals", - reason = "this method should not be available publicly", - issue = "0")] - #[doc(hidden)] - pub fn __description(&self) -> &str { - match self.kind { - CharErrorKind::EmptyString => { - "cannot parse char from empty string" - }, - CharErrorKind::TooManyChars => "too many characters in string" - } - } -} - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -enum CharErrorKind { - EmptyString, - TooManyChars, -} - -#[stable(feature = "char_from_str", since = "1.20.0")] -impl fmt::Display for ParseCharError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.__description().fmt(f) - } -} - - -#[stable(feature = "char_from_str", since = "1.20.0")] -impl FromStr for char { - type Err = ParseCharError; - - #[inline] - fn from_str(s: &str) -> Result { - let mut chars = s.chars(); - match (chars.next(), chars.next()) { - (None, _) => { - Err(ParseCharError { kind: CharErrorKind::EmptyString }) - }, - (Some(c), None) => Ok(c), - _ => { - Err(ParseCharError { kind: CharErrorKind::TooManyChars }) - } - } - } -} - - -#[stable(feature = "try_from", since = "1.26.0")] -impl TryFrom for char { - type Error = CharTryFromError; - - #[inline] - fn try_from(i: u32) -> Result { - if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) { - Err(CharTryFromError(())) - } else { - Ok(unsafe { from_u32_unchecked(i) }) - } - } -} - -/// The error type returned when a conversion from u32 to char fails. -#[stable(feature = "try_from", since = "1.26.0")] -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub struct CharTryFromError(()); - -#[stable(feature = "try_from", since = "1.26.0")] -impl fmt::Display for CharTryFromError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - "converted integer out of range for `char`".fmt(f) - } -} - -/// Converts a digit in the given radix to a `char`. -/// -/// A 'radix' here is sometimes also called a 'base'. A radix of two -/// indicates a binary number, a radix of ten, decimal, and a radix of -/// sixteen, hexadecimal, to give some common values. Arbitrary -/// radices are supported. -/// -/// `from_digit()` will return `None` if the input is not a digit in -/// the given radix. -/// -/// # Panics -/// -/// Panics if given a radix larger than 36. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::char; -/// -/// let c = char::from_digit(4, 10); -/// -/// assert_eq!(Some('4'), c); -/// -/// // Decimal 11 is a single digit in base 16 -/// let c = char::from_digit(11, 16); -/// -/// assert_eq!(Some('b'), c); -/// ``` -/// -/// Returning `None` when the input is not a digit: -/// -/// ``` -/// use std::char; -/// -/// let c = char::from_digit(20, 10); -/// -/// assert_eq!(None, c); -/// ``` -/// -/// Passing a large radix, causing a panic: -/// -/// ``` -/// use std::thread; -/// use std::char; -/// -/// let result = thread::spawn(|| { -/// // this panics -/// let c = char::from_digit(1, 37); -/// }).join(); -/// -/// assert!(result.is_err()); -/// ``` -#[inline] -#[stable(feature = "rust1", since = "1.0.0")] -pub fn from_digit(num: u32, radix: u32) -> Option { - if radix > 36 { - panic!("from_digit: radix is too high (maximum 36)"); - } - if num < radix { - let num = num as u8; - if num < 10 { - Some((b'0' + num) as char) - } else { - Some((b'a' + num - 10) as char) - } - } else { - None - } -} - -// NB: the stabilization and documentation for this trait is in -// unicode/char.rs, not here -#[allow(missing_docs)] // docs in libunicode/u_char.rs -#[doc(hidden)] -#[unstable(feature = "core_char_ext", - reason = "the stable interface is `impl char` in later crate", - issue = "32110")] -pub trait CharExt { - #[stable(feature = "core", since = "1.6.0")] - fn is_digit(self, radix: u32) -> bool; - #[stable(feature = "core", since = "1.6.0")] - fn to_digit(self, radix: u32) -> Option; - #[stable(feature = "core", since = "1.6.0")] - fn escape_unicode(self) -> EscapeUnicode; - #[stable(feature = "core", since = "1.6.0")] - fn escape_default(self) -> EscapeDefault; - #[stable(feature = "char_escape_debug", since = "1.20.0")] - fn escape_debug(self) -> EscapeDebug; - #[stable(feature = "core", since = "1.6.0")] - fn len_utf8(self) -> usize; - #[stable(feature = "core", since = "1.6.0")] - fn len_utf16(self) -> usize; - #[stable(feature = "unicode_encode_char", since = "1.15.0")] - fn encode_utf8(self, dst: &mut [u8]) -> &mut str; - #[stable(feature = "unicode_encode_char", since = "1.15.0")] - fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16]; -} - -#[stable(feature = "core", since = "1.6.0")] -impl CharExt for char { - #[inline] - fn is_digit(self, radix: u32) -> bool { - self.to_digit(radix).is_some() - } - - #[inline] - fn to_digit(self, radix: u32) -> Option { - if radix > 36 { - panic!("to_digit: radix is too high (maximum 36)"); - } - let val = match self { - '0' ... '9' => self as u32 - '0' as u32, - 'a' ... 'z' => self as u32 - 'a' as u32 + 10, - 'A' ... 'Z' => self as u32 - 'A' as u32 + 10, - _ => return None, - }; - if val < radix { Some(val) } - else { None } - } - - #[inline] - fn escape_unicode(self) -> EscapeUnicode { - let c = self as u32; - - // or-ing 1 ensures that for c==0 the code computes that one - // digit should be printed and (which is the same) avoids the - // (31 - 32) underflow - let msb = 31 - (c | 1).leading_zeros(); - - // the index of the most significant hex digit - let ms_hex_digit = msb / 4; - EscapeUnicode { - c: self, - state: EscapeUnicodeState::Backslash, - hex_digit_idx: ms_hex_digit as usize, - } - } - - #[inline] - fn escape_default(self) -> EscapeDefault { - let init_state = match self { - '\t' => EscapeDefaultState::Backslash('t'), - '\r' => EscapeDefaultState::Backslash('r'), - '\n' => EscapeDefaultState::Backslash('n'), - '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), - '\x20' ... '\x7e' => EscapeDefaultState::Char(self), - _ => EscapeDefaultState::Unicode(self.escape_unicode()) - }; - EscapeDefault { state: init_state } - } - - #[inline] - fn escape_debug(self) -> EscapeDebug { - let init_state = match self { - '\t' => EscapeDefaultState::Backslash('t'), - '\r' => EscapeDefaultState::Backslash('r'), - '\n' => EscapeDefaultState::Backslash('n'), - '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), - c if is_printable(c) => EscapeDefaultState::Char(c), - c => EscapeDefaultState::Unicode(c.escape_unicode()), - }; - EscapeDebug(EscapeDefault { state: init_state }) - } - - #[inline] - fn len_utf8(self) -> usize { - let code = self as u32; - if code < MAX_ONE_B { - 1 - } else if code < MAX_TWO_B { - 2 - } else if code < MAX_THREE_B { - 3 - } else { - 4 - } - } - - #[inline] - fn len_utf16(self) -> usize { - let ch = self as u32; - if (ch & 0xFFFF) == ch { 1 } else { 2 } - } - - #[inline] - fn encode_utf8(self, dst: &mut [u8]) -> &mut str { - let code = self as u32; - unsafe { - let len = - if code < MAX_ONE_B && !dst.is_empty() { - *dst.get_unchecked_mut(0) = code as u8; - 1 - } else if code < MAX_TWO_B && dst.len() >= 2 { - *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; - *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT; - 2 - } else if code < MAX_THREE_B && dst.len() >= 3 { - *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; - *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT; - 3 - } else if dst.len() >= 4 { - *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; - *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; - *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; - *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT; - 4 - } else { - panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", - from_u32_unchecked(code).len_utf8(), - code, - dst.len()) - }; - from_utf8_unchecked_mut(dst.get_unchecked_mut(..len)) - } - } - - #[inline] - fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { - let mut code = self as u32; - unsafe { - if (code & 0xFFFF) == code && !dst.is_empty() { - // The BMP falls through (assuming non-surrogate, as it should) - *dst.get_unchecked_mut(0) = code as u16; - slice::from_raw_parts_mut(dst.as_mut_ptr(), 1) - } else if dst.len() >= 2 { - // Supplementary planes break into surrogates. - code -= 0x1_0000; - *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16); - *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF); - slice::from_raw_parts_mut(dst.as_mut_ptr(), 2) - } else { - panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}", - from_u32_unchecked(code).len_utf16(), - code, - dst.len()) - } - } - } -} - -/// Returns an iterator that yields the hexadecimal Unicode escape of a -/// character, as `char`s. -/// -/// This `struct` is created by the [`escape_unicode`] method on [`char`]. See -/// its documentation for more. -/// -/// [`escape_unicode`]: ../../std/primitive.char.html#method.escape_unicode -/// [`char`]: ../../std/primitive.char.html -#[derive(Clone, Debug)] -#[stable(feature = "rust1", since = "1.0.0")] -pub struct EscapeUnicode { - c: char, - state: EscapeUnicodeState, - - // The index of the next hex digit to be printed (0 if none), - // i.e. the number of remaining hex digits to be printed; - // increasing from the least significant digit: 0x543210 - hex_digit_idx: usize, -} - -// The enum values are ordered so that their representation is the -// same as the remaining length (besides the hexadecimal digits). This -// likely makes `len()` a single load from memory) and inline-worth. -#[derive(Clone, Debug)] -enum EscapeUnicodeState { - Done, - RightBrace, - Value, - LeftBrace, - Type, - Backslash, -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Iterator for EscapeUnicode { - type Item = char; - - fn next(&mut self) -> Option { - match self.state { - EscapeUnicodeState::Backslash => { - self.state = EscapeUnicodeState::Type; - Some('\\') - } - EscapeUnicodeState::Type => { - self.state = EscapeUnicodeState::LeftBrace; - Some('u') - } - EscapeUnicodeState::LeftBrace => { - self.state = EscapeUnicodeState::Value; - Some('{') - } - EscapeUnicodeState::Value => { - let hex_digit = ((self.c as u32) >> (self.hex_digit_idx * 4)) & 0xf; - let c = from_digit(hex_digit, 16).unwrap(); - if self.hex_digit_idx == 0 { - self.state = EscapeUnicodeState::RightBrace; - } else { - self.hex_digit_idx -= 1; - } - Some(c) - } - EscapeUnicodeState::RightBrace => { - self.state = EscapeUnicodeState::Done; - Some('}') - } - EscapeUnicodeState::Done => None, - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let n = self.len(); - (n, Some(n)) - } - - #[inline] - fn count(self) -> usize { - self.len() - } - - fn last(self) -> Option { - match self.state { - EscapeUnicodeState::Done => None, - - EscapeUnicodeState::RightBrace | - EscapeUnicodeState::Value | - EscapeUnicodeState::LeftBrace | - EscapeUnicodeState::Type | - EscapeUnicodeState::Backslash => Some('}'), - } - } -} - -#[stable(feature = "exact_size_escape", since = "1.11.0")] -impl ExactSizeIterator for EscapeUnicode { - #[inline] - fn len(&self) -> usize { - // The match is a single memory access with no branching - self.hex_digit_idx + match self.state { - EscapeUnicodeState::Done => 0, - EscapeUnicodeState::RightBrace => 1, - EscapeUnicodeState::Value => 2, - EscapeUnicodeState::LeftBrace => 3, - EscapeUnicodeState::Type => 4, - EscapeUnicodeState::Backslash => 5, - } - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for EscapeUnicode {} - -#[stable(feature = "char_struct_display", since = "1.16.0")] -impl fmt::Display for EscapeUnicode { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - for c in self.clone() { - f.write_char(c)?; - } - Ok(()) - } -} - -/// An iterator that yields the literal escape code of a `char`. -/// -/// This `struct` is created by the [`escape_default`] method on [`char`]. See -/// its documentation for more. -/// -/// [`escape_default`]: ../../std/primitive.char.html#method.escape_default -/// [`char`]: ../../std/primitive.char.html -#[derive(Clone, Debug)] -#[stable(feature = "rust1", since = "1.0.0")] -pub struct EscapeDefault { - state: EscapeDefaultState -} - -#[derive(Clone, Debug)] -enum EscapeDefaultState { - Done, - Char(char), - Backslash(char), - Unicode(EscapeUnicode), -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Iterator for EscapeDefault { - type Item = char; - - fn next(&mut self) -> Option { - match self.state { - EscapeDefaultState::Backslash(c) => { - self.state = EscapeDefaultState::Char(c); - Some('\\') - } - EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done; - Some(c) - } - EscapeDefaultState::Done => None, - EscapeDefaultState::Unicode(ref mut iter) => iter.next(), - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let n = self.len(); - (n, Some(n)) - } - - #[inline] - fn count(self) -> usize { - self.len() - } - - fn nth(&mut self, n: usize) -> Option { - match self.state { - EscapeDefaultState::Backslash(c) if n == 0 => { - self.state = EscapeDefaultState::Char(c); - Some('\\') - }, - EscapeDefaultState::Backslash(c) if n == 1 => { - self.state = EscapeDefaultState::Done; - Some(c) - }, - EscapeDefaultState::Backslash(_) => { - self.state = EscapeDefaultState::Done; - None - }, - EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done; - - if n == 0 { - Some(c) - } else { - None - } - }, - EscapeDefaultState::Done => return None, - EscapeDefaultState::Unicode(ref mut i) => return i.nth(n), - } - } - - fn last(self) -> Option { - match self.state { - EscapeDefaultState::Unicode(iter) => iter.last(), - EscapeDefaultState::Done => None, - EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c), - } - } -} - -#[stable(feature = "exact_size_escape", since = "1.11.0")] -impl ExactSizeIterator for EscapeDefault { - fn len(&self) -> usize { - match self.state { - EscapeDefaultState::Done => 0, - EscapeDefaultState::Char(_) => 1, - EscapeDefaultState::Backslash(_) => 2, - EscapeDefaultState::Unicode(ref iter) => iter.len(), - } - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for EscapeDefault {} - -#[stable(feature = "char_struct_display", since = "1.16.0")] -impl fmt::Display for EscapeDefault { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - for c in self.clone() { - f.write_char(c)?; - } - Ok(()) - } -} - -/// An iterator that yields the literal escape code of a `char`. -/// -/// This `struct` is created by the [`escape_debug`] method on [`char`]. See its -/// documentation for more. -/// -/// [`escape_debug`]: ../../std/primitive.char.html#method.escape_debug -/// [`char`]: ../../std/primitive.char.html -#[stable(feature = "char_escape_debug", since = "1.20.0")] -#[derive(Clone, Debug)] -pub struct EscapeDebug(EscapeDefault); - -#[stable(feature = "char_escape_debug", since = "1.20.0")] -impl Iterator for EscapeDebug { - type Item = char; - fn next(&mut self) -> Option { self.0.next() } - fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } -} - -#[stable(feature = "char_escape_debug", since = "1.20.0")] -impl ExactSizeIterator for EscapeDebug { } - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for EscapeDebug {} - -#[stable(feature = "char_escape_debug", since = "1.20.0")] -impl fmt::Display for EscapeDebug { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Display::fmt(&self.0, f) - } -} - - - -/// An iterator over an iterator of bytes of the characters the bytes represent -/// as UTF-8 -#[unstable(feature = "decode_utf8", issue = "33906")] -#[derive(Clone, Debug)] -pub struct DecodeUtf8>(::iter::Peekable); - -/// Decodes an `Iterator` of bytes as UTF-8. -#[unstable(feature = "decode_utf8", issue = "33906")] -#[inline] -pub fn decode_utf8>(i: I) -> DecodeUtf8 { - DecodeUtf8(i.into_iter().peekable()) -} - -/// `::next` returns this for an invalid input sequence. -#[unstable(feature = "decode_utf8", issue = "33906")] -#[derive(PartialEq, Eq, Debug)] -pub struct InvalidSequence(()); - -#[unstable(feature = "decode_utf8", issue = "33906")] -impl> Iterator for DecodeUtf8 { - type Item = Result; - #[inline] - - fn next(&mut self) -> Option> { - self.0.next().map(|first_byte| { - // Emit InvalidSequence according to - // Unicode §5.22 Best Practice for U+FFFD Substitution - // http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630 - - // Roughly: consume at least one byte, - // then validate one byte at a time and stop before the first unexpected byte - // (which might be the valid start of the next byte sequence). - - let mut code_point; - macro_rules! first_byte { - ($mask: expr) => { - code_point = u32::from(first_byte & $mask) - } - } - macro_rules! continuation_byte { - () => { continuation_byte!(0x80...0xBF) }; - ($range: pat) => { - match self.0.peek() { - Some(&byte @ $range) => { - code_point = (code_point << 6) | u32::from(byte & 0b0011_1111); - self.0.next(); - } - _ => return Err(InvalidSequence(())) - } - } - } - - match first_byte { - 0x00...0x7F => { - first_byte!(0b1111_1111); - } - 0xC2...0xDF => { - first_byte!(0b0001_1111); - continuation_byte!(); - } - 0xE0 => { - first_byte!(0b0000_1111); - continuation_byte!(0xA0...0xBF); // 0x80...0x9F here are overlong - continuation_byte!(); - } - 0xE1...0xEC | 0xEE...0xEF => { - first_byte!(0b0000_1111); - continuation_byte!(); - continuation_byte!(); - } - 0xED => { - first_byte!(0b0000_1111); - continuation_byte!(0x80...0x9F); // 0xA0..0xBF here are surrogates - continuation_byte!(); - } - 0xF0 => { - first_byte!(0b0000_0111); - continuation_byte!(0x90...0xBF); // 0x80..0x8F here are overlong - continuation_byte!(); - continuation_byte!(); - } - 0xF1...0xF3 => { - first_byte!(0b0000_0111); - continuation_byte!(); - continuation_byte!(); - continuation_byte!(); - } - 0xF4 => { - first_byte!(0b0000_0111); - continuation_byte!(0x80...0x8F); // 0x90..0xBF here are beyond char::MAX - continuation_byte!(); - continuation_byte!(); - } - _ => return Err(InvalidSequence(())) // Illegal first byte, overlong, or beyond MAX - } - unsafe { - Ok(from_u32_unchecked(code_point)) - } - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let (lower, upper) = self.0.size_hint(); - - // A code point is at most 4 bytes long. - let min_code_points = lower / 4; - - (min_code_points, upper) - } -} - -#[unstable(feature = "decode_utf8", issue = "33906")] -impl> FusedIterator for DecodeUtf8 {} diff --git a/src/libcore/char/convert.rs b/src/libcore/char/convert.rs new file mode 100644 index 0000000000000..150562a4a9b29 --- /dev/null +++ b/src/libcore/char/convert.rs @@ -0,0 +1,304 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Character conversions. + +use convert::TryFrom; +use fmt; +use mem::transmute; +use str::FromStr; +use super::MAX; + +/// Converts a `u32` to a `char`. +/// +/// Note that all [`char`]s are valid [`u32`]s, and can be cast to one with +/// [`as`]: +/// +/// ``` +/// let c = '💯'; +/// let i = c as u32; +/// +/// assert_eq!(128175, i); +/// ``` +/// +/// However, the reverse is not true: not all valid [`u32`]s are valid +/// [`char`]s. `from_u32()` will return `None` if the input is not a valid value +/// for a [`char`]. +/// +/// [`char`]: ../../std/primitive.char.html +/// [`u32`]: ../../std/primitive.u32.html +/// [`as`]: ../../book/first-edition/casting-between-types.html#as +/// +/// For an unsafe version of this function which ignores these checks, see +/// [`from_u32_unchecked`]. +/// +/// [`from_u32_unchecked`]: fn.from_u32_unchecked.html +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::char; +/// +/// let c = char::from_u32(0x2764); +/// +/// assert_eq!(Some('❤'), c); +/// ``` +/// +/// Returning `None` when the input is not a valid [`char`]: +/// +/// ``` +/// use std::char; +/// +/// let c = char::from_u32(0x110000); +/// +/// assert_eq!(None, c); +/// ``` +#[inline] +#[stable(feature = "rust1", since = "1.0.0")] +pub fn from_u32(i: u32) -> Option { + char::try_from(i).ok() +} + +/// Converts a `u32` to a `char`, ignoring validity. +/// +/// Note that all [`char`]s are valid [`u32`]s, and can be cast to one with +/// [`as`]: +/// +/// ``` +/// let c = '💯'; +/// let i = c as u32; +/// +/// assert_eq!(128175, i); +/// ``` +/// +/// However, the reverse is not true: not all valid [`u32`]s are valid +/// [`char`]s. `from_u32_unchecked()` will ignore this, and blindly cast to +/// [`char`], possibly creating an invalid one. +/// +/// [`char`]: ../../std/primitive.char.html +/// [`u32`]: ../../std/primitive.u32.html +/// [`as`]: ../../book/first-edition/casting-between-types.html#as +/// +/// # Safety +/// +/// This function is unsafe, as it may construct invalid `char` values. +/// +/// For a safe version of this function, see the [`from_u32`] function. +/// +/// [`from_u32`]: fn.from_u32.html +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::char; +/// +/// let c = unsafe { char::from_u32_unchecked(0x2764) }; +/// +/// assert_eq!('❤', c); +/// ``` +#[inline] +#[stable(feature = "char_from_unchecked", since = "1.5.0")] +pub unsafe fn from_u32_unchecked(i: u32) -> char { + transmute(i) +} + +#[stable(feature = "char_convert", since = "1.13.0")] +impl From for u32 { + #[inline] + fn from(c: char) -> Self { + c as u32 + } +} + +/// Maps a byte in 0x00...0xFF to a `char` whose code point has the same value, in U+0000 to U+00FF. +/// +/// Unicode is designed such that this effectively decodes bytes +/// with the character encoding that IANA calls ISO-8859-1. +/// This encoding is compatible with ASCII. +/// +/// Note that this is different from ISO/IEC 8859-1 a.k.a. ISO 8859-1 (with one less hyphen), +/// which leaves some "blanks", byte values that are not assigned to any character. +/// ISO-8859-1 (the IANA one) assigns them to the C0 and C1 control codes. +/// +/// Note that this is *also* different from Windows-1252 a.k.a. code page 1252, +/// which is a superset ISO/IEC 8859-1 that assigns some (not all!) blanks +/// to punctuation and various Latin characters. +/// +/// To confuse things further, [on the Web](https://encoding.spec.whatwg.org/) +/// `ascii`, `iso-8859-1`, and `windows-1252` are all aliases +/// for a superset of Windows-1252 that fills the remaining blanks with corresponding +/// C0 and C1 control codes. +#[stable(feature = "char_convert", since = "1.13.0")] +impl From for char { + #[inline] + fn from(i: u8) -> Self { + i as char + } +} + + +/// An error which can be returned when parsing a char. +#[stable(feature = "char_from_str", since = "1.20.0")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ParseCharError { + kind: CharErrorKind, +} + +impl ParseCharError { + #[unstable(feature = "char_error_internals", + reason = "this method should not be available publicly", + issue = "0")] + #[doc(hidden)] + pub fn __description(&self) -> &str { + match self.kind { + CharErrorKind::EmptyString => { + "cannot parse char from empty string" + }, + CharErrorKind::TooManyChars => "too many characters in string" + } + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum CharErrorKind { + EmptyString, + TooManyChars, +} + +#[stable(feature = "char_from_str", since = "1.20.0")] +impl fmt::Display for ParseCharError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.__description().fmt(f) + } +} + + +#[stable(feature = "char_from_str", since = "1.20.0")] +impl FromStr for char { + type Err = ParseCharError; + + #[inline] + fn from_str(s: &str) -> Result { + let mut chars = s.chars(); + match (chars.next(), chars.next()) { + (None, _) => { + Err(ParseCharError { kind: CharErrorKind::EmptyString }) + }, + (Some(c), None) => Ok(c), + _ => { + Err(ParseCharError { kind: CharErrorKind::TooManyChars }) + } + } + } +} + + +#[stable(feature = "try_from", since = "1.26.0")] +impl TryFrom for char { + type Error = CharTryFromError; + + #[inline] + fn try_from(i: u32) -> Result { + if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) { + Err(CharTryFromError(())) + } else { + Ok(unsafe { from_u32_unchecked(i) }) + } + } +} + +/// The error type returned when a conversion from u32 to char fails. +#[stable(feature = "try_from", since = "1.26.0")] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct CharTryFromError(()); + +#[stable(feature = "try_from", since = "1.26.0")] +impl fmt::Display for CharTryFromError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + "converted integer out of range for `char`".fmt(f) + } +} + +/// Converts a digit in the given radix to a `char`. +/// +/// A 'radix' here is sometimes also called a 'base'. A radix of two +/// indicates a binary number, a radix of ten, decimal, and a radix of +/// sixteen, hexadecimal, to give some common values. Arbitrary +/// radices are supported. +/// +/// `from_digit()` will return `None` if the input is not a digit in +/// the given radix. +/// +/// # Panics +/// +/// Panics if given a radix larger than 36. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::char; +/// +/// let c = char::from_digit(4, 10); +/// +/// assert_eq!(Some('4'), c); +/// +/// // Decimal 11 is a single digit in base 16 +/// let c = char::from_digit(11, 16); +/// +/// assert_eq!(Some('b'), c); +/// ``` +/// +/// Returning `None` when the input is not a digit: +/// +/// ``` +/// use std::char; +/// +/// let c = char::from_digit(20, 10); +/// +/// assert_eq!(None, c); +/// ``` +/// +/// Passing a large radix, causing a panic: +/// +/// ``` +/// use std::thread; +/// use std::char; +/// +/// let result = thread::spawn(|| { +/// // this panics +/// let c = char::from_digit(1, 37); +/// }).join(); +/// +/// assert!(result.is_err()); +/// ``` +#[inline] +#[stable(feature = "rust1", since = "1.0.0")] +pub fn from_digit(num: u32, radix: u32) -> Option { + if radix > 36 { + panic!("from_digit: radix is too high (maximum 36)"); + } + if num < radix { + let num = num as u8; + if num < 10 { + Some((b'0' + num) as char) + } else { + Some((b'a' + num - 10) as char) + } + } else { + None + } +} + diff --git a/src/libcore/char/decode.rs b/src/libcore/char/decode.rs new file mode 100644 index 0000000000000..48b531104f882 --- /dev/null +++ b/src/libcore/char/decode.rs @@ -0,0 +1,259 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! UTF-8 and UTF-16 decoding iterators + +use fmt; +use iter::FusedIterator; +use super::from_u32_unchecked; + +/// An iterator over an iterator of bytes of the characters the bytes represent +/// as UTF-8 +#[unstable(feature = "decode_utf8", issue = "33906")] +#[derive(Clone, Debug)] +pub struct DecodeUtf8>(::iter::Peekable); + +/// Decodes an `Iterator` of bytes as UTF-8. +#[unstable(feature = "decode_utf8", issue = "33906")] +#[inline] +pub fn decode_utf8>(i: I) -> DecodeUtf8 { + DecodeUtf8(i.into_iter().peekable()) +} + +/// `::next` returns this for an invalid input sequence. +#[unstable(feature = "decode_utf8", issue = "33906")] +#[derive(PartialEq, Eq, Debug)] +pub struct InvalidSequence(()); + +#[unstable(feature = "decode_utf8", issue = "33906")] +impl> Iterator for DecodeUtf8 { + type Item = Result; + #[inline] + + fn next(&mut self) -> Option> { + self.0.next().map(|first_byte| { + // Emit InvalidSequence according to + // Unicode §5.22 Best Practice for U+FFFD Substitution + // http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630 + + // Roughly: consume at least one byte, + // then validate one byte at a time and stop before the first unexpected byte + // (which might be the valid start of the next byte sequence). + + let mut code_point; + macro_rules! first_byte { + ($mask: expr) => { + code_point = u32::from(first_byte & $mask) + } + } + macro_rules! continuation_byte { + () => { continuation_byte!(0x80...0xBF) }; + ($range: pat) => { + match self.0.peek() { + Some(&byte @ $range) => { + code_point = (code_point << 6) | u32::from(byte & 0b0011_1111); + self.0.next(); + } + _ => return Err(InvalidSequence(())) + } + } + } + + match first_byte { + 0x00...0x7F => { + first_byte!(0b1111_1111); + } + 0xC2...0xDF => { + first_byte!(0b0001_1111); + continuation_byte!(); + } + 0xE0 => { + first_byte!(0b0000_1111); + continuation_byte!(0xA0...0xBF); // 0x80...0x9F here are overlong + continuation_byte!(); + } + 0xE1...0xEC | 0xEE...0xEF => { + first_byte!(0b0000_1111); + continuation_byte!(); + continuation_byte!(); + } + 0xED => { + first_byte!(0b0000_1111); + continuation_byte!(0x80...0x9F); // 0xA0..0xBF here are surrogates + continuation_byte!(); + } + 0xF0 => { + first_byte!(0b0000_0111); + continuation_byte!(0x90...0xBF); // 0x80..0x8F here are overlong + continuation_byte!(); + continuation_byte!(); + } + 0xF1...0xF3 => { + first_byte!(0b0000_0111); + continuation_byte!(); + continuation_byte!(); + continuation_byte!(); + } + 0xF4 => { + first_byte!(0b0000_0111); + continuation_byte!(0x80...0x8F); // 0x90..0xBF here are beyond char::MAX + continuation_byte!(); + continuation_byte!(); + } + _ => return Err(InvalidSequence(())) // Illegal first byte, overlong, or beyond MAX + } + unsafe { + Ok(from_u32_unchecked(code_point)) + } + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (lower, upper) = self.0.size_hint(); + + // A code point is at most 4 bytes long. + let min_code_points = lower / 4; + + (min_code_points, upper) + } +} + +#[unstable(feature = "decode_utf8", issue = "33906")] +impl> FusedIterator for DecodeUtf8 {} + +/// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s. +#[stable(feature = "decode_utf16", since = "1.9.0")] +#[derive(Clone, Debug)] +pub struct DecodeUtf16 + where I: Iterator +{ + iter: I, + buf: Option, +} + +/// An error that can be returned when decoding UTF-16 code points. +#[stable(feature = "decode_utf16", since = "1.9.0")] +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct DecodeUtf16Error { + code: u16, +} + +/// Create an iterator over the UTF-16 encoded code points in `iter`, +/// returning unpaired surrogates as `Err`s. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::char::decode_utf16; +/// +/// fn main() { +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(decode_utf16(v.iter().cloned()) +/// .map(|r| r.map_err(|e| e.unpaired_surrogate())) +/// .collect::>(), +/// vec![Ok('𝄞'), +/// Ok('m'), Ok('u'), Ok('s'), +/// Err(0xDD1E), +/// Ok('i'), Ok('c'), +/// Err(0xD834)]); +/// } +/// ``` +/// +/// A lossy decoder can be obtained by replacing `Err` results with the replacement character: +/// +/// ``` +/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; +/// +/// fn main() { +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(decode_utf16(v.iter().cloned()) +/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) +/// .collect::(), +/// "𝄞mus�ic�"); +/// } +/// ``` +#[stable(feature = "decode_utf16", since = "1.9.0")] +#[inline] +pub fn decode_utf16>(iter: I) -> DecodeUtf16 { + DecodeUtf16 { + iter: iter.into_iter(), + buf: None, + } +} + +#[stable(feature = "decode_utf16", since = "1.9.0")] +impl> Iterator for DecodeUtf16 { + type Item = Result; + + fn next(&mut self) -> Option> { + let u = match self.buf.take() { + Some(buf) => buf, + None => self.iter.next()? + }; + + if u < 0xD800 || 0xDFFF < u { + // not a surrogate + Some(Ok(unsafe { from_u32_unchecked(u as u32) })) + } else if u >= 0xDC00 { + // a trailing surrogate + Some(Err(DecodeUtf16Error { code: u })) + } else { + let u2 = match self.iter.next() { + Some(u2) => u2, + // eof + None => return Some(Err(DecodeUtf16Error { code: u })), + }; + if u2 < 0xDC00 || u2 > 0xDFFF { + // not a trailing surrogate so we're not a valid + // surrogate pair, so rewind to redecode u2 next time. + self.buf = Some(u2); + return Some(Err(DecodeUtf16Error { code: u })); + } + + // all ok, so lets decode it. + let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; + Some(Ok(unsafe { from_u32_unchecked(c) })) + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.iter.size_hint(); + // we could be entirely valid surrogates (2 elements per + // char), or entirely non-surrogates (1 element per char) + (low / 2, high) + } +} + +impl DecodeUtf16Error { + /// Returns the unpaired surrogate which caused this error. + #[stable(feature = "decode_utf16", since = "1.9.0")] + pub fn unpaired_surrogate(&self) -> u16 { + self.code + } +} + +#[stable(feature = "decode_utf16", since = "1.9.0")] +impl fmt::Display for DecodeUtf16Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "unpaired surrogate found: {:x}", self.code) + } +} diff --git a/src/libstd_unicode/char.rs b/src/libcore/char/methods.rs similarity index 79% rename from src/libstd_unicode/char.rs rename to src/libcore/char/methods.rs index 33e47ade8cb9c..374adafef647d 100644 --- a/src/libstd_unicode/char.rs +++ b/src/libcore/char/methods.rs @@ -8,169 +8,13 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! A character type. -//! -//! The `char` type represents a single character. More specifically, since -//! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode -//! scalar value]', which is similar to, but not the same as, a '[Unicode code -//! point]'. -//! -//! [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value -//! [Unicode code point]: http://www.unicode.org/glossary/#code_point -//! -//! This module exists for technical reasons, the primary documentation for -//! `char` is directly on [the `char` primitive type](../../std/primitive.char.html) -//! itself. -//! -//! This module is the home of the iterator implementations for the iterators -//! implemented on `char`, as well as some useful constants and conversion -//! functions that convert various types to `char`. +//! impl char {} -#![stable(feature = "rust1", since = "1.0.0")] - -use core::char::CharExt as C; -use core::iter::FusedIterator; -use core::fmt::{self, Write}; -use tables::{conversions, derived_property, general_category, property}; - -// stable re-exports -#[stable(feature = "rust1", since = "1.0.0")] -pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked}; -#[stable(feature = "rust1", since = "1.0.0")] -pub use core::char::{EscapeDebug, EscapeDefault, EscapeUnicode}; -#[stable(feature = "char_from_str", since = "1.20.0")] -pub use core::char::ParseCharError; - -// unstable re-exports -#[stable(feature = "try_from", since = "1.26.0")] -pub use core::char::CharTryFromError; -#[unstable(feature = "decode_utf8", issue = "33906")] -pub use core::char::{DecodeUtf8, decode_utf8}; -#[unstable(feature = "unicode", issue = "27783")] -pub use tables::{UNICODE_VERSION}; -#[unstable(feature = "unicode", issue = "27783")] -pub use version::UnicodeVersion; - -/// Returns an iterator that yields the lowercase equivalent of a `char`. -/// -/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See -/// its documentation for more. -/// -/// [`to_lowercase`]: ../../std/primitive.char.html#method.to_lowercase -/// [`char`]: ../../std/primitive.char.html -#[stable(feature = "rust1", since = "1.0.0")] -#[derive(Debug, Clone)] -pub struct ToLowercase(CaseMappingIter); - -#[stable(feature = "rust1", since = "1.0.0")] -impl Iterator for ToLowercase { - type Item = char; - fn next(&mut self) -> Option { - self.0.next() - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for ToLowercase {} - -/// Returns an iterator that yields the uppercase equivalent of a `char`. -/// -/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See -/// its documentation for more. -/// -/// [`to_uppercase`]: ../../std/primitive.char.html#method.to_uppercase -/// [`char`]: ../../std/primitive.char.html -#[stable(feature = "rust1", since = "1.0.0")] -#[derive(Debug, Clone)] -pub struct ToUppercase(CaseMappingIter); - -#[stable(feature = "rust1", since = "1.0.0")] -impl Iterator for ToUppercase { - type Item = char; - fn next(&mut self) -> Option { - self.0.next() - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl FusedIterator for ToUppercase {} - -#[derive(Debug, Clone)] -enum CaseMappingIter { - Three(char, char, char), - Two(char, char), - One(char), - Zero, -} - -impl CaseMappingIter { - fn new(chars: [char; 3]) -> CaseMappingIter { - if chars[2] == '\0' { - if chars[1] == '\0' { - CaseMappingIter::One(chars[0]) // Including if chars[0] == '\0' - } else { - CaseMappingIter::Two(chars[0], chars[1]) - } - } else { - CaseMappingIter::Three(chars[0], chars[1], chars[2]) - } - } -} - -impl Iterator for CaseMappingIter { - type Item = char; - fn next(&mut self) -> Option { - match *self { - CaseMappingIter::Three(a, b, c) => { - *self = CaseMappingIter::Two(b, c); - Some(a) - } - CaseMappingIter::Two(b, c) => { - *self = CaseMappingIter::One(c); - Some(b) - } - CaseMappingIter::One(c) => { - *self = CaseMappingIter::Zero; - Some(c) - } - CaseMappingIter::Zero => None, - } - } -} - -impl fmt::Display for CaseMappingIter { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - CaseMappingIter::Three(a, b, c) => { - f.write_char(a)?; - f.write_char(b)?; - f.write_char(c) - } - CaseMappingIter::Two(b, c) => { - f.write_char(b)?; - f.write_char(c) - } - CaseMappingIter::One(c) => { - f.write_char(c) - } - CaseMappingIter::Zero => Ok(()), - } - } -} - -#[stable(feature = "char_struct_display", since = "1.16.0")] -impl fmt::Display for ToLowercase { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Display::fmt(&self.0, f) - } -} - -#[stable(feature = "char_struct_display", since = "1.16.0")] -impl fmt::Display for ToUppercase { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - fmt::Display::fmt(&self.0, f) - } -} +use slice; +use str::from_utf8_unchecked_mut; +use super::*; +use unicode::printable::is_printable; +use unicode::tables::{conversions, derived_property, general_category, property}; #[lang = "char"] impl char { @@ -223,7 +67,7 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_digit(self, radix: u32) -> bool { - C::is_digit(self, radix) + self.to_digit(radix).is_some() } /// Converts a `char` to a digit in the given radix. @@ -277,7 +121,17 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn to_digit(self, radix: u32) -> Option { - C::to_digit(self, radix) + if radix > 36 { + panic!("to_digit: radix is too high (maximum 36)"); + } + let val = match self { + '0' ... '9' => self as u32 - '0' as u32, + 'a' ... 'z' => self as u32 - 'a' as u32 + 10, + 'A' ... 'Z' => self as u32 - 'A' as u32 + 10, + _ => return None, + }; + if val < radix { Some(val) } + else { None } } /// Returns an iterator that yields the hexadecimal Unicode escape of a @@ -317,7 +171,20 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn escape_unicode(self) -> EscapeUnicode { - C::escape_unicode(self) + let c = self as u32; + + // or-ing 1 ensures that for c==0 the code computes that one + // digit should be printed and (which is the same) avoids the + // (31 - 32) underflow + let msb = 31 - (c | 1).leading_zeros(); + + // the index of the most significant hex digit + let ms_hex_digit = msb / 4; + EscapeUnicode { + c: self, + state: EscapeUnicodeState::Backslash, + hex_digit_idx: ms_hex_digit as usize, + } } /// Returns an iterator that yields the literal escape code of a character @@ -357,7 +224,15 @@ impl char { #[stable(feature = "char_escape_debug", since = "1.20.0")] #[inline] pub fn escape_debug(self) -> EscapeDebug { - C::escape_debug(self) + let init_state = match self { + '\t' => EscapeDefaultState::Backslash('t'), + '\r' => EscapeDefaultState::Backslash('r'), + '\n' => EscapeDefaultState::Backslash('n'), + '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), + c if is_printable(c) => EscapeDefaultState::Char(c), + c => EscapeDefaultState::Unicode(c.escape_unicode()), + }; + EscapeDebug(EscapeDefault { state: init_state }) } /// Returns an iterator that yields the literal escape code of a character @@ -412,7 +287,15 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn escape_default(self) -> EscapeDefault { - C::escape_default(self) + let init_state = match self { + '\t' => EscapeDefaultState::Backslash('t'), + '\r' => EscapeDefaultState::Backslash('r'), + '\n' => EscapeDefaultState::Backslash('n'), + '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), + '\x20' ... '\x7e' => EscapeDefaultState::Char(self), + _ => EscapeDefaultState::Unicode(self.escape_unicode()) + }; + EscapeDefault { state: init_state } } /// Returns the number of bytes this `char` would need if encoded in UTF-8. @@ -463,7 +346,16 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn len_utf8(self) -> usize { - C::len_utf8(self) + let code = self as u32; + if code < MAX_ONE_B { + 1 + } else if code < MAX_TWO_B { + 2 + } else if code < MAX_THREE_B { + 3 + } else { + 4 + } } /// Returns the number of 16-bit code units this `char` would need if @@ -488,7 +380,8 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn len_utf16(self) -> usize { - C::len_utf16(self) + let ch = self as u32; + if (ch & 0xFFFF) == ch { 1 } else { 2 } } /// Encodes this character as UTF-8 into the provided byte buffer, @@ -530,7 +423,35 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { - C::encode_utf8(self, dst) + let code = self as u32; + unsafe { + let len = + if code < MAX_ONE_B && !dst.is_empty() { + *dst.get_unchecked_mut(0) = code as u8; + 1 + } else if code < MAX_TWO_B && dst.len() >= 2 { + *dst.get_unchecked_mut(0) = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + *dst.get_unchecked_mut(1) = (code & 0x3F) as u8 | TAG_CONT; + 2 + } else if code < MAX_THREE_B && dst.len() >= 3 { + *dst.get_unchecked_mut(0) = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + *dst.get_unchecked_mut(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code & 0x3F) as u8 | TAG_CONT; + 3 + } else if dst.len() >= 4 { + *dst.get_unchecked_mut(0) = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + *dst.get_unchecked_mut(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT; + *dst.get_unchecked_mut(3) = (code & 0x3F) as u8 | TAG_CONT; + 4 + } else { + panic!("encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", + from_u32_unchecked(code).len_utf8(), + code, + dst.len()) + }; + from_utf8_unchecked_mut(dst.get_unchecked_mut(..len)) + } } /// Encodes this character as UTF-16 into the provided `u16` buffer, @@ -570,7 +491,25 @@ impl char { #[stable(feature = "unicode_encode_char", since = "1.15.0")] #[inline] pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { - C::encode_utf16(self, dst) + let mut code = self as u32; + unsafe { + if (code & 0xFFFF) == code && !dst.is_empty() { + // The BMP falls through (assuming non-surrogate, as it should) + *dst.get_unchecked_mut(0) = code as u16; + slice::from_raw_parts_mut(dst.as_mut_ptr(), 1) + } else if dst.len() >= 2 { + // Supplementary planes break into surrogates. + code -= 0x1_0000; + *dst.get_unchecked_mut(0) = 0xD800 | ((code >> 10) as u16); + *dst.get_unchecked_mut(1) = 0xDC00 | ((code as u16) & 0x3FF); + slice::from_raw_parts_mut(dst.as_mut_ptr(), 2) + } else { + panic!("encode_utf16: need {} units to encode U+{:X}, but the buffer has {}", + from_u32_unchecked(code).len_utf16(), + code, + dst.len()) + } + } } /// Returns true if this `char` is an alphabetic code point, and false if not. @@ -1452,140 +1391,3 @@ impl char { self.is_ascii() && (*self as u8).is_ascii_control() } } - -/// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s. -#[stable(feature = "decode_utf16", since = "1.9.0")] -#[derive(Clone, Debug)] -pub struct DecodeUtf16 - where I: Iterator -{ - iter: I, - buf: Option, -} - -/// An error that can be returned when decoding UTF-16 code points. -#[stable(feature = "decode_utf16", since = "1.9.0")] -#[derive(Debug, Clone, Eq, PartialEq)] -pub struct DecodeUtf16Error { - code: u16, -} - -/// Create an iterator over the UTF-16 encoded code points in `iter`, -/// returning unpaired surrogates as `Err`s. -/// -/// # Examples -/// -/// Basic usage: -/// -/// ``` -/// use std::char::decode_utf16; -/// -/// fn main() { -/// // 𝄞music -/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, -/// 0x0073, 0xDD1E, 0x0069, 0x0063, -/// 0xD834]; -/// -/// assert_eq!(decode_utf16(v.iter().cloned()) -/// .map(|r| r.map_err(|e| e.unpaired_surrogate())) -/// .collect::>(), -/// vec![Ok('𝄞'), -/// Ok('m'), Ok('u'), Ok('s'), -/// Err(0xDD1E), -/// Ok('i'), Ok('c'), -/// Err(0xD834)]); -/// } -/// ``` -/// -/// A lossy decoder can be obtained by replacing `Err` results with the replacement character: -/// -/// ``` -/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; -/// -/// fn main() { -/// // 𝄞music -/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, -/// 0x0073, 0xDD1E, 0x0069, 0x0063, -/// 0xD834]; -/// -/// assert_eq!(decode_utf16(v.iter().cloned()) -/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) -/// .collect::(), -/// "𝄞mus�ic�"); -/// } -/// ``` -#[stable(feature = "decode_utf16", since = "1.9.0")] -#[inline] -pub fn decode_utf16>(iter: I) -> DecodeUtf16 { - DecodeUtf16 { - iter: iter.into_iter(), - buf: None, - } -} - -#[stable(feature = "decode_utf16", since = "1.9.0")] -impl> Iterator for DecodeUtf16 { - type Item = Result; - - fn next(&mut self) -> Option> { - let u = match self.buf.take() { - Some(buf) => buf, - None => self.iter.next()? - }; - - if u < 0xD800 || 0xDFFF < u { - // not a surrogate - Some(Ok(unsafe { from_u32_unchecked(u as u32) })) - } else if u >= 0xDC00 { - // a trailing surrogate - Some(Err(DecodeUtf16Error { code: u })) - } else { - let u2 = match self.iter.next() { - Some(u2) => u2, - // eof - None => return Some(Err(DecodeUtf16Error { code: u })), - }; - if u2 < 0xDC00 || u2 > 0xDFFF { - // not a trailing surrogate so we're not a valid - // surrogate pair, so rewind to redecode u2 next time. - self.buf = Some(u2); - return Some(Err(DecodeUtf16Error { code: u })); - } - - // all ok, so lets decode it. - let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; - Some(Ok(unsafe { from_u32_unchecked(c) })) - } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let (low, high) = self.iter.size_hint(); - // we could be entirely valid surrogates (2 elements per - // char), or entirely non-surrogates (1 element per char) - (low / 2, high) - } -} - -impl DecodeUtf16Error { - /// Returns the unpaired surrogate which caused this error. - #[stable(feature = "decode_utf16", since = "1.9.0")] - pub fn unpaired_surrogate(&self) -> u16 { - self.code - } -} - -#[stable(feature = "decode_utf16", since = "1.9.0")] -impl fmt::Display for DecodeUtf16Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "unpaired surrogate found: {:x}", self.code) - } -} - -/// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a -/// decoding error. -/// -/// It can occur, for example, when giving ill-formed UTF-8 bytes to -/// [`String::from_utf8_lossy`](../../std/string/struct.String.html#method.from_utf8_lossy). -#[stable(feature = "decode_utf16", since = "1.9.0")] -pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}'; diff --git a/src/libcore/char/mod.rs b/src/libcore/char/mod.rs new file mode 100644 index 0000000000000..9edc0c88756b3 --- /dev/null +++ b/src/libcore/char/mod.rs @@ -0,0 +1,506 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! A character type. +//! +//! The `char` type represents a single character. More specifically, since +//! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode +//! scalar value]', which is similar to, but not the same as, a '[Unicode code +//! point]'. +//! +//! [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value +//! [Unicode code point]: http://www.unicode.org/glossary/#code_point +//! +//! This module exists for technical reasons, the primary documentation for +//! `char` is directly on [the `char` primitive type](../../std/primitive.char.html) +//! itself. +//! +//! This module is the home of the iterator implementations for the iterators +//! implemented on `char`, as well as some useful constants and conversion +//! functions that convert various types to `char`. + +#![allow(non_snake_case)] +#![stable(feature = "core_char", since = "1.2.0")] + +mod convert; +mod decode; +mod methods; + +// stable re-exports +#[stable(feature = "rust1", since = "1.0.0")] +pub use self::convert::{from_u32, from_digit}; +#[stable(feature = "char_from_unchecked", since = "1.5.0")] +pub use self::convert::from_u32_unchecked; +#[stable(feature = "char_from_str", since = "1.20.0")] +pub use self::convert::ParseCharError; +#[stable(feature = "try_from", since = "1.26.0")] +pub use self::convert::CharTryFromError; +#[stable(feature = "decode_utf16", since = "1.9.0")] +pub use self::decode::{decode_utf16, DecodeUtf16, DecodeUtf16Error}; + +// unstable re-exports +#[unstable(feature = "unicode_version", issue = "49726")] +pub use unicode::tables::UNICODE_VERSION; +#[unstable(feature = "unicode_version", issue = "49726")] +pub use unicode::version::UnicodeVersion; +#[unstable(feature = "decode_utf8", issue = "33906")] +pub use self::decode::{decode_utf8, DecodeUtf8, InvalidSequence}; + +use fmt::{self, Write}; +use iter::FusedIterator; + +// UTF-8 ranges and tags for encoding characters +const TAG_CONT: u8 = 0b1000_0000; +const TAG_TWO_B: u8 = 0b1100_0000; +const TAG_THREE_B: u8 = 0b1110_0000; +const TAG_FOUR_B: u8 = 0b1111_0000; +const MAX_ONE_B: u32 = 0x80; +const MAX_TWO_B: u32 = 0x800; +const MAX_THREE_B: u32 = 0x10000; + +/* + Lu Uppercase_Letter an uppercase letter + Ll Lowercase_Letter a lowercase letter + Lt Titlecase_Letter a digraphic character, with first part uppercase + Lm Modifier_Letter a modifier letter + Lo Other_Letter other letters, including syllables and ideographs + Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) + Mc Spacing_Mark a spacing combining mark (positive advance width) + Me Enclosing_Mark an enclosing combining mark + Nd Decimal_Number a decimal digit + Nl Letter_Number a letterlike numeric character + No Other_Number a numeric character of other type + Pc Connector_Punctuation a connecting punctuation mark, like a tie + Pd Dash_Punctuation a dash or hyphen punctuation mark + Ps Open_Punctuation an opening punctuation mark (of a pair) + Pe Close_Punctuation a closing punctuation mark (of a pair) + Pi Initial_Punctuation an initial quotation mark + Pf Final_Punctuation a final quotation mark + Po Other_Punctuation a punctuation mark of other type + Sm Math_Symbol a symbol of primarily mathematical use + Sc Currency_Symbol a currency sign + Sk Modifier_Symbol a non-letterlike modifier symbol + So Other_Symbol a symbol of other type + Zs Space_Separator a space character (of various non-zero widths) + Zl Line_Separator U+2028 LINE SEPARATOR only + Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only + Cc Control a C0 or C1 control code + Cf Format a format control character + Cs Surrogate a surrogate code point + Co Private_Use a private-use character + Cn Unassigned a reserved unassigned code point or a noncharacter +*/ + +/// The highest valid code point a `char` can have. +/// +/// A [`char`] is a [Unicode Scalar Value], which means that it is a [Code +/// Point], but only ones within a certain range. `MAX` is the highest valid +/// code point that's a valid [Unicode Scalar Value]. +/// +/// [`char`]: ../../std/primitive.char.html +/// [Unicode Scalar Value]: http://www.unicode.org/glossary/#unicode_scalar_value +/// [Code Point]: http://www.unicode.org/glossary/#code_point +#[stable(feature = "rust1", since = "1.0.0")] +pub const MAX: char = '\u{10ffff}'; + +/// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a +/// decoding error. +/// +/// It can occur, for example, when giving ill-formed UTF-8 bytes to +/// [`String::from_utf8_lossy`](../../std/string/struct.String.html#method.from_utf8_lossy). +#[stable(feature = "decode_utf16", since = "1.9.0")] +pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}'; + +/// Returns an iterator that yields the hexadecimal Unicode escape of a +/// character, as `char`s. +/// +/// This `struct` is created by the [`escape_unicode`] method on [`char`]. See +/// its documentation for more. +/// +/// [`escape_unicode`]: ../../std/primitive.char.html#method.escape_unicode +/// [`char`]: ../../std/primitive.char.html +#[derive(Clone, Debug)] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct EscapeUnicode { + c: char, + state: EscapeUnicodeState, + + // The index of the next hex digit to be printed (0 if none), + // i.e. the number of remaining hex digits to be printed; + // increasing from the least significant digit: 0x543210 + hex_digit_idx: usize, +} + +// The enum values are ordered so that their representation is the +// same as the remaining length (besides the hexadecimal digits). This +// likely makes `len()` a single load from memory) and inline-worth. +#[derive(Clone, Debug)] +enum EscapeUnicodeState { + Done, + RightBrace, + Value, + LeftBrace, + Type, + Backslash, +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for EscapeUnicode { + type Item = char; + + fn next(&mut self) -> Option { + match self.state { + EscapeUnicodeState::Backslash => { + self.state = EscapeUnicodeState::Type; + Some('\\') + } + EscapeUnicodeState::Type => { + self.state = EscapeUnicodeState::LeftBrace; + Some('u') + } + EscapeUnicodeState::LeftBrace => { + self.state = EscapeUnicodeState::Value; + Some('{') + } + EscapeUnicodeState::Value => { + let hex_digit = ((self.c as u32) >> (self.hex_digit_idx * 4)) & 0xf; + let c = from_digit(hex_digit, 16).unwrap(); + if self.hex_digit_idx == 0 { + self.state = EscapeUnicodeState::RightBrace; + } else { + self.hex_digit_idx -= 1; + } + Some(c) + } + EscapeUnicodeState::RightBrace => { + self.state = EscapeUnicodeState::Done; + Some('}') + } + EscapeUnicodeState::Done => None, + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let n = self.len(); + (n, Some(n)) + } + + #[inline] + fn count(self) -> usize { + self.len() + } + + fn last(self) -> Option { + match self.state { + EscapeUnicodeState::Done => None, + + EscapeUnicodeState::RightBrace | + EscapeUnicodeState::Value | + EscapeUnicodeState::LeftBrace | + EscapeUnicodeState::Type | + EscapeUnicodeState::Backslash => Some('}'), + } + } +} + +#[stable(feature = "exact_size_escape", since = "1.11.0")] +impl ExactSizeIterator for EscapeUnicode { + #[inline] + fn len(&self) -> usize { + // The match is a single memory access with no branching + self.hex_digit_idx + match self.state { + EscapeUnicodeState::Done => 0, + EscapeUnicodeState::RightBrace => 1, + EscapeUnicodeState::Value => 2, + EscapeUnicodeState::LeftBrace => 3, + EscapeUnicodeState::Type => 4, + EscapeUnicodeState::Backslash => 5, + } + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for EscapeUnicode {} + +#[stable(feature = "char_struct_display", since = "1.16.0")] +impl fmt::Display for EscapeUnicode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for c in self.clone() { + f.write_char(c)?; + } + Ok(()) + } +} + +/// An iterator that yields the literal escape code of a `char`. +/// +/// This `struct` is created by the [`escape_default`] method on [`char`]. See +/// its documentation for more. +/// +/// [`escape_default`]: ../../std/primitive.char.html#method.escape_default +/// [`char`]: ../../std/primitive.char.html +#[derive(Clone, Debug)] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct EscapeDefault { + state: EscapeDefaultState +} + +#[derive(Clone, Debug)] +enum EscapeDefaultState { + Done, + Char(char), + Backslash(char), + Unicode(EscapeUnicode), +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for EscapeDefault { + type Item = char; + + fn next(&mut self) -> Option { + match self.state { + EscapeDefaultState::Backslash(c) => { + self.state = EscapeDefaultState::Char(c); + Some('\\') + } + EscapeDefaultState::Char(c) => { + self.state = EscapeDefaultState::Done; + Some(c) + } + EscapeDefaultState::Done => None, + EscapeDefaultState::Unicode(ref mut iter) => iter.next(), + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let n = self.len(); + (n, Some(n)) + } + + #[inline] + fn count(self) -> usize { + self.len() + } + + fn nth(&mut self, n: usize) -> Option { + match self.state { + EscapeDefaultState::Backslash(c) if n == 0 => { + self.state = EscapeDefaultState::Char(c); + Some('\\') + }, + EscapeDefaultState::Backslash(c) if n == 1 => { + self.state = EscapeDefaultState::Done; + Some(c) + }, + EscapeDefaultState::Backslash(_) => { + self.state = EscapeDefaultState::Done; + None + }, + EscapeDefaultState::Char(c) => { + self.state = EscapeDefaultState::Done; + + if n == 0 { + Some(c) + } else { + None + } + }, + EscapeDefaultState::Done => return None, + EscapeDefaultState::Unicode(ref mut i) => return i.nth(n), + } + } + + fn last(self) -> Option { + match self.state { + EscapeDefaultState::Unicode(iter) => iter.last(), + EscapeDefaultState::Done => None, + EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c), + } + } +} + +#[stable(feature = "exact_size_escape", since = "1.11.0")] +impl ExactSizeIterator for EscapeDefault { + fn len(&self) -> usize { + match self.state { + EscapeDefaultState::Done => 0, + EscapeDefaultState::Char(_) => 1, + EscapeDefaultState::Backslash(_) => 2, + EscapeDefaultState::Unicode(ref iter) => iter.len(), + } + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for EscapeDefault {} + +#[stable(feature = "char_struct_display", since = "1.16.0")] +impl fmt::Display for EscapeDefault { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for c in self.clone() { + f.write_char(c)?; + } + Ok(()) + } +} + +/// An iterator that yields the literal escape code of a `char`. +/// +/// This `struct` is created by the [`escape_debug`] method on [`char`]. See its +/// documentation for more. +/// +/// [`escape_debug`]: ../../std/primitive.char.html#method.escape_debug +/// [`char`]: ../../std/primitive.char.html +#[stable(feature = "char_escape_debug", since = "1.20.0")] +#[derive(Clone, Debug)] +pub struct EscapeDebug(EscapeDefault); + +#[stable(feature = "char_escape_debug", since = "1.20.0")] +impl Iterator for EscapeDebug { + type Item = char; + fn next(&mut self) -> Option { self.0.next() } + fn size_hint(&self) -> (usize, Option) { self.0.size_hint() } +} + +#[stable(feature = "char_escape_debug", since = "1.20.0")] +impl ExactSizeIterator for EscapeDebug { } + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for EscapeDebug {} + +#[stable(feature = "char_escape_debug", since = "1.20.0")] +impl fmt::Display for EscapeDebug { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + +/// Returns an iterator that yields the lowercase equivalent of a `char`. +/// +/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See +/// its documentation for more. +/// +/// [`to_lowercase`]: ../../std/primitive.char.html#method.to_lowercase +/// [`char`]: ../../std/primitive.char.html +#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Debug, Clone)] +pub struct ToLowercase(CaseMappingIter); + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for ToLowercase { + type Item = char; + fn next(&mut self) -> Option { + self.0.next() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for ToLowercase {} + +/// Returns an iterator that yields the uppercase equivalent of a `char`. +/// +/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See +/// its documentation for more. +/// +/// [`to_uppercase`]: ../../std/primitive.char.html#method.to_uppercase +/// [`char`]: ../../std/primitive.char.html +#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Debug, Clone)] +pub struct ToUppercase(CaseMappingIter); + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for ToUppercase { + type Item = char; + fn next(&mut self) -> Option { + self.0.next() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for ToUppercase {} + +#[derive(Debug, Clone)] +enum CaseMappingIter { + Three(char, char, char), + Two(char, char), + One(char), + Zero, +} + +impl CaseMappingIter { + fn new(chars: [char; 3]) -> CaseMappingIter { + if chars[2] == '\0' { + if chars[1] == '\0' { + CaseMappingIter::One(chars[0]) // Including if chars[0] == '\0' + } else { + CaseMappingIter::Two(chars[0], chars[1]) + } + } else { + CaseMappingIter::Three(chars[0], chars[1], chars[2]) + } + } +} + +impl Iterator for CaseMappingIter { + type Item = char; + fn next(&mut self) -> Option { + match *self { + CaseMappingIter::Three(a, b, c) => { + *self = CaseMappingIter::Two(b, c); + Some(a) + } + CaseMappingIter::Two(b, c) => { + *self = CaseMappingIter::One(c); + Some(b) + } + CaseMappingIter::One(c) => { + *self = CaseMappingIter::Zero; + Some(c) + } + CaseMappingIter::Zero => None, + } + } +} + +impl fmt::Display for CaseMappingIter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + CaseMappingIter::Three(a, b, c) => { + f.write_char(a)?; + f.write_char(b)?; + f.write_char(c) + } + CaseMappingIter::Two(b, c) => { + f.write_char(b)?; + f.write_char(c) + } + CaseMappingIter::One(c) => { + f.write_char(c) + } + CaseMappingIter::Zero => Ok(()), + } + } +} + +#[stable(feature = "char_struct_display", since = "1.16.0")] +impl fmt::Display for ToLowercase { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + +#[stable(feature = "char_struct_display", since = "1.16.0")] +impl fmt::Display for ToUppercase { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs index e194b173aa718..9ff8465bc0f2d 100644 --- a/src/libcore/lib.rs +++ b/src/libcore/lib.rs @@ -180,12 +180,13 @@ pub mod hash; pub mod fmt; pub mod time; +pub mod unicode; + /* Heap memory allocator trait */ #[allow(missing_docs)] pub mod heap; // note: does not need to be public -mod char_private; mod iter_private; mod tuple; mod unit; diff --git a/src/libcore/prelude/v1.rs b/src/libcore/prelude/v1.rs index d43496c387cb8..cc3ad71117a5d 100644 --- a/src/libcore/prelude/v1.rs +++ b/src/libcore/prelude/v1.rs @@ -62,6 +62,3 @@ pub use slice::SliceExt; #[stable(feature = "core_prelude", since = "1.4.0")] #[doc(no_inline)] pub use str::StrExt; -#[stable(feature = "core_prelude", since = "1.4.0")] -#[doc(no_inline)] -pub use char::CharExt; diff --git a/src/libstd_unicode/lossy.rs b/src/libcore/str/lossy.rs similarity index 98% rename from src/libstd_unicode/lossy.rs rename to src/libcore/str/lossy.rs index cc8e93308a527..30b7267da7c5e 100644 --- a/src/libstd_unicode/lossy.rs +++ b/src/libcore/str/lossy.rs @@ -8,12 +8,11 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use core::str as core_str; -use core::fmt; -use core::fmt::Write; use char; -use core::mem; - +use str as core_str; +use fmt; +use fmt::Write; +use mem; /// Lossy UTF-8 string. #[unstable(feature = "str_internals", issue = "0")] diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 1185b7acaae1f..f1fe23092de93 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -19,13 +19,17 @@ use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher}; use char; use fmt; -use iter::{Map, Cloned, FusedIterator, TrustedLen}; +use iter::{Map, Cloned, FusedIterator, TrustedLen, Filter}; use iter_private::TrustedRandomAccess; use slice::{self, SliceIndex}; use mem; pub mod pattern; +#[unstable(feature = "str_internals", issue = "0")] +#[allow(missing_docs)] +pub mod lossy; + /// A trait to abstract the idea of creating a new instance of a type from a /// string. /// @@ -2212,6 +2216,18 @@ pub trait StrExt { fn is_empty(&self) -> bool; #[stable(feature = "core", since = "1.6.0")] fn parse(&self) -> Result; + #[stable(feature = "split_whitespace", since = "1.1.0")] + fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>; + #[stable(feature = "unicode_methods_on_intrinsics", since = "1.27.0")] + fn is_whitespace(&self) -> bool; + #[stable(feature = "unicode_methods_on_intrinsics", since = "1.27.0")] + fn is_alphanumeric(&self) -> bool; + #[stable(feature = "rust1", since = "1.0.0")] + fn trim(&self) -> &str; + #[stable(feature = "rust1", since = "1.0.0")] + fn trim_left(&self) -> &str; + #[stable(feature = "rust1", since = "1.0.0")] + fn trim_right(&self) -> &str; } // truncate `&str` to length at most equal to `max` @@ -2532,6 +2548,36 @@ impl StrExt for str { #[inline] fn parse(&self) -> Result { FromStr::from_str(self) } + + #[inline] + fn split_whitespace(&self) -> SplitWhitespace { + SplitWhitespace { inner: self.split(IsWhitespace).filter(IsNotEmpty) } + } + + #[inline] + fn is_whitespace(&self) -> bool { + self.chars().all(|c| c.is_whitespace()) + } + + #[inline] + fn is_alphanumeric(&self) -> bool { + self.chars().all(|c| c.is_alphanumeric()) + } + + #[inline] + fn trim(&self) -> &str { + self.trim_matches(|c: char| c.is_whitespace()) + } + + #[inline] + fn trim_left(&self) -> &str { + self.trim_left_matches(|c: char| c.is_whitespace()) + } + + #[inline] + fn trim_right(&self) -> &str { + self.trim_right_matches(|c: char| c.is_whitespace()) + } } #[stable(feature = "rust1", since = "1.0.0")] @@ -2547,3 +2593,75 @@ impl<'a> Default for &'a str { /// Creates an empty str fn default() -> &'a str { "" } } + +/// An iterator over the non-whitespace substrings of a string, +/// separated by any amount of whitespace. +/// +/// This struct is created by the [`split_whitespace`] method on [`str`]. +/// See its documentation for more. +/// +/// [`split_whitespace`]: ../../std/primitive.str.html#method.split_whitespace +/// [`str`]: ../../std/primitive.str.html +#[stable(feature = "split_whitespace", since = "1.1.0")] +#[derive(Clone, Debug)] +pub struct SplitWhitespace<'a> { + inner: Filter, IsNotEmpty>, +} + +#[derive(Clone)] +struct IsWhitespace; + +impl FnOnce<(char, )> for IsWhitespace { + type Output = bool; + + #[inline] + extern "rust-call" fn call_once(mut self, arg: (char, )) -> bool { + self.call_mut(arg) + } +} + +impl FnMut<(char, )> for IsWhitespace { + #[inline] + extern "rust-call" fn call_mut(&mut self, arg: (char, )) -> bool { + arg.0.is_whitespace() + } +} + +#[derive(Clone)] +struct IsNotEmpty; + +impl<'a, 'b> FnOnce<(&'a &'b str, )> for IsNotEmpty { + type Output = bool; + + #[inline] + extern "rust-call" fn call_once(mut self, arg: (&&str, )) -> bool { + self.call_mut(arg) + } +} + +impl<'a, 'b> FnMut<(&'a &'b str, )> for IsNotEmpty { + #[inline] + extern "rust-call" fn call_mut(&mut self, arg: (&&str, )) -> bool { + !arg.0.is_empty() + } +} + + +#[stable(feature = "split_whitespace", since = "1.1.0")] +impl<'a> Iterator for SplitWhitespace<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } +} + +#[stable(feature = "split_whitespace", since = "1.1.0")] +impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl<'a> FusedIterator for SplitWhitespace<'a> {} diff --git a/src/libcore/tests/lib.rs b/src/libcore/tests/lib.rs index c3162899bbd01..149269263dc8a 100644 --- a/src/libcore/tests/lib.rs +++ b/src/libcore/tests/lib.rs @@ -33,6 +33,7 @@ #![feature(sort_internals)] #![feature(specialization)] #![feature(step_trait)] +#![feature(str_internals)] #![feature(test)] #![feature(trusted_len)] #![feature(try_trait)] @@ -68,4 +69,5 @@ mod ptr; mod result; mod slice; mod str; +mod str_lossy; mod tuple; diff --git a/src/libstd_unicode/tests/lossy.rs b/src/libcore/tests/str_lossy.rs similarity index 99% rename from src/libstd_unicode/tests/lossy.rs rename to src/libcore/tests/str_lossy.rs index e05d066855635..69e28256da9c3 100644 --- a/src/libstd_unicode/tests/lossy.rs +++ b/src/libcore/tests/str_lossy.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use std_unicode::lossy::*; +use core::str::lossy::*; #[test] fn chunks() { diff --git a/src/libstd_unicode/bool_trie.rs b/src/libcore/unicode/bool_trie.rs similarity index 100% rename from src/libstd_unicode/bool_trie.rs rename to src/libcore/unicode/bool_trie.rs diff --git a/src/libcore/unicode/mod.rs b/src/libcore/unicode/mod.rs new file mode 100644 index 0000000000000..b6b033adc046e --- /dev/null +++ b/src/libcore/unicode/mod.rs @@ -0,0 +1,27 @@ +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#![unstable(feature = "unicode_internals", issue = "0")] +#![allow(missing_docs)] + +mod bool_trie; +pub(crate) mod printable; +pub(crate) mod tables; +pub(crate) mod version; + +// For use in liballoc, not re-exported in libstd. +pub mod derived_property { + pub use unicode::tables::derived_property::{Case_Ignorable, Cased}; +} + +// For use in libsyntax +pub mod property { + pub use unicode::tables::property::Pattern_White_Space; +} diff --git a/src/etc/char_private.py b/src/libcore/unicode/printable.py similarity index 98% rename from src/etc/char_private.py rename to src/libcore/unicode/printable.py index cfe5b01e934e7..9410dafbbc364 100644 --- a/src/etc/char_private.py +++ b/src/libcore/unicode/printable.py @@ -187,7 +187,7 @@ def main(): // option. This file may not be copied, modified, or distributed // except according to those terms. -// NOTE: The following code was generated by "src/etc/char_private.py", +// NOTE: The following code was generated by "src/libcore/unicode/printable.py", // do not edit directly! fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], diff --git a/src/libcore/char_private.rs b/src/libcore/unicode/printable.rs similarity index 99% rename from src/libcore/char_private.rs rename to src/libcore/unicode/printable.rs index e6803745ab543..4426c32eebcee 100644 --- a/src/libcore/char_private.rs +++ b/src/libcore/unicode/printable.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -// NOTE: The following code was generated by "src/etc/char_private.py", +// NOTE: The following code was generated by "src/libcore/unicode/printable.py", // do not edit directly! fn check(x: u16, singletonuppers: &[(u8, u8)], singletonlowers: &[u8], diff --git a/src/libstd_unicode/tables.rs b/src/libcore/unicode/tables.rs similarity index 99% rename from src/libstd_unicode/tables.rs rename to src/libcore/unicode/tables.rs index b53953b62a7af..3fbbc011bc41d 100644 --- a/src/libstd_unicode/tables.rs +++ b/src/libcore/unicode/tables.rs @@ -12,11 +12,12 @@ #![allow(missing_docs, non_upper_case_globals, non_snake_case)] -use version::UnicodeVersion; -use bool_trie::{BoolTrie, SmallBoolTrie}; +use unicode::version::UnicodeVersion; +use unicode::bool_trie::{BoolTrie, SmallBoolTrie}; /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of -/// `CharExt` and `UnicodeStrPrelude` traits are based on. +/// `char` and `str` methods are based on. +#[unstable(feature = "unicode_version", issue = "49726")] pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion { major: 10, minor: 0, @@ -1138,9 +1139,6 @@ pub mod property { } pub mod conversions { - use core::option::Option; - use core::option::Option::{Some, None}; - pub fn to_lower(c: char) -> [char; 3] { match bsearch_case_table(c, to_lowercase_table) { None => [c, '\0', '\0'], diff --git a/src/libstd_unicode/unicode.py b/src/libcore/unicode/unicode.py similarity index 98% rename from src/libstd_unicode/unicode.py rename to src/libcore/unicode/unicode.py index a86294930861b..75ec01944bfb9 100755 --- a/src/libstd_unicode/unicode.py +++ b/src/libcore/unicode/unicode.py @@ -39,8 +39,8 @@ #![allow(missing_docs, non_upper_case_globals, non_snake_case)] -use version::UnicodeVersion; -use bool_trie::{BoolTrie, SmallBoolTrie}; +use unicode::version::UnicodeVersion; +use unicode::bool_trie::{BoolTrie, SmallBoolTrie}; ''' # Mapping taken from Table 12 from: @@ -408,9 +408,6 @@ def emit_property_module(f, mod, tbl, emit): def emit_conversions_module(f, to_upper, to_lower, to_title): f.write("pub mod conversions {") f.write(""" - use core::option::Option; - use core::option::Option::{Some, None}; - pub fn to_lower(c: char) -> [char; 3] { match bsearch_case_table(c, to_lowercase_table) { None => [c, '\\0', '\\0'], @@ -473,7 +470,8 @@ def emit_norm_module(f, canon, compat, combine, norm_props): unicode_version = re.search(pattern, readme.read()).groups() rf.write(""" /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of -/// `CharExt` and `UnicodeStrPrelude` traits are based on. +/// `char` and `str` methods are based on. +#[unstable(feature = "unicode_version", issue = "49726")] pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion { major: %s, minor: %s, diff --git a/src/libstd_unicode/version.rs b/src/libcore/unicode/version.rs similarity index 93% rename from src/libstd_unicode/version.rs rename to src/libcore/unicode/version.rs index d82a749d91786..59ebf5f501269 100644 --- a/src/libstd_unicode/version.rs +++ b/src/libcore/unicode/version.rs @@ -12,6 +12,7 @@ /// /// See also: #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] +#[unstable(feature = "unicode_version", issue = "49726")] pub struct UnicodeVersion { /// Major version. pub major: u32, diff --git a/src/librustdoc/lib.rs b/src/librustdoc/lib.rs index 730f61e0aa66d..4a062e9a55b30 100644 --- a/src/librustdoc/lib.rs +++ b/src/librustdoc/lib.rs @@ -20,7 +20,6 @@ #![feature(fs_read_write)] #![feature(set_stdio)] #![feature(test)] -#![feature(unicode)] #![feature(vec_remove_item)] #![feature(entry_and_modify)] @@ -41,7 +40,6 @@ extern crate serialize; #[macro_use] extern crate syntax; extern crate syntax_pos; extern crate test as testing; -extern crate std_unicode; #[macro_use] extern crate log; extern crate rustc_errors as errors; extern crate pulldown_cmark; diff --git a/src/libstd/lib.rs b/src/libstd/lib.rs index 672723341eb57..c82d600e4a184 100644 --- a/src/libstd/lib.rs +++ b/src/libstd/lib.rs @@ -307,7 +307,6 @@ #![feature(toowned_clone_into)] #![feature(try_reserve)] #![feature(unboxed_closures)] -#![feature(unicode)] #![feature(untagged_unions)] #![feature(unwind_attributes)] #![feature(vec_push_all)] @@ -354,7 +353,6 @@ extern crate core as __core; #[macro_reexport(vec, format)] extern crate alloc; extern crate alloc_system; -extern crate std_unicode; #[doc(masked)] extern crate libc; @@ -455,7 +453,7 @@ pub use alloc::string; #[stable(feature = "rust1", since = "1.0.0")] pub use alloc::vec; #[stable(feature = "rust1", since = "1.0.0")] -pub use std_unicode::char; +pub use core::char; #[stable(feature = "i128", since = "1.26.0")] pub use core::u128; diff --git a/src/libstd/sys/redox/os_str.rs b/src/libstd/sys/redox/os_str.rs index da27787babb97..eb3a1ead58c94 100644 --- a/src/libstd/sys/redox/os_str.rs +++ b/src/libstd/sys/redox/os_str.rs @@ -19,7 +19,7 @@ use rc::Rc; use sync::Arc; use sys_common::{AsInner, IntoInner}; use sys_common::bytestring::debug_fmt_bytestring; -use std_unicode::lossy::Utf8Lossy; +use core::str::lossy::Utf8Lossy; #[derive(Clone, Hash)] pub struct Buf { diff --git a/src/libstd/sys/unix/os_str.rs b/src/libstd/sys/unix/os_str.rs index e43bc6da5f1f8..01c0fb830aadd 100644 --- a/src/libstd/sys/unix/os_str.rs +++ b/src/libstd/sys/unix/os_str.rs @@ -19,7 +19,7 @@ use rc::Rc; use sync::Arc; use sys_common::{AsInner, IntoInner}; use sys_common::bytestring::debug_fmt_bytestring; -use std_unicode::lossy::Utf8Lossy; +use core::str::lossy::Utf8Lossy; #[derive(Clone, Hash)] pub struct Buf { diff --git a/src/libstd/sys/wasm/os_str.rs b/src/libstd/sys/wasm/os_str.rs index 84f560af69bec..e0da5bdf36c14 100644 --- a/src/libstd/sys/wasm/os_str.rs +++ b/src/libstd/sys/wasm/os_str.rs @@ -19,7 +19,7 @@ use rc::Rc; use sync::Arc; use sys_common::{AsInner, IntoInner}; use sys_common::bytestring::debug_fmt_bytestring; -use std_unicode::lossy::Utf8Lossy; +use core::str::lossy::Utf8Lossy; #[derive(Clone, Hash)] pub struct Buf { diff --git a/src/libstd/sys_common/bytestring.rs b/src/libstd/sys_common/bytestring.rs index eb9cad0991505..971b83938c167 100644 --- a/src/libstd/sys_common/bytestring.rs +++ b/src/libstd/sys_common/bytestring.rs @@ -11,7 +11,7 @@ #![allow(dead_code)] use fmt::{Formatter, Result, Write}; -use std_unicode::lossy::{Utf8Lossy, Utf8LossyChunk}; +use core::str::lossy::{Utf8Lossy, Utf8LossyChunk}; pub fn debug_fmt_bytestring(slice: &[u8], f: &mut Formatter) -> Result { // Writes out a valid unicode string with the correct escape sequences diff --git a/src/libstd_unicode/Cargo.toml b/src/libstd_unicode/Cargo.toml index 283070a0e2cf7..b1c55c2e4b6ce 100644 --- a/src/libstd_unicode/Cargo.toml +++ b/src/libstd_unicode/Cargo.toml @@ -9,10 +9,6 @@ path = "lib.rs" test = false bench = false -[[test]] -name = "std_unicode_tests" -path = "tests/lib.rs" - [dependencies] core = { path = "../libcore" } compiler_builtins = { path = "../rustc/compiler_builtins_shim" } diff --git a/src/libstd_unicode/lib.rs b/src/libstd_unicode/lib.rs index cf8c101a2f91f..c0d47f1fcb42b 100644 --- a/src/libstd_unicode/lib.rs +++ b/src/libstd_unicode/lib.rs @@ -27,38 +27,10 @@ html_playground_url = "https://play.rust-lang.org/", issue_tracker_base_url = "https://github.com/rust-lang/rust/issues/", test(no_crate_inject, attr(allow(unused_variables), deny(warnings))))] -#![deny(missing_debug_implementations)] #![no_std] -#![feature(ascii_ctype)] -#![feature(core_char_ext)] -#![feature(str_internals)] -#![feature(decode_utf8)] -#![feature(fn_traits)] -#![feature(lang_items)] -#![feature(non_exhaustive)] +#![feature(unicode_internals)] #![feature(staged_api)] -#![feature(unboxed_closures)] +#![rustc_deprecated(since = "1.27.0", reason = "moved into libcore")] -mod bool_trie; -mod tables; -mod u_str; -mod version; -pub mod char; -pub mod lossy; - -#[allow(deprecated)] -pub mod str { - pub use u_str::{SplitWhitespace, UnicodeStr}; - pub use u_str::Utf16Encoder; -} - -// For use in liballoc, not re-exported in libstd. -pub mod derived_property { - pub use tables::derived_property::{Case_Ignorable, Cased}; -} - -// For use in libsyntax -pub mod property { - pub use tables::property::Pattern_White_Space; -} +pub use core::unicode::*; diff --git a/src/libstd_unicode/tests/lib.rs b/src/libstd_unicode/tests/lib.rs deleted file mode 100644 index 9535ec18763e6..0000000000000 --- a/src/libstd_unicode/tests/lib.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#![feature(str_internals, unicode)] - -extern crate std_unicode; - -mod lossy; diff --git a/src/libstd_unicode/u_str.rs b/src/libstd_unicode/u_str.rs deleted file mode 100644 index a72e1210d93f6..0000000000000 --- a/src/libstd_unicode/u_str.rs +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! Unicode-intensive string manipulations. -//! -//! This module provides functionality to `str` that requires the Unicode -//! methods provided by the unicode parts of the CharExt trait. - -use core::char; -use core::iter::{Filter, FusedIterator}; -use core::str::Split; - -/// An iterator over the non-whitespace substrings of a string, -/// separated by any amount of whitespace. -/// -/// This struct is created by the [`split_whitespace`] method on [`str`]. -/// See its documentation for more. -/// -/// [`split_whitespace`]: ../../std/primitive.str.html#method.split_whitespace -/// [`str`]: ../../std/primitive.str.html -#[stable(feature = "split_whitespace", since = "1.1.0")] -#[derive(Clone, Debug)] -pub struct SplitWhitespace<'a> { - inner: Filter, IsNotEmpty>, -} - -/// Methods for Unicode string slices -#[allow(missing_docs)] // docs in liballoc -pub trait UnicodeStr { - fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>; - fn is_whitespace(&self) -> bool; - fn is_alphanumeric(&self) -> bool; - fn trim(&self) -> &str; - fn trim_left(&self) -> &str; - fn trim_right(&self) -> &str; -} - -impl UnicodeStr for str { - #[inline] - fn split_whitespace(&self) -> SplitWhitespace { - SplitWhitespace { inner: self.split(IsWhitespace).filter(IsNotEmpty) } - } - - #[inline] - fn is_whitespace(&self) -> bool { - self.chars().all(|c| c.is_whitespace()) - } - - #[inline] - fn is_alphanumeric(&self) -> bool { - self.chars().all(|c| c.is_alphanumeric()) - } - - #[inline] - fn trim(&self) -> &str { - self.trim_matches(|c: char| c.is_whitespace()) - } - - #[inline] - fn trim_left(&self) -> &str { - self.trim_left_matches(|c: char| c.is_whitespace()) - } - - #[inline] - fn trim_right(&self) -> &str { - self.trim_right_matches(|c: char| c.is_whitespace()) - } -} - -/// Iterator adaptor for encoding `char`s to UTF-16. -#[derive(Clone)] -#[allow(missing_debug_implementations)] -pub struct Utf16Encoder { - chars: I, - extra: u16, -} - -impl Utf16Encoder { - /// Create a UTF-16 encoder from any `char` iterator. - pub fn new(chars: I) -> Utf16Encoder - where I: Iterator - { - Utf16Encoder { - chars, - extra: 0, - } - } -} - -impl Iterator for Utf16Encoder - where I: Iterator -{ - type Item = u16; - - #[inline] - fn next(&mut self) -> Option { - if self.extra != 0 { - let tmp = self.extra; - self.extra = 0; - return Some(tmp); - } - - let mut buf = [0; 2]; - self.chars.next().map(|ch| { - let n = CharExt::encode_utf16(ch, &mut buf).len(); - if n == 2 { - self.extra = buf[1]; - } - buf[0] - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let (low, high) = self.chars.size_hint(); - // every char gets either one u16 or two u16, - // so this iterator is between 1 or 2 times as - // long as the underlying iterator. - (low, high.and_then(|n| n.checked_mul(2))) - } -} - -impl FusedIterator for Utf16Encoder - where I: FusedIterator {} - -#[derive(Clone)] -struct IsWhitespace; - -impl FnOnce<(char, )> for IsWhitespace { - type Output = bool; - - #[inline] - extern "rust-call" fn call_once(mut self, arg: (char, )) -> bool { - self.call_mut(arg) - } -} - -impl FnMut<(char, )> for IsWhitespace { - #[inline] - extern "rust-call" fn call_mut(&mut self, arg: (char, )) -> bool { - arg.0.is_whitespace() - } -} - -#[derive(Clone)] -struct IsNotEmpty; - -impl<'a, 'b> FnOnce<(&'a &'b str, )> for IsNotEmpty { - type Output = bool; - - #[inline] - extern "rust-call" fn call_once(mut self, arg: (&&str, )) -> bool { - self.call_mut(arg) - } -} - -impl<'a, 'b> FnMut<(&'a &'b str, )> for IsNotEmpty { - #[inline] - extern "rust-call" fn call_mut(&mut self, arg: (&&str, )) -> bool { - !arg.0.is_empty() - } -} - - -#[stable(feature = "split_whitespace", since = "1.1.0")] -impl<'a> Iterator for SplitWhitespace<'a> { - type Item = &'a str; - - fn next(&mut self) -> Option<&'a str> { - self.inner.next() - } -} - -#[stable(feature = "split_whitespace", since = "1.1.0")] -impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() - } -} - -#[stable(feature = "fused", since = "1.26.0")] -impl<'a> FusedIterator for SplitWhitespace<'a> {} diff --git a/src/libsyntax/lib.rs b/src/libsyntax/lib.rs index d80430f609b3a..4e1e9fa2b46ea 100644 --- a/src/libsyntax/lib.rs +++ b/src/libsyntax/lib.rs @@ -19,7 +19,7 @@ html_root_url = "https://doc.rust-lang.org/nightly/", test(attr(deny(warnings))))] -#![feature(unicode)] +#![feature(unicode_internals)] #![feature(rustc_diagnostic_macros)] #![feature(non_exhaustive)] #![feature(const_atomic_usize_new)] @@ -32,9 +32,9 @@ extern crate rustc_cratesio_shim; #[macro_use] extern crate bitflags; +extern crate core; extern crate serialize; #[macro_use] extern crate log; -extern crate std_unicode; pub extern crate rustc_errors as errors; extern crate syntax_pos; extern crate rustc_data_structures; diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 39b2f77f2305e..cb3323c7eca4e 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -15,7 +15,7 @@ use errors::{FatalError, DiagnosticBuilder}; use parse::{token, ParseSess}; use str::char_at; use symbol::{Symbol, keywords}; -use std_unicode::property::Pattern_White_Space; +use core::unicode::property::Pattern_White_Space; use std::borrow::Cow; use std::char; diff --git a/src/test/compile-fail/single-primitive-inherent-impl.rs b/src/test/compile-fail/single-primitive-inherent-impl.rs index 5ceb870528a65..365387c3e5e27 100644 --- a/src/test/compile-fail/single-primitive-inherent-impl.rs +++ b/src/test/compile-fail/single-primitive-inherent-impl.rs @@ -15,9 +15,9 @@ #![no_std] // OK -#[lang = "char"] -impl char {} +#[lang = "str"] +impl str {} -impl char { -//~^ error: only a single inherent implementation marked with `#[lang = "char"]` is allowed for the `char` primitive +impl str { +//~^ error: only a single inherent implementation marked with `#[lang = "str"]` is allowed for the `str` primitive } diff --git a/src/test/run-pass/char_unicode.rs b/src/test/run-pass/char_unicode.rs index b4884acdd078f..bfc7faac06ebe 100644 --- a/src/test/run-pass/char_unicode.rs +++ b/src/test/run-pass/char_unicode.rs @@ -8,9 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. - -#![feature(unicode)] - +#![feature(unicode_version)] /// Tests access to the internal Unicode Version type and value. pub fn main() {