Skip to content

Commit

Permalink
remove XID and Pattern_White_Space unicode tables from libcore
Browse files Browse the repository at this point in the history
They are only used by rustc_lexer, and are not needed elsewhere.

So we move the relevant definitions into rustc_lexer (while the actual
unicode data comes from the unicode-xid crate) and make the rest of
the compiler use it.
  • Loading branch information
matklad committed Sep 4, 2019
1 parent b9de4ef commit a0c186c
Show file tree
Hide file tree
Showing 16 changed files with 69 additions and 464 deletions.
17 changes: 13 additions & 4 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,7 @@ dependencies = [
name = "fmt_macros"
version = "0.0.0"
dependencies = [
"rustc_lexer",
"syntax_pos",
]

Expand Down Expand Up @@ -2372,7 +2373,7 @@ version = "0.4.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759"
dependencies = [
"unicode-xid",
"unicode-xid 0.1.0",
]

[[package]]
Expand Down Expand Up @@ -3290,7 +3291,7 @@ dependencies = [
name = "rustc_lexer"
version = "0.1.0"
dependencies = [
"unicode-xid",
"unicode-xid 0.2.0",
]

[[package]]
Expand Down Expand Up @@ -3368,6 +3369,7 @@ dependencies = [
"rustc_apfloat",
"rustc_data_structures",
"rustc_errors",
"rustc_lexer",
"rustc_target",
"serialize",
"smallvec",
Expand Down Expand Up @@ -3976,7 +3978,7 @@ checksum = "641e117d55514d6d918490e47102f7e08d096fdde360247e4a10f7a91a8478d3"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
"unicode-xid 0.1.0",
]

[[package]]
Expand All @@ -3988,7 +3990,7 @@ dependencies = [
"proc-macro2",
"quote",
"syn",
"unicode-xid",
"unicode-xid 0.1.0",
]

[[package]]
Expand Down Expand Up @@ -4017,6 +4019,7 @@ dependencies = [
"log",
"rustc_data_structures",
"rustc_errors",
"rustc_lexer",
"rustc_target",
"smallvec",
"syntax",
Expand Down Expand Up @@ -4532,6 +4535,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"

[[package]]
name = "unicode-xid"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"

[[package]]
name = "unicode_categories"
version = "0.1.1"
Expand Down
23 changes: 0 additions & 23 deletions src/libcore/char/methods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -547,29 +547,6 @@ impl char {
}
}

/// Returns `true` if this `char` satisfies the `XID_Start` Unicode property, and false
/// otherwise.
///
/// `XID_Start` is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to `ID_Start` but modified for closure under `NFKx`.
#[unstable(feature = "unicode_internals", issue = "0")]
pub fn is_xid_start(self) -> bool {
derived_property::XID_Start(self)
}

/// Returns `true` if this `char` satisfies the `XID_Continue` Unicode property, and false
/// otherwise.
///
/// `XID_Continue` is a Unicode Derived Property specified in
/// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
/// mostly similar to `ID_Continue` but modified for closure under NFKx.
#[unstable(feature = "unicode_internals", issue = "0")]
#[inline]
pub fn is_xid_continue(self) -> bool {
derived_property::XID_Continue(self)
}

/// Returns `true` if this `char` is lowercase.
///
/// 'Lowercase' is defined according to the terms of the Unicode Derived Core
Expand Down
5 changes: 0 additions & 5 deletions src/libcore/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,3 @@ pub mod derived_property {
pub mod conversions {
pub use crate::unicode::tables::conversions::{to_lower, to_upper};
}

// For use in libsyntax
pub mod property {
pub use crate::unicode::tables::property::Pattern_White_Space;
}
375 changes: 0 additions & 375 deletions src/libcore/unicode/tables.rs

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions src/libcore/unicode/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ def generate_property_module(mod, grouped_categories, category_subset):

yield "pub(crate) mod %s {\n" % mod
for cat in sorted(category_subset):
if cat in ("Cc", "White_Space", "Pattern_White_Space"):
if cat in ("Cc", "White_Space"):
generator = generate_small_bool_trie("%s_table" % cat, grouped_categories[cat])
else:
generator = generate_bool_trie("%s_table" % cat, grouped_categories[cat])
Expand Down Expand Up @@ -841,19 +841,18 @@ def main():
unicode_data = load_unicode_data(get_path(UnicodeFiles.UNICODE_DATA))
load_special_casing(get_path(UnicodeFiles.SPECIAL_CASING), unicode_data)

want_derived = {"XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase",
want_derived = {"Alphabetic", "Lowercase", "Uppercase",
"Cased", "Case_Ignorable", "Grapheme_Extend"}
derived = load_properties(get_path(UnicodeFiles.DERIVED_CORE_PROPERTIES), want_derived)

props = load_properties(get_path(UnicodeFiles.PROPS),
{"White_Space", "Join_Control", "Noncharacter_Code_Point",
"Pattern_White_Space"})
{"White_Space", "Join_Control", "Noncharacter_Code_Point"})

# Category tables
for (name, categories, category_subset) in (
("general_category", unicode_data.general_categories, ["N", "Cc"]),
("derived_property", derived, want_derived),
("property", props, ["White_Space", "Pattern_White_Space"])
("property", props, ["White_Space"])
):
for fragment in generate_property_module(name, categories, category_subset):
buf.write(fragment)
Expand Down
2 changes: 1 addition & 1 deletion src/libfmt_macros/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ path = "lib.rs"

[dependencies]
syntax_pos = { path = "../libsyntax_pos" }

rustc_lexer = { path = "../librustc_lexer" }
10 changes: 5 additions & 5 deletions src/libfmt_macros/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use std::string;
use std::iter;

use syntax_pos::{InnerSpan, Symbol};
use rustc_lexer::character_properties::{is_id_start, is_id_continue};

#[derive(Copy, Clone)]
struct InnerOffset(usize);
Expand Down Expand Up @@ -597,12 +598,11 @@ impl<'a> Parser<'a> {
}
}

/// Parses a word starting at the current position. A word is considered to
/// be an alphabetic character followed by any number of alphanumeric
/// characters.
/// Parses a word starting at the current position. A word is the same as
/// Rust identifier, except that it can't start with `_` character.
fn word(&mut self) -> &'a str {
let start = match self.cur.peek() {
Some(&(pos, c)) if c.is_xid_start() => {
Some(&(pos, c)) if c != '_' && is_id_start(c) => {
self.cur.next();
pos
}
Expand All @@ -611,7 +611,7 @@ impl<'a> Parser<'a> {
}
};
while let Some(&(pos, c)) = self.cur.peek() {
if c.is_xid_continue() {
if is_id_continue(c) {
self.cur.next();
} else {
return &self.input[start..pos];
Expand Down
8 changes: 4 additions & 4 deletions src/librustc_lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ name = "rustc_lexer"
version = "0.1.0"
edition = "2018"

# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = { version = "0.1.0", optional = true }

# Note: do not remove this blank `[lib]` section.
# This will be used when publishing this crate as `rustc-ap-rustc_lexer`.
[lib]
doctest = false
name = "rustc_lexer"

# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = "0.2.0"
63 changes: 30 additions & 33 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
// We want to be able to build this crate with a stable compiler, so feature
// flags should be optional.
#![cfg_attr(not(feature = "unicode-xid"), feature(unicode_internals))]
// We want to be able to build this crate with a stable compiler, so no
// `#![feature]` attributes should be added.

mod cursor;
pub mod unescape;
Expand Down Expand Up @@ -507,54 +506,52 @@ impl Cursor<'_> {
}

pub mod character_properties {
// this is Pattern_White_Space
#[cfg(feature = "unicode-xid")]
// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
// classes.

// This is Pattern_White_Space.
//
// Note that this set is stable (ie, it doesn't change with different
// Unicode versions), so it's ok to just hard-code the values.
pub fn is_whitespace(c: char) -> bool {
match c {
'\u{0009}' | '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{0020}'
| '\u{0085}' | '\u{200E}' | '\u{200F}' | '\u{2028}' | '\u{2029}' => true,
// Usual ASCII suspects
| '\u{0009}' // \t
| '\u{000A}' // \n
| '\u{000B}' // vertical tab
| '\u{000C}' // form feed
| '\u{000D}' // \r
| '\u{0020}' // space

// NEXT LINE from latin1
| '\u{0085}'

// Bidi markers
| '\u{200E}' // LEFT-TO-RIGHT MARK
| '\u{200F}' // RIGHT-TO-LEFT MARK

// Dedicated whitespace characters from Unicode
| '\u{2028}' // LINE SEPARATOR
| '\u{2029}' // PARAGRAPH SEPARATOR
=> true,
_ => false,
}
}

#[cfg(not(feature = "unicode-xid"))]
pub fn is_whitespace(c: char) -> bool {
core::unicode::property::Pattern_White_Space(c)
}

// this is XID_Start OR '_' (which formally is not a XID_Start)
#[cfg(feature = "unicode-xid")]
// This is XID_Start OR '_' (which formally is not a XID_Start).
pub fn is_id_start(c: char) -> bool {
('a' <= c && c <= 'z')
|| ('A' <= c && c <= 'Z')
|| c == '_'
|| (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
}

#[cfg(not(feature = "unicode-xid"))]
pub fn is_id_start(c: char) -> bool {
('a' <= c && c <= 'z')
|| ('A' <= c && c <= 'Z')
|| c == '_'
|| (c > '\x7f' && c.is_xid_start())
}

// this is XID_Continue
#[cfg(feature = "unicode-xid")]
// This is XID_Continue.
pub fn is_id_continue(c: char) -> bool {
('a' <= c && c <= 'z')
|| ('A' <= c && c <= 'Z')
|| ('0' <= c && c <= '9')
|| c == '_'
|| (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
}

#[cfg(not(feature = "unicode-xid"))]
pub fn is_id_continue(c: char) -> bool {
('a' <= c && c <= 'z')
|| ('A' <= c && c <= 'Z')
|| ('0' <= c && c <= '9')
|| c == '_'
|| (c > '\x7f' && c.is_xid_continue())
}
}
1 change: 1 addition & 0 deletions src/librustc_mir/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ rustc = { path = "../librustc" }
rustc_target = { path = "../librustc_target" }
rustc_data_structures = { path = "../librustc_data_structures" }
rustc_errors = { path = "../librustc_errors" }
rustc_lexer = { path = "../librustc_lexer" }
rustc_serialize = { path = "../libserialize", package = "serialize" }
syntax = { path = "../libsyntax" }
syntax_pos = { path = "../libsyntax_pos" }
Expand Down
5 changes: 2 additions & 3 deletions src/librustc_mir/borrow_check/move_errors.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use core::unicode::property::Pattern_White_Space;

use rustc::mir::*;
use rustc::ty;
use rustc_errors::{DiagnosticBuilder,Applicability};
use rustc_lexer::character_properties::is_whitespace;
use syntax_pos::Span;

use crate::borrow_check::MirBorrowckCtxt;
Expand Down Expand Up @@ -526,7 +525,7 @@ impl<'a, 'tcx> MirBorrowckCtxt<'a, 'tcx> {
let suggestion;
let to_remove;
if pat_snippet.starts_with("mut")
&& pat_snippet["mut".len()..].starts_with(Pattern_White_Space)
&& pat_snippet["mut".len()..].starts_with(is_whitespace)
{
suggestion = pat_snippet["mut".len()..].trim_start();
to_remove = "&mut";
Expand Down
4 changes: 2 additions & 2 deletions src/librustc_mir/borrow_check/mutability_errors.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use core::unicode::property::Pattern_White_Space;
use rustc_lexer::character_properties::is_whitespace;
use rustc::hir;
use rustc::hir::Node;
use rustc::mir::{self, BindingForm, ClearCrossCrate, Local, Location, Body};
Expand Down Expand Up @@ -715,7 +715,7 @@ fn annotate_struct_field(
fn suggest_ref_mut(tcx: TyCtxt<'_>, binding_span: Span) -> Option<String> {
let hi_src = tcx.sess.source_map().span_to_snippet(binding_span).ok()?;
if hi_src.starts_with("ref")
&& hi_src["ref".len()..].starts_with(Pattern_White_Space)
&& hi_src["ref".len()..].starts_with(is_whitespace)
{
let replacement = format!("ref mut{}", &hi_src["ref".len()..]);
Some(replacement)
Expand Down
1 change: 1 addition & 0 deletions src/librustdoc/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ extern crate rustc_interface;
extern crate rustc_metadata;
extern crate rustc_target;
extern crate rustc_typeck;
extern crate rustc_lexer;
extern crate serialize;
extern crate syntax;
extern crate syntax_pos;
Expand Down
5 changes: 3 additions & 2 deletions src/librustdoc/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use rustc::hir;
use rustc::hir::intravisit;
use rustc::session::{self, config, DiagnosticOutput};
use rustc::util::common::ErrorReported;
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
use syntax::ast;
use syntax::with_globals;
use syntax::source_map::SourceMap;
Expand Down Expand Up @@ -763,8 +764,8 @@ impl Tester for Collector {
// We use these headings as test names, so it's good if
// they're valid identifiers.
let name = name.chars().enumerate().map(|(i, c)| {
if (i == 0 && c.is_xid_start()) ||
(i != 0 && c.is_xid_continue()) {
if (i == 0 && is_id_start(c)) ||
(i != 0 && is_id_continue(c)) {
c
} else {
'_'
Expand Down
4 changes: 2 additions & 2 deletions src/libsyntax/ext/proc_macro_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::tokenstream::{self, DelimSpan, IsJoint::*, TokenStream, TreeAndJoint}

use errors::{Diagnostic, DiagnosticBuilder};
use rustc_data_structures::sync::Lrc;
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
use syntax_pos::{BytePos, FileName, MultiSpan, Pos, SourceFile, Span};
use syntax_pos::symbol::{kw, sym, Symbol};

Expand Down Expand Up @@ -322,8 +323,7 @@ impl Ident {
fn is_valid(string: &str) -> bool {
let mut chars = string.chars();
if let Some(start) = chars.next() {
(start == '_' || start.is_xid_start())
&& chars.all(|cont| cont == '_' || cont.is_xid_continue())
is_id_start(start) && chars.all(is_id_continue)
} else {
false
}
Expand Down
1 change: 1 addition & 0 deletions src/libsyntax_ext/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ rustc_target = { path = "../librustc_target" }
smallvec = { version = "0.6.7", features = ["union", "may_dangle"] }
syntax = { path = "../libsyntax" }
syntax_pos = { path = "../libsyntax_pos" }
rustc_lexer = { path = "../librustc_lexer" }

0 comments on commit a0c186c

Please sign in to comment.