Skip to content

Commit

Permalink
flatten rustc_lexer::character_properties module
Browse files Browse the repository at this point in the history
On the call site, `rustc_lexer::is_whitespace` reads much better than
`character_properties::is_whitespace`.
  • Loading branch information
matklad committed Sep 4, 2019
1 parent a0c186c commit 206fe8e
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 86 deletions.
5 changes: 2 additions & 3 deletions src/libfmt_macros/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ use std::string;
use std::iter;

use syntax_pos::{InnerSpan, Symbol};
use rustc_lexer::character_properties::{is_id_start, is_id_continue};

#[derive(Copy, Clone)]
struct InnerOffset(usize);
Expand Down Expand Up @@ -602,7 +601,7 @@ impl<'a> Parser<'a> {
/// Rust identifier, except that it can't start with `_` character.
fn word(&mut self) -> &'a str {
let start = match self.cur.peek() {
Some(&(pos, c)) if c != '_' && is_id_start(c) => {
Some(&(pos, c)) if c != '_' && rustc_lexer::is_id_start(c) => {
self.cur.next();
pos
}
Expand All @@ -611,7 +610,7 @@ impl<'a> Parser<'a> {
}
};
while let Some(&(pos, c)) = self.cur.peek() {
if is_id_continue(c) {
if rustc_lexer::is_id_continue(c) {
self.cur.next();
} else {
return &self.input[start..pos];
Expand Down
135 changes: 70 additions & 65 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,62 @@ pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
})
}

// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
// classes.

/// Returns `true` when `c` counts as whitespace in Rust source text.
///
/// The accepted characters are the Unicode `Pattern_White_Space` set. That
/// set is stable (i.e., it does not change between Unicode versions), so the
/// code points are simply hard-coded below.
pub fn is_whitespace(c: char) -> bool {
    match c {
        // ASCII controls: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}'
        // ASCII space
        | '\u{0020}'
        // NEXT LINE from latin1
        | '\u{0085}'
        // Bidi markers: LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK
        | '\u{200E}'..='\u{200F}'
        // Dedicated Unicode whitespace: LINE SEPARATOR and PARAGRAPH SEPARATOR
        | '\u{2028}'..='\u{2029}' => true,
        _ => false,
    }
}

/// Returns `true` when `c` may appear as the first character of an identifier.
///
/// This is `XID_Start` plus `_` (which formally is not a `XID_Start`).
pub fn is_id_start(c: char) -> bool {
    match c {
        // Fast path for ASCII identifiers: decided without calling into `unicode_xid`.
        'a'..='z' | 'A'..='Z' | '_' => true,
        // Every other ASCII character is rejected; only characters above
        // '\x7f' are checked against `XID_Start`.
        _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c),
    }
}

/// Returns `true` when `c` may appear in an identifier after its first character.
///
/// This is exactly `XID_Continue`.
pub fn is_id_continue(c: char) -> bool {
    match c {
        // Fast path for ASCII identifiers: decided without calling into `unicode_xid`.
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
        // Every other ASCII character is rejected; only characters above
        // '\x7f' are checked against `XID_Continue`.
        _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c),
    }
}


impl Cursor<'_> {
fn advance_token(&mut self) -> Token {
let first_char = self.bump().unwrap();
Expand All @@ -111,9 +167,9 @@ impl Cursor<'_> {
'*' => self.block_comment(),
_ => Slash,
},
c if character_properties::is_whitespace(c) => self.whitespace(),
c if is_whitespace(c) => self.whitespace(),
'r' => match (self.nth_char(0), self.nth_char(1)) {
('#', c1) if character_properties::is_id_start(c1) => self.raw_ident(),
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
let suffix_start = self.len_consumed();
Expand Down Expand Up @@ -158,7 +214,7 @@ impl Cursor<'_> {
}
_ => self.ident(),
},
c if character_properties::is_id_start(c) => self.ident(),
c if is_id_start(c) => self.ident(),
c @ '0'..='9' => {
let literal_kind = self.number(c);
let suffix_start = self.len_consumed();
Expand Down Expand Up @@ -246,8 +302,8 @@ impl Cursor<'_> {
}

fn whitespace(&mut self) -> TokenKind {
debug_assert!(character_properties::is_whitespace(self.prev()));
while character_properties::is_whitespace(self.nth_char(0)) {
debug_assert!(is_whitespace(self.prev()));
while is_whitespace(self.nth_char(0)) {
self.bump();
}
Whitespace
Expand All @@ -257,19 +313,19 @@ impl Cursor<'_> {
debug_assert!(
self.prev() == 'r'
&& self.nth_char(0) == '#'
&& character_properties::is_id_start(self.nth_char(1))
&& is_id_start(self.nth_char(1))
);
self.bump();
self.bump();
while character_properties::is_id_continue(self.nth_char(0)) {
while is_id_continue(self.nth_char(0)) {
self.bump();
}
RawIdent
}

fn ident(&mut self) -> TokenKind {
debug_assert!(character_properties::is_id_start(self.prev()));
while character_properties::is_id_continue(self.nth_char(0)) {
debug_assert!(is_id_start(self.prev()));
while is_id_continue(self.nth_char(0)) {
self.bump();
}
Ident
Expand Down Expand Up @@ -314,7 +370,7 @@ impl Cursor<'_> {
// integer literal followed by field/method access or a range pattern
// (`0..2` and `12.foo()`)
'.' if self.nth_char(1) != '.'
&& !character_properties::is_id_start(self.nth_char(1)) =>
&& !is_id_start(self.nth_char(1)) =>
{
// might have stuff after the ., and if it does, it needs to start
// with a number
Expand Down Expand Up @@ -344,15 +400,15 @@ impl Cursor<'_> {
fn lifetime_or_char(&mut self) -> TokenKind {
debug_assert!(self.prev() == '\'');
let mut starts_with_number = false;
if (character_properties::is_id_start(self.nth_char(0))
if (is_id_start(self.nth_char(0))
|| self.nth_char(0).is_digit(10) && {
starts_with_number = true;
true
})
&& self.nth_char(1) != '\''
{
self.bump();
while character_properties::is_id_continue(self.nth_char(0)) {
while is_id_continue(self.nth_char(0)) {
self.bump();
}

Expand Down Expand Up @@ -494,64 +550,13 @@ impl Cursor<'_> {
}

fn eat_literal_suffix(&mut self) {
if !character_properties::is_id_start(self.nth_char(0)) {
if !is_id_start(self.nth_char(0)) {
return;
}
self.bump();

while character_properties::is_id_continue(self.nth_char(0)) {
while is_id_continue(self.nth_char(0)) {
self.bump();
}
}
}

pub mod character_properties {
    // The character classes below are defined in
    // [UAX #31](http://unicode.org/reports/tr31).

    /// Returns `true` when `c` belongs to `Pattern_White_Space`.
    ///
    /// That set is stable (i.e., it does not change between Unicode
    /// versions), so the code points are hard-coded.
    pub fn is_whitespace(c: char) -> bool {
        match c {
            // ASCII controls: \t, \n, vertical tab, form feed, \r
            '\u{0009}'..='\u{000D}'
            // ASCII space
            | '\u{0020}'
            // NEXT LINE from latin1
            | '\u{0085}'
            // Bidi markers: LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK
            | '\u{200E}'..='\u{200F}'
            // Dedicated Unicode whitespace: LINE SEPARATOR and PARAGRAPH SEPARATOR
            | '\u{2028}'..='\u{2029}' => true,
            _ => false,
        }
    }

    /// Returns `true` when `c` is in `XID_Start` or is `_` (which formally
    /// is not a `XID_Start`).
    pub fn is_id_start(c: char) -> bool {
        match c {
            // ASCII letters and underscore are accepted directly.
            'a'..='z' | 'A'..='Z' | '_' => true,
            // Only characters above '\x7f' are checked against `XID_Start`.
            _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c),
        }
    }

    /// Returns `true` when `c` is in `XID_Continue`.
    pub fn is_id_continue(c: char) -> bool {
        match c {
            // ASCII letters, digits and underscore are accepted directly.
            'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => true,
            // Only characters above '\x7f' are checked against `XID_Continue`.
            _ => c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c),
        }
    }
}
3 changes: 1 addition & 2 deletions src/librustc_mir/borrow_check/move_errors.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use rustc::mir::*;
use rustc::ty;
use rustc_errors::{DiagnosticBuilder,Applicability};
use rustc_lexer::character_properties::is_whitespace;
use syntax_pos::Span;

use crate::borrow_check::MirBorrowckCtxt;
Expand Down Expand Up @@ -525,7 +524,7 @@ impl<'a, 'tcx> MirBorrowckCtxt<'a, 'tcx> {
let suggestion;
let to_remove;
if pat_snippet.starts_with("mut")
&& pat_snippet["mut".len()..].starts_with(is_whitespace)
&& pat_snippet["mut".len()..].starts_with(rustc_lexer::is_whitespace)
{
suggestion = pat_snippet["mut".len()..].trim_start();
to_remove = "&mut";
Expand Down
3 changes: 1 addition & 2 deletions src/librustc_mir/borrow_check/mutability_errors.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use rustc_lexer::character_properties::is_whitespace;
use rustc::hir;
use rustc::hir::Node;
use rustc::mir::{self, BindingForm, ClearCrossCrate, Local, Location, Body};
Expand Down Expand Up @@ -715,7 +714,7 @@ fn annotate_struct_field(
fn suggest_ref_mut(tcx: TyCtxt<'_>, binding_span: Span) -> Option<String> {
let hi_src = tcx.sess.source_map().span_to_snippet(binding_span).ok()?;
if hi_src.starts_with("ref")
&& hi_src["ref".len()..].starts_with(is_whitespace)
&& hi_src["ref".len()..].starts_with(rustc_lexer::is_whitespace)
{
let replacement = format!("ref mut{}", &hi_src["ref".len()..]);
Some(replacement)
Expand Down
5 changes: 2 additions & 3 deletions src/librustdoc/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use rustc::hir;
use rustc::hir::intravisit;
use rustc::session::{self, config, DiagnosticOutput};
use rustc::util::common::ErrorReported;
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
use syntax::ast;
use syntax::with_globals;
use syntax::source_map::SourceMap;
Expand Down Expand Up @@ -764,8 +763,8 @@ impl Tester for Collector {
// We use these headings as test names, so it's good if
// they're valid identifiers.
let name = name.chars().enumerate().map(|(i, c)| {
if (i == 0 && is_id_start(c)) ||
(i != 0 && is_id_continue(c)) {
if (i == 0 && rustc_lexer::is_id_start(c)) ||
(i != 0 && rustc_lexer::is_id_continue(c)) {
c
} else {
'_'
Expand Down
3 changes: 1 addition & 2 deletions src/libsyntax/ext/proc_macro_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use crate::tokenstream::{self, DelimSpan, IsJoint::*, TokenStream, TreeAndJoint}

use errors::{Diagnostic, DiagnosticBuilder};
use rustc_data_structures::sync::Lrc;
use rustc_lexer::character_properties::{is_id_start, is_id_continue};
use syntax_pos::{BytePos, FileName, MultiSpan, Pos, SourceFile, Span};
use syntax_pos::symbol::{kw, sym, Symbol};

Expand Down Expand Up @@ -323,7 +322,7 @@ impl Ident {
fn is_valid(string: &str) -> bool {
let mut chars = string.chars();
if let Some(start) = chars.next() {
is_id_start(start) && chars.all(is_id_continue)
rustc_lexer::is_id_start(start) && chars.all(rustc_lexer::is_id_continue)
} else {
false
}
Expand Down
14 changes: 5 additions & 9 deletions src/libsyntax/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
(None, None) => return true,
(None, _) => return false,
(Some(&a), None) => {
if is_pattern_whitespace(a) {
if rustc_lexer::is_whitespace(a) {
break // trailing whitespace check is out of loop for borrowck
} else {
return false
Expand All @@ -72,11 +72,11 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
(Some(&a), Some(&b)) => (a, b)
};

if is_pattern_whitespace(a) && is_pattern_whitespace(b) {
if rustc_lexer::is_whitespace(a) && rustc_lexer::is_whitespace(b) {
// skip whitespace for a and b
scan_for_non_ws_or_end(&mut a_iter);
scan_for_non_ws_or_end(&mut b_iter);
} else if is_pattern_whitespace(a) {
} else if rustc_lexer::is_whitespace(a) {
// skip whitespace for a
scan_for_non_ws_or_end(&mut a_iter);
} else if a == b {
Expand All @@ -88,20 +88,16 @@ crate fn matches_codepattern(a : &str, b : &str) -> bool {
}

// check if a has *only* trailing whitespace
a_iter.all(is_pattern_whitespace)
a_iter.all(rustc_lexer::is_whitespace)
}

/// Advances the given peekable `Iterator` until it reaches a non-whitespace character
fn scan_for_non_ws_or_end<I: Iterator<Item = char>>(iter: &mut Peekable<I>) {
while iter.peek().copied().map(|c| is_pattern_whitespace(c)) == Some(true) {
while iter.peek().copied().map(|c| rustc_lexer::is_whitespace(c)) == Some(true) {
iter.next();
}
}

/// Reports whether `c` is whitespace by forwarding to
/// `rustc_lexer::character_properties::is_whitespace`.
///
/// Gives the helpers in this file a short local name for that check.
fn is_pattern_whitespace(c: char) -> bool {
rustc_lexer::character_properties::is_whitespace(c)
}

/// Identify a position in the text by the Nth occurrence of a string.
struct Position {
string: &'static str,
Expand Down

0 comments on commit 206fe8e

Please sign in to comment.