Skip to content

Commit

Permalink
parser: remove Regexes from whitespace parser (#1008)
Browse files Browse the repository at this point in the history
removing Regexes from whitespace parser allows ditching of thread local storage + lazy initialization cost

This shows a modest 2% improvement in overall parse time (inflate is improved by 10%)
  • Loading branch information
zsol authored Sep 9, 2023
1 parent 377a292 commit 94dd20e
Showing 1 changed file with 106 additions and 34 deletions.
140 changes: 106 additions & 34 deletions native/libcst/src/tokenizer/whitespace_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,13 @@ use crate::nodes::{
Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace,
SimpleWhitespace, TrailingWhitespace,
};
use memchr::memchr2_iter;
use regex::Regex;
use memchr::{memchr2, memchr2_iter};
use thiserror::Error;

use crate::Token;

use super::TokType;

thread_local! {
static SIMPLE_WHITESPACE_RE: Regex = Regex::new(r"\A([ \f\t]|\\(\r\n?|\n))*").expect("regex");
static NEWLINE_RE: Regex = Regex::new(r"\A(\r\n?|\n)").expect("regex");
static COMMENT_RE: Regex = Regex::new(r"\A#[^\r\n]*").expect("regex");
}

#[allow(clippy::upper_case_acronyms, clippy::enum_variant_names)]
#[derive(Error, Debug, PartialEq, Eq)]
pub enum WhitespaceError {
Expand Down Expand Up @@ -231,29 +224,34 @@ pub fn parse_empty_lines<'a>(

pub fn parse_comment<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Comment<'a>>> {
let newline_after = config.get_line_after_column(state.line, state.column_byte)?;
if let Some(comment_match) = COMMENT_RE.with(|r| r.find(newline_after)) {
let comment_str = comment_match.as_str();
advance_this_line(
config,
state,
comment_str.chars().count(),
comment_str.len(),
)?;
return Ok(Some(Comment(comment_str)));
if newline_after.as_bytes().first() != Some(&b'#') {
return Ok(None);
}
Ok(None)
let comment_str = if let Some(idx) = memchr2(b'\n', b'\r', newline_after.as_bytes()) {
&newline_after[..idx]
} else {
newline_after
};
advance_this_line(
config,
state,
comment_str.chars().count(),
comment_str.len(),
)?;
Ok(Some(Comment(comment_str)))
}

pub fn parse_newline<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Newline<'a>>> {
let newline_after = config.get_line_after_column(state.line, state.column_byte)?;
if let Some(newline_match) = NEWLINE_RE.with(|r| r.find(newline_after)) {
let newline_str = newline_match.as_str();
advance_this_line(
config,
state,
newline_str.chars().count(),
newline_str.len(),
)?;
let len = match newline_after.as_bytes() {
[b'\n', ..] => 1,
[b'\r', b'\n', ..] => 2,
[b'\r', ..] => 1,
_ => 0,
};
if len > 0 {
let newline_str = &newline_after[..len];
advance_this_line(config, state, len, len)?;
if state.column_byte != config.get_line(state.line)?.len() {
return Err(WhitespaceError::InternalError(format!(
"Found newline at ({}, {}) but it's not EOL",
Expand Down Expand Up @@ -376,13 +374,18 @@ pub fn parse_simple_whitespace<'a>(
state: &mut State,
) -> Result<SimpleWhitespace<'a>> {
let capture_ws = |line, col| -> Result<&'a str> {
let x = config.get_line_after_column(line, col);
let x = x?;
Ok(SIMPLE_WHITESPACE_RE.with(|r| {
r.find(x)
.expect("SIMPLE_WHITESPACE_RE supports 0-length matches, so it must always match")
.as_str()
}))
let line = config.get_line_after_column(line, col)?;
let bytes = line.as_bytes();
let mut idx = 0;
while idx < bytes.len() {
match bytes[idx..] {
[b' ' | b'\t' | b'\x0c', ..] => idx += 1,
[b'\\', b'\r', b'\n', ..] => idx += 3,
[b'\\', b'\r' | b'\n', ..] => idx += 2,
_ => break,
}
}
Ok(&line[..idx])
};
let start_offset = state.byte_offset;
let mut prev_line: &str;
Expand Down Expand Up @@ -436,7 +439,9 @@ pub fn parse_parenthesized_whitespace<'a>(

#[cfg(test)]
mod tests {
use crate::{tokenize, Config, Result};
use crate::{tokenize, Comment, Config, Result, SimpleWhitespace};

use super::{parse_comment, parse_simple_whitespace};

#[test]
fn config_mixed_newlines() -> Result<'static, ()> {
Expand All @@ -452,4 +457,71 @@ mod tests {

Ok(())
}

fn _parse_simple_whitespace(src: &str) -> Result<SimpleWhitespace> {
let tokens = tokenize(src)?;
let config = Config::new(src, &tokens);
let mut state = Default::default();
Ok(parse_simple_whitespace(&config, &mut state)?)
}

#[test]
fn simple_whitespace_line_continuations() -> Result<'static, ()> {
assert_eq!(
_parse_simple_whitespace(" \\\n # foo")?,
SimpleWhitespace(" \\\n ")
);

assert_eq!(
_parse_simple_whitespace(" \\\r # foo")?,
SimpleWhitespace(" \\\r ")
);
assert_eq!(
_parse_simple_whitespace(" \\\r\n # foo")?,
SimpleWhitespace(" \\\r\n ")
);

assert_eq!(
_parse_simple_whitespace(" \\\r\n\\\n # foo")?,
SimpleWhitespace(" \\\r\n\\\n ")
);

Ok(())
}

#[test]
fn simple_whitespace_mixed() -> Result<'static, ()> {
assert_eq!(
_parse_simple_whitespace(" \t\x0clol")?,
SimpleWhitespace(" \t\x0c"),
);

Ok(())
}

fn _parse_comment(src: &str) -> Result<Option<Comment>> {
let tokens = tokenize(src)?;
let config = Config::new(src, &tokens);
let mut state = Default::default();
Ok(parse_comment(&config, &mut state)?)
}

#[test]
fn single_comment() -> Result<'static, ()> {
assert_eq!(_parse_comment("# foo\n# bar")?, Some(Comment("# foo")));
Ok(())
}

#[test]
fn comment_until_eof() -> Result<'static, ()> {
assert_eq!(_parse_comment("#")?, Some(Comment("#")));
Ok(())
}

#[test]
fn no_comment() -> Result<'static, ()> {
assert_eq!(_parse_comment("foo")?, None);
assert_eq!(_parse_comment("\n")?, None);
Ok(())
}
}

0 comments on commit 94dd20e

Please sign in to comment.