Skip to content

Commit

Permalink
Regex Engine: Subexpression call: build multiple copies if needed
Browse files Browse the repository at this point in the history
Do not recurse into subexpressions at the same level or below.
  • Loading branch information
alimpfard committed Mar 27, 2020
1 parent 399d1a6 commit 80d1a30
Show file tree
Hide file tree
Showing 12 changed files with 33 additions and 4 deletions.
11 changes: 11 additions & 0 deletions src/lexer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,7 @@ std::optional<Regexp> NLexer::_regexp() {
advance(-1);
// reset capture indices
nested_index = 0;
inside_index = 0;
while (branch_reset_indices.size())
branch_reset_indices.pop();

Expand Down Expand Up @@ -1228,6 +1229,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
RegexpType::SubExprCall, (char)backrefnum,
regexp_debug_info(this, "\\g", 2)};
reg.subexprcall = backrefnum;
reg.inside_subexpr = inside_index;
return reg;
}
lexer_error(*this, Errors::InvalidRegexpSyntax, error_token(),
Expand Down Expand Up @@ -1463,6 +1465,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
// parenthesised expression
if (c == '(') {
nested_index++;
inside_index++;
std::optional<int> reset_branch{};
int branch = 0;
if (!branch_reset_indices.empty()) {
Expand Down Expand Up @@ -1492,6 +1495,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
}
}
advance(1); // consume ')'
inside_index--;
if (seen_newline) {
const Token &mtoken = error_token();
lexer_error(*this, Errors::InvalidRegexpSyntax, mtoken,
Expand All @@ -1511,6 +1515,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
// what _this_ index is
branch_reset_indices.push(nested_index);
auto reg = regexp();
inside_index--;
if (reset_branch.has_value()) {
nested_index = *reset_branch;
branch_reset_indices.push(branch);
Expand Down Expand Up @@ -1540,6 +1545,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
nested_index = *reset_branch;
branch_reset_indices.push(branch);
}
inside_index--;
return {};
}
if (c == '<' || c == '=') {
Expand All @@ -1550,6 +1556,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
nested_index = *reset_branch;
branch_reset_indices.push(branch);
}
inside_index--;
return {};
}
if (c == '>') {
Expand All @@ -1560,6 +1567,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
nested_index = *reset_branch;
branch_reset_indices.push(branch);
}
inside_index--;
return {};
}
if (c == ':') {
Expand All @@ -1572,6 +1580,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
if (!reg.has_value())
return reg;
auto &rv = reg.value();
inside_index--;
c = *source_p;
if (c != ')') {
const Token &mtoken = error_token();
Expand All @@ -1589,6 +1598,7 @@ std::optional<Regexp> NLexer::regexp_expression() {
}
auto my_index = nested_index;
auto reg = regexp();
inside_index--;
if (reset_branch.has_value()) {
nested_index = *reset_branch;
branch_reset_indices.push(branch);
Expand Down Expand Up @@ -2068,6 +2078,7 @@ Regexp::compile(std::multimap<const Regexp *, NFANode<std::string> *> &cache,
NFANode<std::string> *tl = new NFANode<std::string>{"R<>" + mangle()};
parent->epsilon_transition_to(tl);
tl->subexpr_call = subexprcall;
tl->inside_subexpr = inside_subexpr;
tl->named_rule = namef;
result = tl;
result->debug_info = debug_info;
Expand Down
1 change: 1 addition & 0 deletions src/lexer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ class NLexer {
int offset;
char buffer[1024000];
int nested_index = 0;
int inside_index = 0;
/// If there's anything on this, reset index to it
/// when matching alternatives
std::stack<int> branch_reset_indices{};
Expand Down
3 changes: 2 additions & 1 deletion src/nfa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ template <typename StateInfoT> class DFANode {
std::vector<RegexpAssertion> assertions = {};
std::set<int> subexpr_idxs = {};
std::set<int> subexpr_end_idxs = {};
int inside_subexpr = -1;
std::optional<int> backreference{};
int subexpr_call = -1;
bool subexpr_recurses = false;
Expand Down Expand Up @@ -128,6 +129,7 @@ template <typename StateInfoT> class NFANode {
subexpr_end = false, reference_node = false;
int max_opt_steps = 50;
int opt_step = max_opt_steps;
int inside_subexpr = -1;

std::optional<std::string> inline_code =
{}; // code that would be executed should this node match
Expand All @@ -150,7 +152,6 @@ template <typename StateInfoT> class NFANode {
int subexpr_idx = -1;
int subexpr_end_idx = -1;
int subexpr_call = -1;
bool subexpr_recurses = false;

/// Constructs a node whose state_info is initialised from `s`.
NFANode(StateInfoT s) : state_info(s) {}
/// Constructs a node without supplying state info; state_info is left
/// default-initialised.
NFANode() {}
Expand Down
8 changes: 7 additions & 1 deletion src/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ inline static void parser_error_impl(char const *fmt, va_list arg) {
std::vprintf(fmt, arg);
}

// Index of the subexpression whose function is currently being generated.
// DFANLVMCodeGenerator<T>::generate() saves the previous value, sets this to
// the subexpr_idx it is about to emit, and restores the saved value once that
// subexpression is done. It is compared against node->subexpr_call when
// deciding whether a subexpression call should be emitted.
// NOTE(review): the initial 0 presumably means "not inside any
// subexpression" -- confirm against how subexpression indices are numbered.
static int sexpr_being_built = 0;

char *parser_errors[(int)ParserErrors::LAST - 10] = {
[(int)ParserErrors::InvalidToken - 11] = "Invalid token",
[(int)ParserErrors::FeatureUnsupported - 11] = "Unsupported feature",
Expand Down Expand Up @@ -1298,6 +1300,7 @@ template <typename T> DFANode<std::set<NFANode<T> *>> *NFANode<T>::to_dfa() {
}
dfanode->subexpr_recurses =
dfanode->subexpr_call <= dfanode->subexpr_end_idxs.size();
dfanode->inside_subexpr = s->inside_subexpr;
dfanode->subexpr_call = s->subexpr_call;
}
if (s->backreference.has_value()) {
Expand Down Expand Up @@ -1539,6 +1542,8 @@ void DFANLVMCodeGenerator<T>::generate(
for (auto subexpr_idx : node->subexpr_idxs) {
if (!subexprFunc.count(subexpr_idx))
continue;
int sbb = sexpr_being_built;
sexpr_being_built = subexpr_idx;
decltype(visited) _visited;
typename std::remove_reference<decltype(blk)>::type _blocks;
auto scope = subexprFunc[subexpr_idx];
Expand Down Expand Up @@ -1585,6 +1590,7 @@ void DFANLVMCodeGenerator<T>::generate(
dbuilder.CreateCondBr(matched, mroot, builder.module.BBfinalise);

builder.module.exit_main();
sexpr_being_built = sbb;
}
}
builder.issubexp = wasub;
Expand Down Expand Up @@ -1913,7 +1919,7 @@ void DFANLVMCodeGenerator<T>::generate(
}
// if there is a subexpr call, create it now
if (node->subexpr_call > -1 &&
(node->subexpr_recurses || node->subexpr_call > subexprFunc.size())) {
(node->subexpr_recurses || sexpr_being_built < node->subexpr_call)) {
llvm::Function *fn;
auto val = builder.module.current_main()->arg_begin();
if (subexprFunc.count(node->subexpr_call))
Expand Down
1 change: 1 addition & 0 deletions src/regexp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class Regexp {
bool plus = false, star = false, lazy = false, store = false;
int index = 0; // applies for nested and backref (escape)
int subexprcall = -1; // applies for SubExprCall
int inside_subexpr = -1; // applies for SubExprCall

std::optional<RepeatQuantifier> repeat;

Expand Down
1 change: 1 addition & 0 deletions tests/inputs/0012-subexpr-expr.input
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
testhelloooohellotest
1 change: 1 addition & 0 deletions tests/list-tests
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
0009-pos
0010-pl
0011-subexpr
0012-subexpr-expr
2 changes: 1 addition & 1 deletion tests/outputs/0008-literal-match.stdout
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
res at 0x7ffe8df3e678, s at 0x7ff892c2e010
res at 0x7ffc26eda248, s at 0x7fc1109a1010
processing - 'HELP😒
'
match {'HELP' - (null) - 4 literal} is a stopword
Expand Down
2 changes: 1 addition & 1 deletion tests/outputs/0011-subexpr.stdout
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
res at 0x7ffdeebc6688, s at 0x7fcaae9ea010
res at 0x7ffd5f1a3468, s at 0x7ff8bcfe3010
processing - 'testtest'
match {'testtest' - (null) - 8 expr} is not a stopword
no match {'' - (null) - 0 expr} is not a stopword
Empty file.
5 changes: 5 additions & 0 deletions tests/outputs/0012-subexpr-expr.stdout
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
res at 0x7ffc99366f18, s at 0x7fc02f781010
processing - 'testhelloooohellotest'
match {'testhelloooo' - (null) - 12 expr} is not a stopword
match {'hellotest' - (null) - 9 expr} is not a stopword
no match {'' - (null) - 0 expr} is not a stopword
2 changes: 2 additions & 0 deletions tests/sources/0012-subexpr-expr.nlex
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
expr :: (test|hello+)\g1

0 comments on commit 80d1a30

Please sign in to comment.