diff --git a/.clang-format b/.clang-format index bd35115950..9c0b3a8142 100644 --- a/.clang-format +++ b/.clang-format @@ -77,6 +77,8 @@ IncludeCategories: Priority: 3 - Regex: '^<(vtrasterizer)/' Priority: 4 + - Regex: '^<(regex_dfa)/' + Priority: 5 - Regex: '^<(text_shaper)/' Priority: 5 - Regex: '^<(crispy)/' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7b9da0c303..42dfbe2923 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -215,6 +215,7 @@ jobs: cmake -DCMAKE_BUILD_TYPE=Debug -DLIBTERMINAL_BUILD_BENCH_HEADLESS=ON -DCONTOUR_QT_VERSION=6 -S . -B build cmake --build build/ -j2 ./build/src/crispy/crispy_test + ./build/src/regex_dfa/regex_dfa_test ./build/src/vtparser/vtparser_test ./build/src/vtbackend/vtbackend_test rm -rf _deps build @@ -257,6 +258,8 @@ jobs: run: cmake --build build/ - name: "test: crispy" run: ./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser" run: ./build/src/vtparser/vtparser_test - name: "test: vtbackend" @@ -326,6 +329,8 @@ jobs: run: cmake --build build/ --config Release - name: "test: crispy" run: .\build\src\crispy\Release\crispy_test.exe + - name: "test: regex_dfa" + run: .\build\src\regex_dfa\Release\regex_dfa_test.exe - name: "test: vtparser" run: .\build\src\vtparser\Release\vtparser_test.exe - name: "test: vtbackend" @@ -450,6 +455,8 @@ jobs: run: cmake --build build/ -- -j3 - name: "test: crispy" run: ./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser" run: ./build/src/vtparser/vtparser_test - name: "test: vtbackend" @@ -461,6 +468,7 @@ jobs: name: contour-ubuntu2204-tests path: | build/src/crispy/crispy_test + build/src/regex_dfa/regex_dfa_test build/src/vtparser/vtparser_test build/src/vtbackend/vtbackend_test build/src/vtbackend/bench-headless @@ -516,6 +524,8 @@ jobs: # run: cmake --build build/ -- -j3 # - name: "test: crispy" # run: ./build/src/crispy/crispy_test + # - name: "test: regex_dfa" + # run: ./build/src/regex_dfa/regex_dfa_test # - name: "test: vtparser" # run: ./build/src/vtparser/vtparser_test # - name: "test: vtbackend" @@ -664,6 +674,8 @@ jobs: run: cmake --build build/ -- -j3 - name: "test: crispy" run: ./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser" run: ./build/src/vtparser/vtparser_test - name: "test: vtbackend" @@ -728,6 +740,8 @@ jobs: valgrind - name: "test: crispy (via valgrind)" run: valgrind --error-exitcode=64 ./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: valgrind --error-exitcode=64 ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser (via valgrind)" run: valgrind --error-exitcode=64 ./build/src/vtparser/vtparser_test - name: "test: vtbackend (via valgrind)" diff --git a/cmake/ContourThirdParties.cmake b/cmake/ContourThirdParties.cmake index 75f9334f09..ecad1d2e08 100644 --- a/cmake/ContourThirdParties.cmake +++ b/cmake/ContourThirdParties.cmake @@ -130,6 +130,9 @@ endif() ContourThirdParties_Embed_boxed_cpp() set(THIRDPARTY_BUILDIN_boxed_cpp "embedded") +ContourThirdParties_Embed_ctre() +set(THIRDPARTY_BUILDIN_ctre "embedded") + macro(ContourThirdPartiesSummary2) message(STATUS "==============================================================================") message(STATUS " Contour ThirdParties") @@ -144,5 +147,6 @@ macro(ContourThirdPartiesSummary2) message(STATUS "libunicode 
${THIRDPARTY_BUILTIN_unicode_core} (${LIBUNICODE_LIBS})") message(STATUS "yaml-cpp ${THIRDPARTY_BUILTIN_yaml_cpp}") message(STATUS "boxed-cpp ${THIRDPARTY_BUILDIN_boxed_cpp}") + message(STATUS "CTRE ${THIRDPARTY_BUILDIN_ctre}") message(STATUS "------------------------------------------------------------------------------") endmacro() diff --git a/scripts/install-deps.sh b/scripts/install-deps.sh index 011372964b..787df91f2c 100755 --- a/scripts/install-deps.sh +++ b/scripts/install-deps.sh @@ -108,6 +108,16 @@ fetch_and_unpack_termbenchpro() termbench_pro } +fetch_and_unpack_ctre() +{ + local ctre_git_sha="0fdd96db416188a07833606b16633fb977c0cc11" + fetch_and_unpack \ + compile-time-regular-expressions-$ctre_git_sha \ + ctre-$ctre_git_sha.tar.gz \ + https://github.com/hanickadot/compile-time-regular-expressions/archive/$ctre_git_sha.tar.gz \ + ctre +} + fetch_and_unpack_boxed() { local boxed_cpp_git_sha="daa702e22e71f3da3eef838e4946b6c3df1f16b1" @@ -573,6 +583,7 @@ main() fetch_and_unpack_boxed fetch_and_unpack_termbenchpro + fetch_and_unpack_ctre } main $* diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 89c65121bc..f945b3715c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,7 @@ endif() include(PedanticCompiler) add_subdirectory(crispy) +add_subdirectory(regex_dfa) add_subdirectory(text_shaper) add_subdirectory(vtpty) add_subdirectory(vtparser) diff --git a/src/contour/Config.h b/src/contour/Config.h index 5c9fc948f6..619f82528a 100644 --- a/src/contour/Config.h +++ b/src/contour/Config.h @@ -180,6 +180,8 @@ struct TerminalProfile bool highlightDoubleClickedWord = true; terminal::StatusDisplayType initialStatusDisplayType = terminal::StatusDisplayType::None; + std::string urlPattern = R"((https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])"; + terminal::Opacity backgroundOpacity; // value between 0 (fully transparent) and 0xFF (fully visible). bool backgroundBlur; // On Windows 10, this will enable Acrylic Backdrop. diff --git a/src/contour/TerminalSession.cpp b/src/contour/TerminalSession.cpp index f7905c623c..a4e5df9581 100644 --- a/src/contour/TerminalSession.cpp +++ b/src/contour/TerminalSession.cpp @@ -122,6 +122,7 @@ namespace settings.primaryScreen.allowReflowOnResize = config.reflowOnResize; settings.highlightDoubleClickedWord = profile.highlightDoubleClickedWord; settings.highlightTimeout = profile.highlightTimeout; + settings.urlPattern = profile.urlPattern; return settings; } diff --git a/src/regex_dfa/Alphabet.cpp b/src/regex_dfa/Alphabet.cpp new file mode 100644 index 0000000000..8dccced1e1 --- /dev/null +++ b/src/regex_dfa/Alphabet.cpp @@ -0,0 +1,56 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) 
\
+        do \
+        { \
+        } while (0)
+#endif
+
+void Alphabet::insert(Symbol ch)
+{
+    if (_alphabet.find(ch) == _alphabet.end())
+    {
+        DEBUG("Alphabet: insert '{:}'", prettySymbol(ch));
+        _alphabet.insert(ch);
+    }
+}
+
+string Alphabet::to_string() const
+{
+    stringstream sstr;
+
+    sstr << '{';
+
+    for (Symbol c: _alphabet)
+        sstr << prettySymbol(c);
+
+    sstr << '}';
+
+    return sstr.str();
+}
+
+} // namespace regex_dfa
diff --git a/src/regex_dfa/Alphabet.h b/src/regex_dfa/Alphabet.h
new file mode 100644
index 0000000000..ec6d37cd1a
--- /dev/null
+++ b/src/regex_dfa/Alphabet.h
@@ -0,0 +1,60 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2018 Christian Parpart <christian@parpart.family>
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of
+// the License at: http://opensource.org/licenses/MIT
+#pragma once
+
+#include <regex_dfa/Symbols.h>
+
+#include <fmt/format.h>
+
+#include <set>
+#include <string>
+
+namespace regex_dfa
+{
+
+/**
+ * Represents the alphabet of a finite automaton or regular expression.
+ */
+class Alphabet
+{
+  public:
+    using set_type = std::set<Symbol>;
+    using iterator = set_type::iterator;
+
+    [[nodiscard]] size_t size() const noexcept { return _alphabet.size(); }
+
+    void insert(Symbol ch);
+
+    [[nodiscard]] std::string to_string() const;
+
+    [[nodiscard]] iterator begin() const { return _alphabet.begin(); }
+    [[nodiscard]] iterator end() const { return _alphabet.end(); }
+
+  private:
+    set_type _alphabet;
+};
+
+} // namespace regex_dfa
+
+namespace fmt
+{
+template <>
+struct formatter<regex_dfa::Alphabet>
+{
+    template <typename ParseContext>
+    constexpr auto parse(ParseContext& ctx)
+    {
+        return ctx.begin();
+    }
+
+    template <typename FormatContext>
+    constexpr auto format(const regex_dfa::Alphabet& v, FormatContext& ctx)
+    {
+        return fmt::format_to(ctx.out(), "{}", v.to_string());
+    }
+};
+} // namespace fmt
diff --git a/src/regex_dfa/CMakeLists.txt b/src/regex_dfa/CMakeLists.txt
new file mode 100644
index 0000000000..a415336887
--- /dev/null
+++ b/src/regex_dfa/CMakeLists.txt
@@ -0,0 +1,43 @@
+add_library(regex_dfa STATIC
+    Alphabet.cpp
+    Compiler.cpp
+    DFA.cpp
+    DFABuilder.cpp
+    DFAMinimizer.cpp
+    DotWriter.cpp
+    MultiDFA.cpp
+    NFA.cpp
+    NFABuilder.cpp
+    RegExpr.cpp
+    RegExprParser.cpp
+    RuleParser.cpp
+    State.cpp
+    Symbols.cpp
+    Report.cpp
+    SourceLocation.cpp
+)
+
+target_include_directories(regex_dfa PUBLIC ${PROJECT_SOURCE_DIR}/src ${CMAKE_SOURCE_DIR}/src)
+target_link_libraries(regex_dfa PUBLIC fmt::fmt-header-only)
+
+# ----------------------------------------------------------------------------
+option(REGEX_DFA_TESTING "Enables building of unittests for regex_dfa library [default: ON]" ON)
+if(REGEX_DFA_TESTING)
+    enable_testing()
+    add_executable(regex_dfa_test
+        regex_dfa_test.cpp
+        DFABuilder_test.cpp
+        DotWriter_test.cpp
+        Lexer_test.cpp
+        NFA_test.cpp
+        RegExprParser_test.cpp
+        RuleParser_test.cpp
+        State_test.cpp
+        Symbols_test.cpp
+        util/iterator_test.cpp
+    )
+
+    target_link_libraries(regex_dfa_test PUBLIC regex_dfa)
+    target_link_libraries(regex_dfa_test PUBLIC Catch2::Catch2)
+    target_link_libraries(regex_dfa_test PUBLIC fmt::fmt-header-only)
+endif(REGEX_DFA_TESTING)
diff --git a/src/regex_dfa/CharStream.h b/src/regex_dfa/CharStream.h
new file mode 100644
index 0000000000..d0d0e2d96f
--- /dev/null
+++ b/src/regex_dfa/CharStream.h
@@ -0,0 +1,67 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2018 Christian Parpart <christian@parpart.family>
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of
+// the License at: http://opensource.org/licenses/MIT
+#pragma once
+
+#include <fstream>
+#include <istream>
+#include <string>
+
+namespace regex_dfa
+{
+
+class CharStream
+{
+  public:
+    virtual ~CharStream() = default;
+
+    [[nodiscard]] virtual bool isEof() const noexcept = 0;
+    virtual char get() = 0;
+    virtual void rollback(int count) = 0;
+    virtual void rewind() = 0;
+};
+
+class StringStream: public CharStream
+{
+  public:
+    explicit StringStream(std::string&& s): _source { std::move(s) } {}
+
+    [[nodiscard]] bool isEof() const noexcept override { return _pos >= _source.size(); }
+    char get() override { return _source[_pos++]; }
+    void rollback(int count) override { _pos -= count; }
+    void rewind() override { _pos = 0; }
+
+  private:
+    std::string _source;
+    size_t _pos = 0;
+};
+
+class StandardStream: public CharStream
+{
+  public:
+    explicit StandardStream(std::istream* source);
+
+    [[nodiscard]] bool isEof() const noexcept override { return !_source->good(); }
+    char get() override { return static_cast<char>(_source->get()); }
+
+    void rollback(int count) override
+    {
+        _source->clear();
+        _source->seekg(-count, std::ios::cur);
+    }
+
+    void rewind() override
+    {
+        _source->clear();
+        _source->seekg(_initialOffset, std::ios::beg);
+    }
+
+  private:
+    std::istream* _source;
+    std::streamoff _initialOffset;
+};
+
+} // namespace regex_dfa
diff --git a/src/regex_dfa/Compiler.cpp b/src/regex_dfa/Compiler.cpp
new file mode 100644
index 0000000000..676ef26ac0
--- /dev/null
+++ b/src/regex_dfa/Compiler.cpp
@@ -0,0 +1,189 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2018 Christian Parpart <christian@parpart.family>
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of
+// the License at: http://opensource.org/licenses/MIT
+
+#include <regex_dfa/Compiler.h>
+#include <regex_dfa/DFA.h>
+#include <regex_dfa/DFABuilder.h>
+#include <regex_dfa/DFAMinimizer.h>
+#include <regex_dfa/LexerDef.h>
+#include <regex_dfa/MultiDFA.h>
+#include <regex_dfa/NFA.h>
+#include <regex_dfa/NFABuilder.h>
+#include <regex_dfa/RegExpr.h>
+#include <regex_dfa/RegExprParser.h>
+#include <regex_dfa/Rule.h>
+#include <regex_dfa/RuleParser.h>
+
+#include <iostream>
+
+using namespace std;
+
+namespace regex_dfa
+{
+
+void Compiler::parse(string text)
+{
+    parse(make_unique<StringStream>(std::move(text)));
+}
+
+void Compiler::parse(unique_ptr<CharStream> stream)
+{
+    declareAll(RuleParser { std::move(stream) }.parseRules());
+}
+
+void Compiler::declareAll(RuleList rules)
+{
+    rules_.reserve(rules_.size() + rules.size());
+
+    // populate RegExpr
+    for (Rule& rule: rules)
+        rule.regexpr = make_unique<RegExpr>(RegExprParser {}.parse(rule.pattern, rule.line, rule.column));
+
+    containsBeginOfLine_ = any_of(rules.begin(), rules.end(), ruleContainsBeginOfLine);
+
+    if (containsBeginOfLine_)
+    {
+        // We have at least one BOL-rule.
+        for (Rule& rule: rules)
+        {
+            if (!regex_dfa::containsBeginOfLine(*rule.regexpr))
+            {
+                NFA nfa = NFABuilder {}.construct(*rule.regexpr, rule.tag);
+                for (const string& condition: rule.conditions)
+                {
+                    NFA& fa = fa_[condition];
+                    if (fa.empty())
+                        fa = nfa.clone();
+                    else
+                        fa.alternate(nfa.clone());
+                }
+                declare(rule);
+            }
+            declare(rule, "_0"); // BOL
+        }
+    }
+    else
+    {
+        // No BOL-rules present, just declare them then.
+        for (Rule& rule: rules)
+            declare(rule);
+    }
+
+    for (Rule& rule: rules)
+    {
+        if (auto i = names_.find(rule.tag); i != names_.end() && i->first != rule.tag)
+            // Can actually only happen on "ignore" attributed rule count > 1.
+            names_[rule.tag] = fmt::format("{}, {}", i->second, rule.name);
+        else
+            names_[rule.tag] = rule.name;
+
+        rules_.emplace_back(std::move(rule));
+    }
+}
+
+size_t Compiler::size() const
+{
+    size_t result = 0;
+    for (const pair<const string, NFA>& fa: fa_)
+        result += fa.second.size();
+    return result;
+}
+
+void Compiler::declare(const Rule& rule, const string& conditionSuffix)
+{
+    NFA nfa = NFABuilder {}.construct(*rule.regexpr, rule.tag);
+
+    for (const string& condition: rule.conditions)
+    {
+        NFA& fa = fa_[condition + conditionSuffix];
+
+        if (fa.empty())
+            fa = nfa.clone();
+        else
+            fa.alternate(nfa.clone());
+    }
+}
+
+// const map<string, NFA>& Compiler::automata() const {
+//     return fa_;
+// }
+
+MultiDFA Compiler::compileMultiDFA(OvershadowMap* overshadows)
+{
+    map<string, DFA> dfaMap;
+    for (const auto& fa: fa_)
+        dfaMap[fa.first] = DFABuilder { fa.second.clone() }.construct(overshadows);
+
+    return constructMultiDFA(std::move(dfaMap));
+}
+
+DFA Compiler::compileDFA(OvershadowMap* overshadows)
+{
+    assert((!containsBeginOfLine_ && fa_.size() == 1) || (containsBeginOfLine_ && fa_.size() == 2));
+    return DFABuilder { fa_.begin()->second.clone() }.construct(overshadows);
+}
+
+DFA Compiler::compileMinimalDFA()
+{
+    return DFAMinimizer { compileDFA() }.constructDFA();
+}
+
+LexerDef Compiler::compile()
+{
+    return generateTables(compileMinimalDFA(), containsBeginOfLine_, std::move(names_));
+}
+
+LexerDef Compiler::compileMulti(OvershadowMap* overshadows)
+{
+    MultiDFA multiDFA = compileMultiDFA(overshadows);
+    multiDFA = DFAMinimizer { multiDFA }.constructMultiDFA();
+    return generateTables(multiDFA, containsBeginOfLine_, names());
+}
+
+LexerDef Compiler::generateTables(const DFA& dfa, bool requiresBeginOfLine, map<Tag, string> names)
+{
+    const Alphabet alphabet = dfa.alphabet();
+    TransitionMap transitionMap;
+
+    for (StateId state = 0, sE = dfa.lastState(); state <= sE; ++state)
+        for (Symbol c: alphabet)
+            if (optional<StateId> nextState = dfa.delta(state, c); nextState.has_value())
+                transitionMap.define(state, c, nextState.value());
+
+    map<StateId, Tag> acceptStates;
+    for (StateId s: dfa.acceptStates())
+        acceptStates.emplace(s, *dfa.acceptTag(s));
+
+    // TODO: many initial states !
+    return LexerDef { { { "INITIAL", dfa.initialState() } },
+                      requiresBeginOfLine,
+                      std::move(transitionMap),
+                      std::move(acceptStates),
+                      dfa.backtracking(),
+                      std::move(names) };
+}
+
+LexerDef Compiler::generateTables(const MultiDFA& multiDFA, bool requiresBeginOfLine, map<Tag, string> names)
+{
+    const Alphabet alphabet = multiDFA.dfa.alphabet();
+    TransitionMap transitionMap;
+
+    for (StateId state = 0, sE = multiDFA.dfa.lastState(); state <= sE; ++state)
+        for (const Symbol c: alphabet)
+            if (optional<StateId> nextState = multiDFA.dfa.delta(state, c); nextState.has_value())
+                transitionMap.define(state, c, nextState.value());
+
+    map<StateId, Tag> acceptStates;
+    for (StateId s: multiDFA.dfa.acceptStates())
+        acceptStates.emplace(s, *multiDFA.dfa.acceptTag(s));
+
+    // TODO: many initial states !
+    return LexerDef { multiDFA.initialStates, requiresBeginOfLine, std::move(transitionMap),
+                      std::move(acceptStates), multiDFA.dfa.backtracking(), std::move(names) };
+}
+
+} // namespace regex_dfa
diff --git a/src/regex_dfa/Compiler.h b/src/regex_dfa/Compiler.h
new file mode 100644
index 0000000000..9e8f1846d2
--- /dev/null
+++ b/src/regex_dfa/Compiler.h
@@ -0,0 +1,104 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2018 Christian Parpart <christian@parpart.family>
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of
+// the License at: http://opensource.org/licenses/MIT
+#pragma once
+
+#include <regex_dfa/CharStream.h>
+#include <regex_dfa/DFA.h>
+#include <regex_dfa/DFABuilder.h>
+#include <regex_dfa/LexerDef.h>
+#include <regex_dfa/Rule.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace regex_dfa
+{
+
+struct MultiDFA;
+
+/**
+ * Top-Level API for compiling lexical patterns into table definitions for Lexer.
+ *
+ * @see Lexer
+ */
+class Compiler
+{
+  public:
+    using TagNameMap = std::map<Tag, std::string>;
+    using OvershadowMap = DFABuilder::OvershadowMap;
+    using AutomataMap = std::map<std::string, NFA>;
+
+    Compiler(): rules_ {}, containsBeginOfLine_ { false }, fa_ {}, names_ {} {}
+
+    /**
+     * Parses a @p stream of textual rule definitions to construct their internal data structures.
+     */
+    void parse(std::unique_ptr<CharStream> stream);
+    void parse(std::string text);
+
+    /**
+     * Parses a list of @p rules to construct their internal data structures.
+     */
+    void declareAll(RuleList rules);
+
+    [[nodiscard]] const RuleList& rules() const noexcept { return rules_; }
+    [[nodiscard]] const TagNameMap& names() const noexcept { return names_; }
+    [[nodiscard]] size_t size() const;
+
+    /**
+     * Compiles all previously parsed rules into a DFA.
+     */
+    DFA compileDFA(OvershadowMap* overshadows = nullptr);
+    MultiDFA compileMultiDFA(OvershadowMap* overshadows = nullptr);
+
+    /**
+     * Compiles all previously parsed rules into a minimal DFA.
+     */
+    DFA compileMinimalDFA();
+
+    /**
+     * Compiles all previously parsed rules into a suitable data structure for Lexer.
+     *
+     * @see Lexer
+     */
+    LexerDef compile();
+
+    /**
+     * Compiles all previously parsed rules into a suitable data structure for Lexer, taking care of
+     * multiple conditions as well as begin-of-line.
+     */
+    LexerDef compileMulti(OvershadowMap* overshadows = nullptr);
+
+    /**
+     * Translates the given DFA @p dfa with a given TagNameMap @p names into trivial table mappings.
+     *
+     * @see Lexer
+     */
+    static LexerDef generateTables(const DFA& dfa, bool requiresBeginOfLine, TagNameMap names);
+    static LexerDef generateTables(const MultiDFA& dfa, bool requiresBeginOfLine, TagNameMap names);
+
+    [[nodiscard]] const std::map<std::string, NFA>& automata() const { return fa_; }
+
+    [[nodiscard]] bool containsBeginOfLine() const noexcept { return containsBeginOfLine_; }
+
+  private:
+    /**
+     * Parses a single @p rule to construct its internal data structures.
+     */
+    void declare(const Rule& rule, const std::string& conditionSuffix = "");
+
+  private:
+    RuleList rules_;
+    bool containsBeginOfLine_;
+    AutomataMap fa_;
+    TagNameMap names_;
+};
+
+} // namespace regex_dfa
diff --git a/src/regex_dfa/DFA.cpp b/src/regex_dfa/DFA.cpp
new file mode 100644
index 0000000000..f0df7a8eac
--- /dev/null
+++ b/src/regex_dfa/DFA.cpp
@@ -0,0 +1,158 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2018 Christian Parpart <christian@parpart.family>
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License.
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include +#include +#include +#include +#include + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +using namespace std; + +namespace regex_dfa +{ + +Alphabet DFA::alphabet() const +{ + Alphabet alphabet; + for (const State& state: states_) + for (pair const t: state.transitions) + alphabet.insert(t.first); + + return alphabet; +} + +vector DFA::acceptStates() const +{ + vector states; + states.reserve(acceptTags_.size()); + for_each(begin(acceptTags_), end(acceptTags_), [&](const pair& s) { + states.push_back(s.first); + }); + return states; +} + +// -------------------------------------------------------------------------- + +void DFA::createStates(size_t count) +{ + states_.resize(states_.size() + count); +} + +void DFA::setInitialState(StateId s) +{ + // TODO: assert (s is having no predecessors) + initialState_ = s; +} + +void DFA::setTransition(StateId from, Symbol symbol, StateId to) +{ + // if (auto i = states_[from].transitions.find(symbol); i != states_[from].transitions.end()) + // fmt::print("overwriting transition! {} --({})--> {} (new: {})\n", from, prettySymbol(symbol), + // i->second, to); + + // XXX assert(s.transitions.find(symbol) == s.transitions.end()); + states_[from].transitions[symbol] = to; +} + +void DFA::removeTransition(StateId from, Symbol symbol) +{ + State& s = states_[from]; + if (auto i = s.transitions.find(symbol); i != s.transitions.end()) + s.transitions.erase(i); +} + +StateId DFA::append(DFA&& other, StateId q0) +{ + assert(other.initialState() == 0); + + other.prepareStateIds(states_.size(), q0); + + states_.reserve(size() + other.size() - 1); + states_[q0] = other.states_[0]; + states_.insert(states_.end(), next(other.states_.begin()), other.states_.end()); + backtrackStates_.insert(other.backtrackStates_.begin(), other.backtrackStates_.end()); + acceptTags_.insert(other.acceptTags_.begin(), other.acceptTags_.end()); + + return other.initialState(); +} + +void DFA::prepareStateIds(StateId baseId, StateId q0) +{ + // adjust transition state IDs + // traverse through each state's transition set + // traverse through each transition in the transition set + // traverse through each element and add BASE_ID + + auto transformId = [baseId, q0, this](StateId s) -> StateId { + // we subtract 1, because we already have a slot for q0 elsewhere (pre-allocated) + return s != initialState_ ? 
baseId + s - 1 : q0; + }; + + // for each state's transitions + for (State& state: states_) + for (pair& t: state.transitions) + t.second = transformId(t.second); + + AcceptMap remapped; + for (auto& a: acceptTags_) + remapped[transformId(a.first)] = a.second; + acceptTags_ = std::move(remapped); + + BacktrackingMap backtracking; + for (const auto& bt: backtrackStates_) + backtracking[transformId(bt.first)] = transformId(bt.second); + backtrackStates_ = std::move(backtracking); + + initialState_ = q0; +} + +void DFA::visit(DotVisitor& v) const +{ + v.start(initialState_); + + // STATE: initial + v.visitNode(initialState_, true, isAccepting(initialState_)); + + // STATE: accepting + for (StateId s: acceptStates()) + if (s != initialState_) + v.visitNode(s, false, true); + + // STATE: any other + for (StateId s = 0, sE = lastState(); s != sE; ++s) + if (s != initialState_ && !isAccepting(s)) + v.visitNode(s, false, false); + + // TRANSITIONS + for (StateId s = 0, sE = size(); s != sE; ++s) + { + const TransitionMap& T = states_[s].transitions; + for_each(T.begin(), T.end(), [&](const auto& t) { v.visitEdge(s, t.second, t.first); }); + for_each(T.begin(), T.end(), [&](const auto& t) { v.endVisitEdge(s, t.second); }); + } + v.end(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFA.h b/src/regex_dfa/DFA.h new file mode 100644 index 0000000000..a2d4881fab --- /dev/null +++ b/src/regex_dfa/DFA.h @@ -0,0 +1,170 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace regex_dfa +{ + +class NFA; +class DFABuilder; +class DotVisitor; + +/** + * Represents a deterministic finite automaton. + */ +class DFA +{ + public: + using TransitionMap = std::map; + struct State + { + // std::vector states; + TransitionMap transitions; + }; + using StateVec = std::vector; + + //! defines a mapping between accept state ID and another (prior) ID to track roll back the input stream + //! to. + using BacktrackingMap = std::map; + + DFA(const DFA& other) = delete; + DFA& operator=(const DFA& other) = delete; + DFA(DFA&&) = default; + DFA& operator=(DFA&&) = default; + ~DFA() = default; + + DFA(): states_ {}, initialState_ { 0 }, backtrackStates_ {}, acceptTags_ {} {} + + [[nodiscard]] bool empty() const noexcept { return states_.empty(); } + [[nodiscard]] size_t size() const noexcept { return states_.size(); } + + [[nodiscard]] StateId lastState() const noexcept + { + assert(!empty()); + return states_.size() - 1; + } + + //! Retrieves the alphabet of this finite automaton. + [[nodiscard]] Alphabet alphabet() const; + + //! Retrieves the initial state. + [[nodiscard]] StateId initialState() const { return initialState_; } + + //! Retrieves the list of available states. + [[nodiscard]] const StateVec& states() const { return states_; } + [[nodiscard]] StateVec& states() { return states_; } + + [[nodiscard]] StateIdVec stateIds() const + { + StateIdVec v; + v.reserve(states_.size()); + for (size_t i = 0, e = states_.size(); i != e; ++i) + v.push_back(i); // funny, I know + return v; + } + + //! Retrieves the list of accepting states. 
+ [[nodiscard]] std::vector acceptStates() const; + + /** + * Traverses all states and edges in this NFA and calls @p visitor for each state & edge. + * + * Use this function to e.g. get a GraphViz dot-file drawn. + */ + void visit(DotVisitor& visitor) const; + + void createStates(size_t count); + + void setInitialState(StateId state); + + [[nodiscard]] const TransitionMap& stateTransitions(StateId id) const + { + return states_[static_cast(id)].transitions; + } + + // {{{ backtracking (for lookahead) + void setBacktrack(StateId from, StateId to) { backtrackStates_[from] = to; } + + [[nodiscard]] std::optional backtrack(StateId acceptState) const + { + if (auto i = backtrackStates_.find(acceptState); i != backtrackStates_.end()) + return i->second; + + return std::nullopt; + } + + [[nodiscard]] const BacktrackingMap& backtracking() const noexcept { return backtrackStates_; } + // }}} + + //! Flags given state as accepting-state with given Tag @p acceptTag. + void setAccept(StateId state, Tag acceptTag) { acceptTags_[state] = acceptTag; } + + [[nodiscard]] bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } + + [[nodiscard]] std::optional acceptTag(StateId s) const + { + if (auto i = acceptTags_.find(s); i != acceptTags_.end()) + return i->second; + + return std::nullopt; + } + + [[nodiscard]] std::optional delta(StateId state, Symbol symbol) const + { + const auto& T = states_[state].transitions; + if (auto i = T.find(symbol); i != T.end()) + return i->second; + + return std::nullopt; + } + + void setTransition(StateId from, Symbol symbol, StateId to); + void removeTransition(StateId from, Symbol symbol); + + [[nodiscard]] StateIdVec nonAcceptStates() const + { + StateIdVec result; + result.reserve( + std::abs(static_cast(states_.size()) - static_cast(acceptTags_.size()))); + + for (StateId s = 0, sE = size(); s != sE; ++s) + if (!isAccepting(s)) + result.push_back(s); + + return result; + } + + [[nodiscard]] bool isAcceptor(Tag t) const + { + for (std::pair p: acceptTags_) + if (p.second == t) + return true; + + return false; + } + + StateId append(DFA&& other, StateId q0); + + private: + void prepareStateIds(StateId baseId, StateId q0); + + private: + StateVec states_; + StateId initialState_; + BacktrackingMap backtrackStates_; + AcceptMap acceptTags_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFABuilder.cpp b/src/regex_dfa/DFABuilder.cpp new file mode 100644 index 0000000000..aa8fd393ff --- /dev/null +++ b/src/regex_dfa/DFABuilder.cpp @@ -0,0 +1,220 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) 
\ + do \ + { \ + } while (0) +#endif + +struct DFABuilder::TransitionTable +{ // {{{ + void insert(StateId q, Symbol c, StateId t); + unordered_map> transitions; +}; + +inline void DFABuilder::TransitionTable::insert(StateId q, Symbol c, StateId t) +{ + transitions[q][c] = t; +} +// }}} + +/* DFA construction visualization + REGEX: a(b|c)* + + NFA: n0 --(a)--> n1 --> n2 -----------------------------------> "n7" + \ ^ + \---> n3 <------------------------ / + \ \ \ / + \ \----> n4 --(b)--> n5 --> n6 + \ ^ + \----> n8 --(c)--> n9 ---/ + + DFA: + <--- + d0 --(a)--> "d1" ----(b)--> "d2"--(b) + \ |^ + \ (c)||(b) + \ v| + \--(c)--> "d3"--(c) + <--- + + + TABLE: + + set | DFA | NFA | + name | state | state | 'a' | 'b' | 'c' + -------------------------------------------------------------------------------------------------------- + q0 | d0 | {n0} | {n1,n2,n3,n4,n7,n8} | -none- | -none- + q1 | d1 | {n1,n2,n3,n4,n7,n8} | -none- | {n3,n4,n5,n6,n7,n8} | {n3,n4,n6,n7,n8,n9} + q2 | d2 | {n3,n4,n5,n6,n7,n8} | -none- | q2 | q3 + q3 | d3 | {n3,n4,n6,n7,n8,n9} | -none- | q2 | q3 +*/ + +DFA DFABuilder::construct(OvershadowMap* overshadows) +{ + const StateIdVec q_0 = nfa_.epsilonClosure({ nfa_.initialStateId() }); + vector Q = { q_0 }; // resulting states + deque workList = { q_0 }; + TransitionTable T; + + const Alphabet alphabet = nfa_.alphabet(); + + StateIdVec eclosure; + StateIdVec delta; + while (!workList.empty()) + { + const StateIdVec q = + std::move(workList.front()); // each set q represents a valid configuration from the NFA + workList.pop_front(); + const StateId q_i = *configurationNumber(Q, q); + + for (Symbol c: alphabet) + { + nfa_.epsilonClosure(*nfa_.delta(q, c, &delta), &eclosure); + if (!eclosure.empty()) + { + if (optional t_i = configurationNumber(Q, eclosure); t_i.has_value()) + T.insert(q_i, c, *t_i); // T[q][c] = eclosure; + else + { + Q.emplace_back(eclosure); + t_i = StateId { Q.size() - 1 }; // equal to configurationNumber(Q, eclosure); + T.insert(q_i, c, *t_i); // T[q][c] = eclosure; + workList.emplace_back(std::move(eclosure)); + } + eclosure.clear(); + } + delta.clear(); + } + } + + // Q now contains all the valid configurations and T all transitions between them + return constructDFA(Q, T, overshadows); +} + +DFA DFABuilder::constructDFA(const vector& Q, + const TransitionTable& T, + OvershadowMap* overshadows) const +{ + DFA dfa; + dfa.createStates(Q.size()); + + // build remaps table (used as cache for quickly finding DFA StateIds from NFA StateIds) + unordered_map remaps; + for_each(begin(Q), end(Q), [q_i = StateId { 0 }, &remaps](StateIdVec const& q) mutable { + for_each(begin(q), end(q), [&](StateId s) { remaps[s] = q_i; }); + q_i++; + }); + + // map q_i to d_i and flag accepting states + map overshadowing; + StateId q_i = 0; + for (const StateIdVec& q: Q) + { + // d_i represents the corresponding state in the DFA for all states of q from the NFA + const StateId d_i = q_i; + // cerr << fmt::format("map q{} to d{} for {} states, {}.\n", q_i, d_i->id(), q.size(), + // to_string(q, "d")); + + // if q contains an accepting state, then d is an accepting state in the DFA + if (nfa_.isAnyAccepting(q)) + { + optional tag = determineTag(q, &overshadowing); + assert(tag.has_value() && "DFA accept state merged from input states with different tags."); + // DEBUG("determineTag: q{} tag {} from {}.", q_i, *tag, q); + dfa.setAccept(d_i, *tag); + } + + if (optional bt = nfa_.containsBacktrackState(q); bt.has_value()) + { + // TODO: verify: must not contain more than one backtracking 
mapping + assert(dfa.isAccepting(d_i)); + dfa.setBacktrack(d_i, remaps[*bt]); + } + + q_i++; + } + + // observe mapping from q_i to d_i + for (auto const& [q_i, branch]: T.transitions) + for (auto&& [c, t_i]: branch) + dfa.setTransition(q_i, c, t_i); + + // q_0 becomes d_0 (initial state) + dfa.setInitialState(0); + + if (overshadows) + { + // check if tag is an acceptor in NFA but not in DFA, hence, it was overshadowed by another rule + for (const pair a: nfa_.acceptMap()) + { + const Tag tag = a.second; + if (!dfa.isAcceptor(tag)) + if (auto i = overshadowing.find(tag); i != overshadowing.end()) + overshadows->emplace_back(tag, i->second); + } + } + + return dfa; +} + +optional DFABuilder::configurationNumber(const vector& Q, const StateIdVec& t) +{ + if (auto i = find(begin(Q), end(Q), t); i != end(Q)) + return distance(begin(Q), i); + else + return nullopt; +} + +optional DFABuilder::determineTag(const StateIdVec& qn, map* overshadows) const +{ + deque tags; + + for (StateId s: qn) + if (optional t = nfa_.acceptTag(s); t.has_value()) + tags.push_back(*t); + + if (tags.empty()) + return nullopt; + + sort(begin(tags), end(tags)); + + optional lowestTag = tags.front(); + tags.erase(begin(tags)); + + for (Tag tag: tags) + (*overshadows)[tag] = *lowestTag; // {tag} is overshadowed by {lowestTag} + + return lowestTag; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFABuilder.h b/src/regex_dfa/DFABuilder.h new file mode 100644 index 0000000000..0cbaf5adeb --- /dev/null +++ b/src/regex_dfa/DFABuilder.h @@ -0,0 +1,64 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include +#include +#include + +namespace regex_dfa +{ + +class DFA; +class State; + +class DFABuilder +{ + public: + //! Map of rules that shows which rule is overshadowed by which other rule. + using OvershadowMap = std::vector>; + + explicit DFABuilder(NFA&& nfa): nfa_ { std::move(nfa) } {} + + /** + * Constructs a DFA out of the NFA. + * + * @param overshadows if not nullptr, it will be used to store semantic information about + * which rule tags have been overshadowed by which. + */ + [[nodiscard]] DFA construct(OvershadowMap* overshadows = nullptr); + + private: + struct TransitionTable; + + [[nodiscard]] DFA constructDFA(const std::vector& Q, + const TransitionTable& T, + OvershadowMap* overshadows) const; + + /** + * Finds @p t in @p Q and returns its offset (aka configuration number) or -1 if not found. + */ + [[nodiscard]] static std::optional configurationNumber(const std::vector& Q, + const StateIdVec& t); + + /** + * Determines the tag to use for the deterministic state representing @p q from non-deterministic FA @p + * fa. 
+ * + * @param q the set of states that reflect a single state in the DFA equal to the input FA + * + * @returns the determined tag or std::nullopt if none + */ + [[nodiscard]] std::optional determineTag(const StateIdVec& q, std::map* overshadows) const; + + private: + const NFA nfa_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFABuilder_test.cpp b/src/regex_dfa/DFABuilder_test.cpp new file mode 100644 index 0000000000..86a9613e0d --- /dev/null +++ b/src/regex_dfa/DFABuilder_test.cpp @@ -0,0 +1,33 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include +#include + +#include + +#include +#include + +using namespace regex_dfa; + +TEST_CASE("regex_DFABuilder.shadowing") +{ + Compiler cc; + cc.parse(std::make_unique(R"( + Identifier ::= [a-z][a-z0-9]* + TrueLiteral ::= "true" + )")); + // rule 2 is overshadowed by rule 1 + Compiler::OvershadowMap overshadows; + DFA dfa = cc.compileDFA(&overshadows); + REQUIRE(1 == overshadows.size()); + CHECK(2 == overshadows[0].first); // overshadowee + CHECK(1 == overshadows[0].second); // overshadower +} diff --git a/src/regex_dfa/DFAMinimizer.cpp b/src/regex_dfa/DFAMinimizer.cpp new file mode 100644 index 0000000000..9c5a58e53d --- /dev/null +++ b/src/regex_dfa/DFAMinimizer.cpp @@ -0,0 +1,277 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +DFAMinimizer::DFAMinimizer(const DFA& dfa): + dfa_ { dfa }, + initialStates_ { { "INITIAL", dfa.initialState() } }, + alphabet_ { dfa_.alphabet() }, + targetStateIdMap_ {} +{ +} + +DFAMinimizer::DFAMinimizer(const MultiDFA& multiDFA): + dfa_ { multiDFA.dfa }, + initialStates_ { multiDFA.initialStates }, + alphabet_ { dfa_.alphabet() }, + targetStateIdMap_ {} +{ +} + +/** + * Tests whether or not StateId @p s is an initial state in any of the DFAs of the MultiDFA. + */ +bool DFAMinimizer::isMultiInitialState(StateId s) const +{ + return any_of(initialStates_.begin(), initialStates_.end(), [s](const auto& p) { return p.second == s; }); +} + +/** + * Tests whether any s in S is the initial state in the DFA that is to be minimized. 
+ */ +bool DFAMinimizer::containsInitialState(const StateIdVec& S) const +{ + return any_of(S.begin(), S.end(), [this](StateId s) { return s == dfa_.initialState(); }); +} + +DFAMinimizer::PartitionVec::iterator DFAMinimizer::findGroup(StateId s) +{ + return find_if(begin(T), end(T), [&](StateIdVec& group) { + return dfa_.acceptTag(group.front()) == dfa_.acceptTag(s); + }); +} + +int DFAMinimizer::partitionId(StateId s) const +{ + auto i = + find_if(P.begin(), P.end(), [s](const auto& p) { return find(p.begin(), p.end(), s) != p.end(); }); + assert(i != P.end() && "State ID must be present in any of the partition sets."); + return static_cast(distance(P.begin(), i)); +} + +DFAMinimizer::PartitionVec DFAMinimizer::split(const StateIdVec& S) const +{ + for (Symbol c: alphabet_) + { + // if c splits S into s_1 and s_2 + // that is, phi(s_1, c) and phi(s_2, c) reside in two different p_i's (partitions) + // then return {s_1, s_2} + + map t_i; + for (StateId s: S) + { + if (const optional t = dfa_.delta(s, c); t.has_value()) + t_i[partitionId(*t)].push_back(s); + else + t_i[-1].push_back(s); + } + if (t_i.size() > 1) + { + DEBUG("split: {} on character '{}' into {} sets", to_string(S), (char) c, t_i.size()); + PartitionVec result; + for (auto&& t: t_i) + { + result.emplace_back(std::move(t.second)); + DEBUG(" partition {}: {}", t.first, t.second); + } + return result; + } + + assert(t_i.size() == 1); + + // t_i's only element thus is a reconstruction of S. + assert(t_i.begin()->second == S); + + for (StateId s: S) + { + PartitionVec result; + StateIdVec main; + + if (isMultiInitialState(s)) + result.emplace_back(StateIdVec { s }); + else + main.emplace_back(s); + + if (!main.empty()) + result.emplace_back(std::move(main)); + } + } + + DEBUG("split: no split needed for {}", to_string(S)); + return { S }; +} + +void DFAMinimizer::dumpGroups(const PartitionVec& T) +{ + DEBUG("dumping groups ({})", T.size()); + [[maybe_unused]] int groupNr = 0; + for (const auto& t: T) + { + stringstream sstr; + sstr << "{"; + for (size_t i = 0, e = t.size(); i != e; ++i) + { + if (i) + sstr << ", "; + sstr << "n" << t[i]; + } + sstr << "}"; + DEBUG("group {}: {}", groupNr, sstr.str()); + groupNr++; + } +} + +DFA DFAMinimizer::constructDFA() +{ + constructPartitions(); + return constructFromPartitions(P); +} + +MultiDFA DFAMinimizer::constructMultiDFA() +{ + constructPartitions(); + DFA dfamin = constructFromPartitions(P); + + // patch initialStates and the master-initial-state's transition symbol + MultiDFA::InitialStateMap initialStates; + for (const pair& p: initialStates_) + dfamin.removeTransition(dfamin.initialState(), static_cast(p.second)); + + for (const pair& p: initialStates_) + { + const StateId t = targetStateId(p.second); + initialStates[p.first] = t; + dfamin.setTransition(dfamin.initialState(), static_cast(t), t); + } + + return MultiDFA { std::move(initialStates), std::move(dfamin) }; +} + +void DFAMinimizer::constructPartitions() +{ + // group all accept states by their tag + for (StateId s: dfa_.acceptStates()) + { + if (auto group = findGroup(s); group != T.end()) + group->push_back(s); + else + T.push_back({ s }); + } + + // add another group for all non-accept states + T.emplace_back(dfa_.nonAcceptStates()); + + dumpGroups(T); + + PartitionVec splits; + while (P != T) + { + swap(P, T); + T.clear(); + + for (StateIdVec& p: P) + T.splice(T.end(), split(p)); + } + + // build up cache to quickly get target state ID from input DFA's state ID + targetStateIdMap_ = [&]() { + unordered_map remaps; + 
StateId p_i = 0; + for (const StateIdVec& p: P) + { + for (StateId s: p) + remaps[s] = p_i; + + p_i++; + } + return remaps; + }(); +} + +DFA DFAMinimizer::constructFromPartitions(const PartitionVec& P) const +{ + DEBUG("minimization terminated with {} unique partition sets", P.size()); + + // instanciate states + DFA dfamin; + dfamin.createStates(P.size()); + StateId p_i = 0; + for (const StateIdVec& p: P) + { + const StateId s = *p.begin(); + const StateId q = p_i; + DEBUG("Creating p{}: {} {}", + p_i, + dfa_.isAccepting(s) ? "accepting" : "rejecting", + containsInitialState(p) ? "initial" : ""); + if (optional tag = dfa_.acceptTag(s); tag.has_value()) + dfamin.setAccept(q, *tag); + + if (containsInitialState(p)) + dfamin.setInitialState(q); + + if (optional bt = containsBacktrackState(p); bt.has_value()) + dfamin.setBacktrack(p_i, targetStateId(*bt)); + + p_i++; + } + + // setup transitions + p_i = 0; + for (const StateIdVec& p: P) + { + const StateId s = *p.begin(); + for (pair const transition: dfa_.stateTransitions(s)) + { + auto const t_i = partitionId(transition.second); + DEBUG("map p{} --({})--> p{}", p_i, prettySymbol(transition.first), t_i); + dfamin.setTransition(p_i, transition.first, t_i); + } + p_i++; + } + + return dfamin; +} + +optional DFAMinimizer::containsBacktrackState(const StateIdVec& Q) const +{ + for (StateId q: Q) + if (optional t = dfa_.backtrack(q); t.has_value()) + return *t; + + return nullopt; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFAMinimizer.h b/src/regex_dfa/DFAMinimizer.h new file mode 100644 index 0000000000..0f30d06267 --- /dev/null +++ b/src/regex_dfa/DFAMinimizer.h @@ -0,0 +1,65 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class DFA; + +class DFAMinimizer +{ + public: + explicit DFAMinimizer(const DFA& dfa); + explicit DFAMinimizer(const MultiDFA& multiDFA); + + DFA constructDFA(); + MultiDFA constructMultiDFA(); + + private: + using PartitionVec = std::list; + + void constructPartitions(); + [[nodiscard]] StateIdVec nonAcceptStates() const; + [[nodiscard]] bool containsInitialState(const StateIdVec& S) const; + [[nodiscard]] bool isMultiInitialState(StateId s) const; + [[nodiscard]] PartitionVec::iterator findGroup(StateId s); + [[nodiscard]] int partitionId(StateId s) const; + [[nodiscard]] PartitionVec split(const StateIdVec& S) const; + [[nodiscard]] DFA constructFromPartitions(const PartitionVec& P) const; + [[nodiscard]] std::optional containsBacktrackState(const StateIdVec& Q) const; + + static void dumpGroups(const PartitionVec& T); + + [[nodiscard]] StateId targetStateId(StateId oldId) const + { + auto i = targetStateIdMap_.find(oldId); + assert(i != targetStateIdMap_.end()); + return i->second; + } + + private: + const DFA& dfa_; + const MultiDFA::InitialStateMap initialStates_; + const Alphabet alphabet_; + PartitionVec T; + PartitionVec P; + std::unordered_map targetStateIdMap_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotVisitor.h b/src/regex_dfa/DotVisitor.h new file mode 100644 index 0000000000..303dec8373 --- /dev/null +++ b/src/regex_dfa/DotVisitor.h @@ -0,0 +1,29 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +#include + +namespace regex_dfa +{ + +class DotVisitor +{ + public: + virtual ~DotVisitor() = default; + + virtual void start(StateId initialState) = 0; + virtual void visitNode(StateId number, bool start, bool accept) = 0; + virtual void visitEdge(StateId from, StateId to, Symbol s) = 0; + virtual void endVisitEdge(StateId from, StateId to) = 0; + virtual void end() = 0; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotWriter.cpp b/src/regex_dfa/DotWriter.cpp new file mode 100644 index 0000000000..36d98e4681 --- /dev/null +++ b/src/regex_dfa/DotWriter.cpp @@ -0,0 +1,115 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include + +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +template +static string escapeString(const StringType& str) +{ + stringstream stream_; + for (char ch: str) + { + // \t\n\r is already converted to escape sequence + switch (ch) + { + case '\\': stream_ << "\\\\"; break; + case '"': stream_ << "\\\""; break; + default: stream_ << ch; break; + } + } + return stream_.str(); +} + +void DotWriter::start(StateId initialState) +{ + initialState_ = initialState; + stream_ << "digraph {\n"; + stream_ << " rankdir=LR;\n"; + // stream_ << " label=\"" << escapeString("FA" /*TODO*/) << "\";\n"; +} + +void DotWriter::visitNode(StateId number, bool start, bool accept) +{ + if (start) + { + const string_view shape = accept ? "doublecircle" : "circle"; + stream_ << " \"\" [shape=plaintext];\n"; + stream_ << " node [shape=" << shape << ",color=red];\n"; + stream_ << " \"\" -> " << stateLabelPrefix_ << number << ";\n"; + stream_ << " node [color=black];\n"; + } + else if (accept) + { + stream_ << " node [shape=doublecircle]; " << stateLabelPrefix_ << number << ";\n"; + stream_ << " node [shape=circle,color=black];\n"; + } + else + { + // stream_ << stateLabelPrefix_ << number << ";\n"; + } +} + +void DotWriter::visitEdge(StateId /*from*/, StateId to, Symbol s) +{ + transitionGroups_[to].push_back(s); +} + +void DotWriter::endVisitEdge(StateId from, StateId to) +{ + auto& tgroup = transitionGroups_[to]; + if (!tgroup.empty()) + { + if (from == initialState_ && initialStates_ != nullptr) + { + for (Symbol s: tgroup) + { + const string label = [this, s]() { + for (const auto& p: *initialStates_) + if (p.second == static_cast(s)) + return fmt::format("<{}>", p.first); + return prettySymbol(s); + }(); + stream_ << fmt::format(" {}{} -> {}{} [label=\"{}\"];\n", + stateLabelPrefix_, + from, + stateLabelPrefix_, + to, + escapeString(label)); + } + } + else + { + string label = groupCharacterClassRanges(std::move(tgroup)); + stream_ << fmt::format(" {}{} -> {}{} [label=\"{}\"];\n", + stateLabelPrefix_, + from, + stateLabelPrefix_, + to, + escapeString(label)); + } + tgroup.clear(); + } +} + +void DotWriter::end() +{ + stream_ << "}\n"; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotWriter.h b/src/regex_dfa/DotWriter.h new file mode 100644 index 0000000000..66fa177ec5 --- /dev/null +++ b/src/regex_dfa/DotWriter.h @@ -0,0 +1,84 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class DotWriter: public DotVisitor +{ + public: + DotWriter(std::ostream& os, std::string stateLabelPrefix): + ownedStream_ {}, + stream_ { os }, + stateLabelPrefix_ { stateLabelPrefix }, + transitionGroups_ {}, + initialStates_ { nullptr }, + initialState_ { 0 } + { + } + + DotWriter(const std::string& filename, std::string stateLabelPrefix): + ownedStream_ { std::make_unique(filename) }, + stream_ { *ownedStream_.get() }, + stateLabelPrefix_ { stateLabelPrefix }, + transitionGroups_ {}, + initialStates_ { nullptr }, + initialState_ { 0 } + { + } + + DotWriter(std::ostream& os, std::string stateLabelPrefix, const MultiDFA::InitialStateMap& initialStates): + ownedStream_ {}, + stream_ { os }, + stateLabelPrefix_ { stateLabelPrefix }, + transitionGroups_ {}, + initialStates_ { &initialStates }, + initialState_ { 0 } + { + } + + DotWriter(const std::string& filename, + std::string stateLabelPrefix, + const MultiDFA::InitialStateMap& initialStates): + ownedStream_ { std::make_unique(filename) }, + stream_ { *ownedStream_.get() }, + stateLabelPrefix_ { stateLabelPrefix }, + transitionGroups_ {}, + initialStates_ { &initialStates }, + initialState_ { 0 } + { + } + + public: + void start(StateId initialState) override; + void visitNode(StateId number, bool start, bool accept) override; + void visitEdge(StateId from, StateId to, Symbol s) override; + void endVisitEdge(StateId from, StateId to) override; + void end() override; + + private: + std::unique_ptr ownedStream_; + std::ostream& stream_; + std::string stateLabelPrefix_; + std::map /*transition symbols*/> transitionGroups_; + const MultiDFA::InitialStateMap* initialStates_; + StateId initialState_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotWriter_test.cpp b/src/regex_dfa/DotWriter_test.cpp new file mode 100644 index 0000000000..4c659a1cfa --- /dev/null +++ b/src/regex_dfa/DotWriter_test.cpp @@ -0,0 +1,67 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include + +using namespace std; +using namespace regex_dfa; + +TEST_CASE("regex_DotWriter.simple") +{ + stringstream sstr; + DotWriter dw(sstr, "n"); + + dw.start(0); + dw.visitNode(0, true, true); + dw.visitEdge(0, 1, 'a'); + dw.endVisitEdge(0, 1); + + dw.visitNode(1, false, true); + dw.visitEdge(1, 1, 'b'); + dw.visitEdge(1, 1, '\r'); + dw.visitEdge(1, 1, '\n'); + dw.visitEdge(1, 1, '\t'); + dw.visitEdge(1, 1, ' '); + dw.endVisitEdge(1, 1); + dw.end(); + + REQUIRE(!sstr.str().empty()); + // just make sure it processes +} + +TEST_CASE("regex_DotWriter.multidfa_simple") +{ + stringstream sstr; + const MultiDFA::InitialStateMap mis { { "foo", 1 }, { "bar", 2 } }; + DotWriter dw(sstr, "n", mis); + + dw.start(0); + dw.visitNode(0, true, false); + dw.visitNode(1, false, true); + dw.visitNode(2, false, true); + + dw.visitEdge(0, 1, 0x01); + dw.endVisitEdge(0, 1); + + dw.visitEdge(0, 2, 0x02); + dw.endVisitEdge(0, 2); + + dw.visitEdge(1, 1, 'a'); + dw.endVisitEdge(1, 1); + + dw.visitEdge(2, 2, 'a'); + dw.endVisitEdge(2, 2); + + dw.end(); + + REQUIRE(!sstr.str().empty()); + // just make sure it processes +} diff --git a/src/regex_dfa/Lexable.h b/src/regex_dfa/Lexable.h new file mode 100644 index 0000000000..6524896b81 --- /dev/null +++ b/src/regex_dfa/Lexable.h @@ -0,0 +1,591 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include // TokenInfo: TODO: remove that header/API (inline TokenInfo here then) +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +//! Runtime exception that is getting thrown when a word could not be recognized. +struct LexerError: public std::runtime_error +{ + explicit LexerError(unsigned int offset): + std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", offset) }, + offset { offset } + { + } + + unsigned int offset; +}; + +template +class LexerIterator +{ + public: + using TokenInfo = regex_dfa::TokenInfo; + using TraceFn = std::function; + + using difference_type = long; + using value_type = TokenInfo; + using pointer = TokenInfo*; + using reference = TokenInfo&; + using iterator_category = std::forward_iterator_tag; + + enum class Eof + { + EofMark + }; + + /** + * Initializes a LexerIterator that purely marks the end of a lexically analyzed stream. + */ + explicit LexerIterator(Eof); + + /** + * Initializes a LexerIterator for a given source to be analyzed with given lexer definition . + */ + LexerIterator(const LexerDef& ld, std::istream& source, TraceFn trace = TraceFn {}); + + /** + * Retrieves the default DFA machine that is used to recognize words. + */ + [[nodiscard]] Machine defaultMachine() const noexcept; + + /** + * Sets the active deterministic finite automaton to use for recognizing words. + * + * @param machine the DFA machine to use for recognizing words. + * @return the previous Machine state. 
+ */ + Machine setMachine(Machine machine); + + [[nodiscard]] const TokenInfo& operator*() const noexcept { return currentToken_; } + [[nodiscard]] auto offset() const noexcept { return currentToken_.offset; } + [[nodiscard]] auto literal() const noexcept -> const std::string& { return currentToken_.literal; } + [[nodiscard]] auto token() const noexcept { return currentToken_.token; } + [[nodiscard]] auto name() const noexcept { return name(token()); } + + [[nodiscard]] bool operator==(const LexerIterator& rhs) const noexcept; + [[nodiscard]] bool operator!=(const LexerIterator& rhs) const noexcept; + + LexerIterator& operator++(); + LexerIterator& operator++(int); + + private: + void recognize(); + [[nodiscard]] Token recognizeOne(); + + // --------------------------------------------------------------------------------- + // state helpers + + static constexpr StateId BadState = std::numeric_limits::max(); + + [[nodiscard]] StateId getInitialState() const noexcept; + [[nodiscard]] bool isAcceptState(StateId state) const; + + /** + * Retrieves the next state for given input state and input symbol. + * + * @param currentState the current State the DFA is in to. + * @param inputSymbol the input symbol that is used for transitioning from current state to the next + * state. + * @returns the next state to transition to. + */ + [[nodiscard]] StateId delta(StateId currentState, Symbol inputSymbol) const; + + // --------------------------------------------------------------------------------- + // stream helpers + + [[nodiscard]] int currentChar() const noexcept { return currentChar_; } + [[nodiscard]] bool eof() const noexcept { return !source_->good(); } + Symbol nextChar(); + void rollback(); + + // --------------------------------------------------------------------------------- + // debugging helpers + + template + void tracef(fmt::format_string msg, Args&&... args) const; + + [[nodiscard]] const std::string& name(Token t) const; + + [[nodiscard]] std::string toString(const std::deque& stack); + [[nodiscard]] Token token(StateId s) const; + [[nodiscard]] static std::string stateName(StateId s); + + private: + const LexerDef* def_ = nullptr; + const TraceFn trace_; + std::istream* source_ = nullptr; + int eof_ = 0; // 0=No, 1=EOF_INIT, 2=EOF_FINAL + + TokenInfo currentToken_; + Machine initialStateId_ = def_ ? defaultMachine() : Machine {}; + unsigned offset_ = 0; + bool isBeginOfLine_ = true; + int currentChar_ = -1; + std::vector buffered_; +}; + +template +inline Token token(const LexerIterator& it) +{ + return it.token(); +} + +template +inline size_t offset(const LexerIterator& it) +{ + return it.offset(); +} + +template +inline const std::string& literal(const LexerIterator& it) +{ + return it.literal(); +} + +/** + * @brief Holds a lexically analyzable stream of characters with a Lexer definition. + */ +template +class Lexable +{ + public: + using TraceFn = std::function; + using iterator = LexerIterator; + using value_type = TokenInfo; + + Lexable(const LexerDef& ld, std::istream& src, TraceFn trace = TraceFn {}): + def_ { ld }, source_ { &src }, initialOffset_ { source_->tellg() }, trace_ { std::move(trace) } + { + if constexpr (!RequiresBeginOfLine) + if (def_.containsBeginOfLineStates) + throw std::invalid_argument { + "LexerDef contains a grammar that requires begin-of-line handling, but this Lexer has " + "begin-of-line support disabled." 
+ }; + } + + Lexable(const LexerDef& ld, const std::string& src, TraceFn trace = TraceFn {}): + Lexable { ld, std::make_unique(src), std::move(trace) } + { + } + + Lexable(const LexerDef& ld, std::unique_ptr&& src, TraceFn trace = TraceFn {}): + Lexable(ld, *src, std::move(trace)) + { + ownedSource_ = std::move(src); + } + + auto begin() const + { + source_->clear(); + source_->seekg(initialOffset_, std::ios::beg); + return iterator { def_, *source_, trace_ }; + } + + auto end() const { return iterator { iterator::Eof::EofMark }; } + + private: + const LexerDef& def_; + std::unique_ptr ownedSource_; + std::istream* source_; + std::streamoff initialOffset_; + TraceFn trace_; +}; + +template +inline auto begin(const Lexable& ls) +{ + return ls.begin(); +} + +template +inline auto end(const Lexable& ls) +{ + return ls.end(); +} + +// {{{ LexerIterator: impl +template +LexerIterator::LexerIterator(Eof): eof_ { 2 } +{ +} + +template +LexerIterator::LexerIterator(const LexerDef& ld, + std::istream& source, + TraceFn trace): + def_ { &ld }, trace_ { std::move(trace) }, source_ { &source } +{ + recognize(); +} + +template +Machine LexerIterator::defaultMachine() const noexcept +{ + auto i = def_->initialStates.find("INITIAL"); + assert(i != def_->initialStates.end()); + return static_cast(i->second); +} + +template +Machine LexerIterator::setMachine(Machine machine) +{ + return initialStateId_ = static_cast(machine); +} + +template +bool LexerIterator::operator==( + const LexerIterator& rhs) const noexcept +{ + return offset_ == rhs.offset_ || (eof_ == 2 && rhs.eof_ == 2); +} + +template +bool LexerIterator::operator!=( + const LexerIterator& rhs) const noexcept +{ + return !(*this == rhs); +} + +template +LexerIterator& LexerIterator::operator++() +{ + if (eof()) + eof_++; + + recognize(); + return *this; +} + +template +LexerIterator& LexerIterator::operator++(int) +{ + if (eof()) + eof_++; + + recognize(); + return *this; +} + +template +inline void LexerIterator::recognize() +{ + for (;;) + if (Token tag = recognizeOne(); static_cast(tag) != IgnoreTag) + return; +} + +template +inline Token LexerIterator::recognizeOne() +{ + // init + currentToken_.offset = offset_; + currentToken_.literal.clear(); + + StateId state = getInitialState(); + std::deque stack; + stack.push_back(BadState); + + if constexpr (Trace) + tracef("recognizeOne: startState {}, offset {} {}", + stateName(state), + offset_, + isBeginOfLine_ ? "BOL" : "no-BOL"); + + // advance + while (state != ErrorState) + { + Symbol ch = nextChar(); // one of: input character, ERROR or EOF + currentToken_.literal.push_back(ch); + + // we do not stack.clear() stack if isAcceptState(state) as we need this information iff + // lookahead is required. Otherwise we could clear here (for space savings) + + stack.push_back(state); + state = delta(state, ch); + } + + // backtrack to last (right-most) accept state + while (state != BadState && !isAcceptState(state)) + { + if constexpr (Trace) + tracef("recognizeOne: backtrack: current state {} {}; stack: {}", + stateName(state), + isAcceptState(state) ? 
"accepting" : "non-accepting", + toString(stack)); + + state = stack.back(); + stack.pop_back(); + if (!currentToken_.literal.empty()) + { + rollback(); + currentToken_.literal.resize(currentToken_.literal.size() - 1); + } + } + + // backtrack to right-most non-lookahead position in input stream + if (auto i = def_->backtrackingStates.find(state); i != def_->backtrackingStates.end()) + { + const StateId tmp = state; + const StateId backtrackState = i->second; + if constexpr (Trace) + tracef("recognize: backtracking from {} to {}; stack: {}", + stateName(state), + stateName(backtrackState), + toString(stack)); + while (!stack.empty() && state != backtrackState) + { + state = stack.back(); + stack.pop_back(); + if constexpr (Trace) + tracef("recognize: backtrack: state {}", stateName(state)); + if (!currentToken_.literal.empty()) + { + rollback(); + currentToken_.literal.resize(currentToken_.literal.size() - 1); + } + } + state = tmp; + } + + if constexpr (Trace) + tracef("recognize: final state {} {} {} {}-{} {} [currentChar: {}]", + stateName(state), + isAcceptState(state) ? "accepting" : "non-accepting", + isAcceptState(state) ? name(token(state)) : std::string(), + currentToken_.offset, + offset_, + quotedString(currentToken_.literal), + prettySymbol(currentChar_)); + + if (!isAcceptState(state)) + throw LexerError { offset_ }; + + auto i = def_->acceptStates.find(state); + assert(i != def_->acceptStates.end() && "Accept state hit, but no tag assigned."); + isBeginOfLine_ = currentToken_.literal.back() == '\n'; + + return currentToken_.token = static_cast(i->second); +} + +template +inline StateId LexerIterator::getInitialState() const noexcept +{ + if constexpr (RequiresBeginOfLine) + if (isBeginOfLine_ && def_->containsBeginOfLineStates) + return static_cast(initialStateId_) + 1; + + return static_cast(initialStateId_); +} + +template +inline bool LexerIterator::isAcceptState(StateId id) const +{ + return def_->acceptStates.find(id) != def_->acceptStates.end(); +} + +template +StateId LexerIterator::delta(StateId currentState, + Symbol inputSymbol) const +{ + const StateId nextState = def_->transitions.apply(currentState, inputSymbol); + if constexpr (Trace) + { + if (isAcceptState(nextState)) + tracef("recognize: state {:>4} --{:-^7}--> {:<6} (accepting: {})", + stateName(currentState), + prettySymbol(inputSymbol), + stateName(nextState), + name(token(nextState))); + else + tracef("recognize: state {:>4} --{:-^7}--> {:<6}", + stateName(currentState), + prettySymbol(inputSymbol), + stateName(nextState)); + } + + return nextState; +} + +template +inline Symbol LexerIterator::nextChar() +{ + if (!buffered_.empty()) + { + int ch = buffered_.back(); + currentChar_ = ch; + buffered_.resize(buffered_.size() - 1); + if constexpr (Trace) + tracef("Lexer:{}: advance (buffered) '{}'", offset_, prettySymbol(ch)); + offset_++; + return ch; + } + + if (!source_->good()) + { // EOF or I/O error + if constexpr (Trace) + tracef("Lexer:{}: advance '<<{}>>'", offset_, "EOF"); + return Symbols::EndOfFile; + } + + int ch = source_->get(); + if (ch < 0) + { + currentChar_ = Symbols::EndOfFile; + offset_++; + if constexpr (Trace) + tracef("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + return currentChar_; + } + + currentChar_ = ch; + if constexpr (Trace) + tracef("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + offset_++; + return ch; +} + +template +inline void LexerIterator::rollback() +{ + currentChar_ = currentToken_.literal.back(); + if (currentToken_.literal.back() != -1) + { + 
offset_--; + buffered_.push_back(static_cast(static_cast(currentToken_.literal.back()))); + tracef("Lexer:{}: rollback '{}'", offset_, prettySymbol(buffered_.back())); + } +} + +// ================================================================================= + +template +template +inline void LexerIterator::tracef(fmt::format_string msg, + Args&&... args) const +{ + if constexpr (Trace) + if (trace_) + trace_(fmt::format(msg, std::forward(args)...)); +} + +template +inline const std::string& LexerIterator::name(Token t) const +{ + auto i = def_->tagNames.find(static_cast(t)); + assert(i != def_->tagNames.end()); + return i->second; +} + +template +inline std::string LexerIterator::toString( + const std::deque& stack) +{ + std::stringstream sstr; + sstr << "{"; + int i = 0; + for (const auto s: stack) + { + if (i) + sstr << ","; + sstr << stateName(s); + i++; + } + + sstr << "}"; + return sstr.str(); +} + +template +Token LexerIterator::token(StateId s) const +{ + auto i = def_->acceptStates.find(s); + assert(i != def_->acceptStates.end()); + return static_cast(i->second); +} + +template +inline std::string LexerIterator::stateName(StateId s) +{ + switch (s) + { + case BadState: return "Bad"; + case ErrorState: return "Error"; + default: return fmt::format("n{}", std::to_string(s)); + } +} +// }}} + +} // namespace regex_dfa + +namespace std +{ +template +struct iterator_traits> +{ + using iterator = regex_dfa::LexerIterator; + + using difference_type = typename iterator::difference_type; + using value_type = typename iterator::value_type; + using pointer = typename iterator::pointer; + using reference = typename iterator::reference; + using iterator_category = typename iterator::iterator_category; +}; +} // namespace std + +namespace fmt +{ +template +struct formatter> +{ + using TokenInfo = regex_dfa::TokenInfo; + using LexerIterator = regex_dfa::LexerIterator; + + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const LexerIterator& v, FormatContext& ctx) + { + return fmt::format_to(ctx.out(), "{} ({})", v.literal(), v.name()); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/Lexer-inl.h b/src/regex_dfa/Lexer-inl.h new file mode 100644 index 0000000000..25b2aaae11 --- /dev/null +++ b/src/regex_dfa/Lexer-inl.h @@ -0,0 +1,331 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +static inline std::string quotedString(const std::string& s) +{ + std::stringstream sstr; + sstr << std::quoted(s); + return sstr.str(); +} + +template +inline Lexer::Lexer(const LexerDef& info, DebugLogger logger): + def_ { info }, + debug_ { logger }, + initialStateId_ { defaultMachine() }, + word_ {}, + ownedStream_ {}, + stream_ { nullptr }, + oldOffset_ { 0 }, + offset_ { 0 }, + fileSize_ { 0 }, + isBeginOfLine_ { true }, + token_ { 0 } +{ + if constexpr (!RequiresBeginOfLine) + if (def_.containsBeginOfLineStates) + throw std::invalid_argument { + "LexerDef contains a grammar that requires begin-of-line handling, but this Lexer has " + "begin-of-line support disabled." 
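// The unit tests in Lexer_test.cpp below drive this API end to end; as a compact
// reference, a hedged usage sketch of Lexable/LexerIterator. Header paths, the
// defaulted template arguments, and the <<EOF>> rule spelling are assumptions.
#include <regex_dfa/Compiler.h>
#include <regex_dfa/Lexable.h>

#include <iostream>

enum class Tok { Eof = 1, Number, Ident }; // tags follow rule order; (ignore) rules get no tag

int main()
{
    using namespace regex_dfa;

    Compiler cc;
    cc.parse(R"(
        Space(ignore) ::= [\s\t\n]+
        Eof           ::= <<EOF>>
        Number        ::= [0-9]+
        Ident         ::= [a-z]+
    )");
    const LexerDef ld = cc.compile();

    const Lexable<Tok> src { ld, "foo 42 bar" };
    for (auto i = begin(src); token(i) != Tok::Eof; ++i)
        std::cout << literal(i) << " @ " << offset(i) << '\n';
}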
+ }; +} + +template +inline Lexer::Lexer(const LexerDef& info, + std::unique_ptr stream, + DebugLogger logger): + Lexer { info, std::move(logger) } +{ + reset(std::move(stream)); +} + +template +inline Lexer::Lexer(const LexerDef& info, + std::istream& stream, + DebugLogger logger): + Lexer { info, std::move(logger) } +{ + stream_ = &stream; + fileSize_ = getFileSize(); +} + +template +inline Lexer::Lexer(const LexerDef& info, + std::string input, + DebugLogger logger): + Lexer { info, std::move(logger) } +{ + reset(std::make_unique(std::move(input))); +} + +template +inline void Lexer::reset(std::unique_ptr stream) +{ + ownedStream_ = std::move(stream); + stream_ = ownedStream_.get(); + oldOffset_ = 0; + offset_ = 0; + isBeginOfLine_ = true; + fileSize_ = getFileSize(); +} + +template +inline void Lexer::reset(const std::string& text) +{ + reset(std::make_unique(text)); +} + +template +inline size_t Lexer::getFileSize() +{ + std::streamoff oldpos = stream_->tellg(); + stream_->seekg(0, stream_->end); + + std::streamoff theSize = stream_->tellg(); + stream_->seekg(oldpos, stream_->beg); + + return static_cast(theSize); +} + +template +inline std::string Lexer::stateName(StateId s, std::string_view n) +{ + switch (s) + { + case BadState: return "Bad"; + case ErrorState: return "Error"; + default: return fmt::format("{}{}", n, std::to_string(s)); + } +} + +template +inline std::string Lexer::toString( + const std::deque& stack) +{ + std::stringstream sstr; + sstr << "{"; + int i = 0; + for (const auto s: stack) + { + if (i) + sstr << ","; + sstr << stateName(s); + i++; + } + + sstr << "}"; + return sstr.str(); +} + +template +inline auto Lexer::recognize() -> TokenInfo +{ + for (;;) + if (Token tag = recognizeOne(); static_cast(tag) != IgnoreTag) + return TokenInfo { tag, word_, oldOffset_ }; +} + +template +inline StateId Lexer::getInitialState() const noexcept +{ + if constexpr (RequiresBeginOfLine) + { + if (isBeginOfLine_ && def_.containsBeginOfLineStates) + { + return static_cast(initialStateId_) + 1; + } + } + + return static_cast(initialStateId_); +} + +template +inline Token Lexer::recognizeOne() +{ + // init + oldOffset_ = offset_; + word_.clear(); + StateId state = getInitialState(); + std::deque stack; + stack.push_back(BadState); + + if constexpr (Debug) + debugf("recognize: startState {}, offset {} {}", + stateName(state), + offset_, + isBeginOfLine_ ? "BOL" : "no-BOL"); + + // advance + while (state != ErrorState) + { + Symbol ch = nextChar(); // one of: input character, ERROR or EOF + word_.push_back(ch); + + // we do not stack.clear() stack if isAcceptState(state) as we need this information iff + // lookahead is required. Otherwise we could clear here (for space savings) + + stack.push_back(state); + state = delta(state, ch); + } + + // backtrack to last (right-most) accept state + while (state != BadState && !isAcceptState(state)) + { + if constexpr (Debug) + debugf("recognize: backtrack: current state {} {}; stack: {}", + stateName(state), + isAcceptState(state) ? 
"accepting" : "non-accepting", + toString(stack)); + + state = stack.back(); + stack.pop_back(); + if (!word_.empty()) + { + rollback(); + word_.resize(word_.size() - 1); + } + } + + // backtrack to right-most non-lookahead position in input stream + if (auto i = def_.backtrackingStates.find(state); i != def_.backtrackingStates.end()) + { + const StateId tmp = state; + const StateId backtrackState = i->second; + if constexpr (Debug) + debugf("recognize: backtracking from {} to {}; stack: {}", + stateName(state), + stateName(backtrackState), + toString(stack)); + while (!stack.empty() && state != backtrackState) + { + state = stack.back(); + stack.pop_back(); + if constexpr (Debug) + debugf("recognize: backtrack: state {}", stateName(state)); + if (!word_.empty()) + { + rollback(); + word_.resize(word_.size() - 1); + } + } + state = tmp; + } + + if constexpr (Debug) + debugf("recognize: final state {} {} {} {}-{} {} [currentChar: {}]", + stateName(state), + isAcceptState(state) ? "accepting" : "non-accepting", + isAcceptState(state) ? name(token(state)) : std::string(), + oldOffset_, + offset_, + quotedString(word_), + prettySymbol(currentChar_)); + + if (!isAcceptState(state)) + throw LexerError { offset_ }; + + auto i = def_.acceptStates.find(state); + assert(i != def_.acceptStates.end() && "Accept state hit, but no tag assigned."); + isBeginOfLine_ = word_.back() == '\n'; + return token_ = static_cast(i->second); +} + +template +inline StateId Lexer::delta(StateId currentState, + Symbol inputSymbol) const +{ + const StateId nextState = def_.transitions.apply(currentState, inputSymbol); + if constexpr (Debug) + { + if (isAcceptState(nextState)) + { + debugf("recognize: state {:>4} --{:-^7}--> {:<6} (accepting: {})", + stateName(currentState), + prettySymbol(inputSymbol), + stateName(nextState), + name(token(nextState))); + } + else + { + debugf("recognize: state {:>4} --{:-^7}--> {:<6}", + stateName(currentState), + prettySymbol(inputSymbol), + stateName(nextState)); + } + } + + return nextState; +} + +template +inline bool Lexer::isAcceptState(StateId state) const noexcept +{ + return def_.acceptStates.find(state) != def_.acceptStates.end(); +} + +template +inline Symbol Lexer::nextChar() +{ + if (!buffered_.empty()) + { + int ch = buffered_.back(); + currentChar_ = ch; + buffered_.resize(buffered_.size() - 1); + if constexpr (Debug) + debugf("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + offset_++; + return ch; + } + + if (!stream_->good()) + { // EOF or I/O error + if constexpr (Debug) + debugf("Lexer:{}: advance '{}'", offset_, "EOF"); + return Symbols::EndOfFile; + } + + int ch = stream_->get(); + if (ch < 0) + { + currentChar_ = Symbols::EndOfFile; + offset_++; + if constexpr (Debug) + debugf("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + return currentChar_; + } + + currentChar_ = ch; + if constexpr (Debug) + debugf("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + offset_++; + return ch; +} + +template +inline void Lexer::rollback() +{ + currentChar_ = word_.back(); + if (word_.back() != -1) + { + offset_--; + buffered_.push_back(word_.back()); + } +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/Lexer.h b/src/regex_dfa/Lexer.h new file mode 100644 index 0000000000..769ce34cad --- /dev/null +++ b/src/regex_dfa/Lexer.h @@ -0,0 +1,289 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in 
compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +template +struct TokenInfo +{ + Token token; + std::string literal; + size_t offset; + + operator Token() const noexcept { return token; } + + friend bool operator==(const TokenInfo& a, Token b) noexcept { return a.token == b; } + friend bool operator!=(const TokenInfo& a, Token b) noexcept { return a.token != b; } + friend bool operator==(Token a, const TokenInfo& b) noexcept { return b == a; } + friend bool operator!=(Token a, const TokenInfo& b) noexcept { return b != a; } +}; + +template +[[nodiscard]] inline Token token(const TokenInfo& it) +{ + return it.token; +} + +template +[[nodiscard]] inline size_t offset(const TokenInfo& it) +{ + return it.offset; +} + +template +[[nodiscard]] inline const std::string& literal(const TokenInfo& it) +{ + return it.literal; +} + +template +[[nodiscard]] inline const std::string& to_string(const TokenInfo& info) noexcept +{ + return info.literal; +} + +/** + * Lexer API for recognizing words. + */ +template +class Lexer +{ + public: + using value_type = Token; + using DebugLogger = std::function; + using TokenInfo = regex_dfa::TokenInfo; + + //! Constructs the Lexer with the given information table. + explicit Lexer(const LexerDef& info, DebugLogger logger = DebugLogger {}); + + //! Constructs the Lexer with the given information table and input stream. + Lexer(const LexerDef& info, std::unique_ptr input, DebugLogger logger = DebugLogger {}); + + //! Constructs the Lexer with the given information table and input stream. + Lexer(const LexerDef& info, std::istream& input, DebugLogger logger = DebugLogger {}); + + //! Constructs the Lexer with the given information table and input stream. + Lexer(const LexerDef& info, std::string input, DebugLogger logger = DebugLogger {}); + + /** + * Open given input stream. + */ + void reset(std::unique_ptr input); + void reset(const std::string& input); + + /** + * Recognizes one token (ignored patterns are skipped). + */ + [[nodiscard]] TokenInfo recognize(); + + /** + * Recognizes one token, regardless of it is to be ignored or not. + */ + [[nodiscard]] Token recognizeOne(); + + //! the underlying word of the currently recognized token + [[nodiscard]] const std::string& word() const { return word_; } + + //! @returns the absolute offset of the file the lexer is currently reading from. + [[nodiscard]] std::pair offset() const noexcept + { + return std::make_pair(oldOffset_, offset_); + } + + //! @returns the last recognized token. + [[nodiscard]] Token token() const noexcept { return token_; } + + //! @returns the name of the current token. + [[nodiscard]] const std::string& name() const { return name(token_); } + + //! @returns the name of the token represented by Token @p t. + [[nodiscard]] const std::string& name(Token t) const + { + auto i = def_.tagNames.find(static_cast(t)); + assert(i != def_.tagNames.end()); + return i->second; + } + + /** + * Retrieves the next state for given input state and input symbol. + * + * @param currentState the current State the DFA is in to. + * @param inputSymbol the input symbol that is used for transitioning from current state to the next + * state. + * @returns the next state to transition to. 
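// Both recognizeOne() implementations above follow the same maximal-munch scheme:
// advance the DFA until it rejects, then unwind the recorded state stack (and the
// input position) back to the right-most accepting state. A self-contained sketch of
// that control flow; DfaTable is a hypothetical stand-in, not part of regex_dfa.
#include <cstddef>
#include <deque>
#include <optional>
#include <string>
#include <string_view>

struct DfaTable
{
    int (*delta)(int state, char symbol); // returns -1 for the error state
    bool (*accepts)(int state);           // must return false for the error state (-1)
};

std::optional<std::string> longestMatch(const DfaTable& dfa, std::string_view input, int initialState)
{
    constexpr int BadState = -2; // sentinel below the real states, like Lexer's BadState
    std::deque<int> stack { BadState };
    std::string word;
    std::size_t pos = 0;
    int state = initialState;

    // advance: consume symbols until the DFA rejects (or the input runs out)
    while (state != -1 && pos < input.size())
    {
        const char ch = input[pos++];
        word.push_back(ch);
        stack.push_back(state); // remember the state *before* the transition
        state = dfa.delta(state, ch);
    }

    // backtrack: pop states and shrink the word until the right-most accepting state
    while (state != BadState && !dfa.accepts(state))
    {
        state = stack.back();
        stack.pop_back();
        if (!word.empty())
        {
            word.pop_back(); // corresponds to rollback(): push the symbol back to the input
            --pos;
        }
    }

    if (state == BadState)
        return std::nullopt; // recognizeOne() throws LexerError at this point
    return word;
}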
+ */ + [[nodiscard]] inline StateId delta(StateId currentState, Symbol inputSymbol) const; + + /** + * Sets the active deterministic finite automaton to use for recognizing words. + * + * @param machine the DFA machine to use for recognizing words. + */ + Machine setMachine(Machine machine) + { + auto const oldMachine = initialStateId_; + // since Machine is a 1:1 mapping into the State's ID, we can simply cast here. + initialStateId_ = static_cast(machine); + return oldMachine; + } + + /** + * Retrieves the default DFA machine that is used to recognize words. + */ + [[nodiscard]] Machine defaultMachine() const + { + auto i = def_.initialStates.find("INITIAL"); + assert(i != def_.initialStates.end()); + return static_cast(i->second); + } + + /** + * Runtime exception that is getting thrown when a word could not be recognized. + */ + struct LexerError: public std::runtime_error + { + LexerError(unsigned int offset): + std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", offset) }, + offset { offset } + { + } + + unsigned int offset; + }; + + struct iterator // NOLINT(readability-identifier-naming) + { + Lexer& lx; + int end; + TokenInfo info; + + const TokenInfo& operator*() const { return info; } + + iterator& operator++() + { + if (lx.eof()) + ++end; + + info = lx.recognize(); + + return *this; + } + + iterator& operator++(int) { return ++*this; } + bool operator==(const iterator& rhs) const noexcept { return end == rhs.end; } + bool operator!=(const iterator& rhs) const noexcept { return !(*this == rhs); } + }; + + iterator begin() + { + const Token t = recognize(); + return iterator { *this, 0, TokenInfo { t, word() } }; + } + + iterator end() { return iterator { *this, 2, TokenInfo { 0, "" } }; } + + [[nodiscard]] bool eof() const { return !stream_->good(); } + + [[nodiscard]] size_t fileSize() const noexcept { return fileSize_; } + + private: + template + inline void debugf(const char* msg, Args... 
args) const + { + if constexpr (Debug) + if (debug_) + debug_(fmt::format(msg, args...)); + } + + [[nodiscard]] Symbol nextChar(); + void rollback(); + [[nodiscard]] StateId getInitialState() const noexcept; + [[nodiscard]] bool isAcceptState(StateId state) const noexcept; + [[nodiscard]] static std::string stateName(StateId s, std::string_view n = "n"); + static constexpr StateId BadState = 101010; + [[nodiscard]] std::string toString(const std::deque& stack); + + [[nodiscard]] int currentChar() const noexcept { return currentChar_; } + + [[nodiscard]] Token token(StateId s) const + { + auto i = def_.acceptStates.find(s); + assert(i != def_.acceptStates.end()); + return static_cast(i->second); + } + + [[nodiscard]] size_t getFileSize(); + + private: + const LexerDef& def_; + const DebugLogger debug_; + + Machine initialStateId_; + std::string word_; + std::unique_ptr ownedStream_; + std::istream* stream_; + std::vector buffered_; + unsigned oldOffset_; + unsigned offset_; + size_t fileSize_; // cache + bool isBeginOfLine_; + int currentChar_; + Token token_; +}; + +template +inline const std::string& to_string( + const typename Lexer::iterator& it) noexcept +{ + return it.info.literal; +} + +} // namespace regex_dfa + +namespace fmt +{ +template +struct formatter> +{ + using TokenInfo = regex_dfa::TokenInfo; + + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const TokenInfo& v, FormatContext& ctx) + { + return fmt::format_to(ctx.out(), "{}", v.literal); + } +}; +} // namespace fmt +#include diff --git a/src/regex_dfa/LexerDef.h b/src/regex_dfa/LexerDef.h new file mode 100644 index 0000000000..3a774827c2 --- /dev/null +++ b/src/regex_dfa/LexerDef.h @@ -0,0 +1,87 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include + +#include +#include +#include + +namespace regex_dfa +{ + +// special tags +constexpr Tag IgnoreTag = static_cast(-1); +constexpr Tag FirstUserTag = 1; + +using AcceptStateMap = std::map; + +//! defines a mapping between accept state ID and another (prior) ID to track roll back the input stream to. 
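// A hedged sketch of driving the pull-style Lexer API from Lexer.h above: recognize()
// skips (ignore) rules and returns one TokenInfo per call. Template-argument defaults
// and the header path are assumptions, and the grammar must assign its Eof rule tag 1
// for Tok::Eof to line up.
#include <regex_dfa/Lexer.h>

#include <iostream>
#include <string>
#include <utility>

enum class Tok { Eof = 1, Word };

void dumpTokens(const regex_dfa::LexerDef& ld, std::string input)
{
    regex_dfa::Lexer<Tok> lexer { ld, std::move(input) };
    for (;;)
    {
        const auto info = lexer.recognize(); // TokenInfo { token, literal, offset }
        if (info.token == Tok::Eof)
            break;
        std::cout << lexer.name(info.token) << ": " << info.literal << " @ " << info.offset << '\n';
    }
}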
+using BacktrackingMap = std::map; + +struct LexerDef +{ + std::map initialStates; + bool containsBeginOfLineStates; + TransitionMap transitions; + AcceptStateMap acceptStates; + BacktrackingMap backtrackingStates; + std::map tagNames; + + [[nodiscard]] std::string to_string() const; + + [[nodiscard]] bool isValidTag(Tag t) const noexcept { return tagNames.find(t) != tagNames.end(); } + + [[nodiscard]] std::string tagName(Tag t) const + { + auto i = tagNames.find(t); + assert(i != tagNames.end()); + return i->second; + } +}; + +inline std::string LexerDef::to_string() const +{ + std::stringstream sstr; + + sstr << fmt::format("initializerStates:\n"); + for (const std::pair q0: initialStates) + sstr << fmt::format(" {}: {}\n", q0.first, q0.second); + sstr << fmt::format("totalStates: {}\n", transitions.states().size()); + + sstr << "transitions:\n"; + for (StateId inputState: transitions.states()) + { + std::map> T; + for (const std::pair p: transitions.map(inputState)) + { + T[p.second].push_back(p.first); + } + for (auto& t: T) + { + sstr << fmt::format( + "- n{} --({})--> n{}\n", inputState, groupCharacterClassRanges(std::move(t.second)), t.first); + } + } + + sstr << "accepts:\n"; + for (const std::pair a: acceptStates) + sstr << fmt::format("- n{} to {} ({})\n", a.first, a.second, tagName(a.second)); + + if (!backtrackingStates.empty()) + { + sstr << "backtracking:\n"; + for (const std::pair bt: backtrackingStates) + sstr << fmt::format("- n{} to n{}\n", bt.first, bt.second); + } + + return sstr.str(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/Lexer_test.cpp b/src/regex_dfa/Lexer_test.cpp new file mode 100644 index 0000000000..1bc732eb28 --- /dev/null +++ b/src/regex_dfa/Lexer_test.cpp @@ -0,0 +1,602 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include +#include +#include +#include + +#include + +using namespace std; +using namespace regex_dfa; +using namespace regex_dfa::util::literals; + +/* FEATURE UNITTEST CHECKLIST: + * + * - [ ] concatenation + * - [ ] alternation + * - [ ] {n} + * - [ ] {m,n} + * - [ ] {m,} + * - [ ] ? 
+ * - [ ] character class, [a-z], [a-z0-9] + * - [ ] character class by name, such as [[:upper:]] + * - [ ] inverted character class, [^a-z], [^a-z0-9] + * - [ ] generic lookahead r/s + * - [ ] EOL lookahead r$ + * - [ ] BOL lookbehind ^r + */ + +const string RULES = R"( + Space(ignore) ::= [\s\t\n]+ + Eof ::= <> + ABBA ::= abba + AB_CD ::= ab/cd + CD ::= cd + CDEF ::= cdef + EOL_LF ::= eol$ + XAnyLine ::= x.* +)"; + +enum class LookaheadToken +{ + Eof = 1, + ABBA, + AB_CD, // NOLINT(readability-identifier-naming) + CD, + CDEF, + EOL_LF, // NOLINT(readability-identifier-naming) + XAnyLine +}; + +namespace fmt +{ // it sucks that I've to specify that here +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const LookaheadToken& v, FormatContext& ctx) + { + switch (v) + { + case LookaheadToken::Eof: return fmt::format_to(ctx.out(), "Eof"); + case LookaheadToken::ABBA: return fmt::format_to(ctx.out(), "abba"); + case LookaheadToken::AB_CD: return fmt::format_to(ctx.out(), "ab/cd"); + case LookaheadToken::CD: return fmt::format_to(ctx.out(), "cd"); + case LookaheadToken::CDEF: return fmt::format_to(ctx.out(), "cdef"); + case LookaheadToken::EOL_LF: return fmt::format_to(ctx.out(), "eol$"); + case LookaheadToken::XAnyLine: return fmt::format_to(ctx.out(), ""); + default: return fmt::format_to(ctx.out(), "<{}>", static_cast(v)); + } + } +}; +} // namespace fmt + +TEST_CASE("regex_Lexer.lookahead") +{ + Compiler cc; + cc.parse(RULES); + + LexerDef const lexerDef = cc.compile(); + CAPTURE(lexerDef.to_string()); + Lexable ls { lexerDef, "abba abcdef", [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + + REQUIRE(LookaheadToken::ABBA == *lexer); + REQUIRE(LookaheadToken::AB_CD == *++lexer); + REQUIRE(LookaheadToken::CDEF == *++lexer); + REQUIRE(LookaheadToken::Eof == *++lexer); + REQUIRE(end(ls) == ++lexer); +} + +TEST_CASE("regex_Lexable.one") +{ + Compiler cc; + cc.parse(RULES); + + LexerDef const ld = cc.compile(); + CAPTURE(ld.to_string()); + auto src = Lexable { ld, + make_unique("abba abcdef"), + [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(src); + auto eof = end(src); + + REQUIRE(lexer != eof); + CHECK(LookaheadToken::ABBA == token(lexer)); + CHECK(0 == offset(lexer)); + + ++lexer; + CHECK(LookaheadToken::AB_CD == token(lexer)); + CHECK(5 == offset(lexer)); + + ++lexer; + CHECK(LookaheadToken::CDEF == token(lexer)); + CHECK(7 == offset(lexer)); + + ++lexer; + CHECK(LookaheadToken::Eof == token(lexer)); + CHECK(11 == offset(lexer)); + + ++lexer; + REQUIRE(!(lexer != eof)); // TODO: make that work +} + +TEST_CASE("regex_Lexer.LexerError") +{ + Compiler cc; + cc.parse(RULES); + + const LexerDef ld = cc.compile(); + Lexable ls { ld, "invalid" }; + CHECK_THROWS_AS((void) begin(ls), LexerError); +} + +TEST_CASE("regex_Lexer.evaluateDotToken") +{ + Compiler cc; + cc.parse(RULES); + + const LexerDef ld = cc.compile(); + Lexable ls { ld, "xanything" }; + auto lexer = begin(ls); + + REQUIRE(LookaheadToken::XAnyLine == *lexer); + REQUIRE(LookaheadToken::Eof == *++lexer); +} + +TEST_CASE("regex_Lexer.match_eol") +{ + Compiler cc; + cc.parse(RULES); + + LexerDef ld = cc.compile(); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); + Lexable ls { ld, "abba eol\nabba", [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + + REQUIRE(LookaheadToken::ABBA == *lexer); + CHECK(0 == offset(lexer)); + + REQUIRE(LookaheadToken::EOL_LF == *++lexer); + 
CHECK(5 == offset(lexer)); + + REQUIRE(LookaheadToken::ABBA == *++lexer); + CHECK(9 == offset(lexer)); + + REQUIRE(LookaheadToken::Eof == *++lexer); +} + +TEST_CASE("regex_Lexer.bol") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test ::= test + |Unknown ::= . + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + Lexable ls { ld, "pragma", [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + REQUIRE(1 == *lexer); // ^pragma + REQUIRE(4 == *++lexer); // EOS +} + +TEST_CASE("regex_Lexer.bol_no_match") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test ::= test + |Unknown ::= . + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); + Lexable ls { ld, "test pragma", [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + REQUIRE(2 == *lexer); // test + + // pragma (char-wise) - must not be recognized as ^pragma + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + + REQUIRE(4 == *++lexer); // EOS +} + +TEST_CASE("regex_Lexer.bol_line2") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test ::= test + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); + Lexable ls { ld, "test\npragma", [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + REQUIRE(2 == *lexer); // test + REQUIRE(1 == *++lexer); // ^pragma + REQUIRE(3 == *++lexer); // EOS +} + +TEST_CASE("regex_Lexer.bol_and_other_conditions") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test ::= test + |Eof ::= <> + |Jump ::= jmp)"_multiline); + LexerDef ld = cc.compileMulti(); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); + + Lexable ls { ld, "pragma test", [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + REQUIRE(1 == *lexer); // ^pragma + REQUIRE(2 == *++lexer); // test + REQUIRE(3 == *++lexer); // <> +} + +TEST_CASE("regex_Lexer.bol_rules_on_non_bol_lexer") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |Test ::= "test" + |Pragma ::= ^"pragma" + |Unknown ::= . + |)"_multiline); + + LexerDef ld = cc.compile(); + using SimpleLexer = Lexable; + CHECK_THROWS_AS(SimpleLexer(ld, "pragma"), std::invalid_argument); +} + +TEST_CASE("regex_Lexer.non_bol_rules_on_non_bol_lexer") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |Test ::= "test" + |Unknown ::= . + |)"_multiline); + + LexerDef ld = cc.compile(); + Lexable ls { ld, " test " }; + auto lexer = begin(ls); + + REQUIRE(2 == *lexer); // "test" + REQUIRE(1 == *++lexer); // <> +} + +TEST_CASE("regex_Lexer.non_bol_rules_on_bol_lexer") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |Test ::= "test" + |Unknown ::= . 
+ |)"_multiline); + + LexerDef ld = cc.compile(); + Lexable ls { ld, " test " }; + auto lexer = begin(ls); + + REQUIRE(2 == *lexer); // "test" + REQUIRE(1 == *++lexer); // <> +} + +TEST_CASE("regex_Lexer.iterator") +{ + Compiler cc; + cc.parse(make_unique(R"( + Spacing(ignore) ::= [\s\t\n]+ + A ::= a + B ::= b + Eof ::= <> + )")); + + auto const ld = cc.compile(); + auto const ls = Lexable { ld, make_unique("a b b a") }; + auto const e = ls.end(); + auto i = ls.begin(); + + // a + REQUIRE(1 == *i); + REQUIRE(i != e); + + // b + i++; + REQUIRE(2 == *i); + REQUIRE(i != e); + + // b + i++; + REQUIRE(2 == *i); + REQUIRE(i != e); + + // a + i++; + REQUIRE(1 == *i); + REQUIRE(i != e); + + // <> + i++; + REQUIRE(3 == *i); + REQUIRE(i != e); + + i++; + REQUIRE(3 == *i); // still EOF + REQUIRE(i == e); +} + +TEST_CASE("regex_Lexer.empty_alt") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Test ::= aa(bb|) + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + Lexable ls { ld, "aabb aa aabb", [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + + REQUIRE(1 == *lexer); + REQUIRE(1 == *++lexer); + REQUIRE(1 == *++lexer); + REQUIRE(2 == *++lexer); // EOF +} + +TEST_CASE("regex_Lexer.ignore_many") +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Comment(ignore) ::= #.* + |Eof ::= <> + |Foo ::= foo + |Bar ::= bar + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + Lexable ls { ld, + R"(|# some foo + |foo + | + |# some bar + |bar + |)"_multiline, + [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + + REQUIRE(2 == *lexer); + REQUIRE("foo" == literal(lexer)); + + REQUIRE(3 == *++lexer); + REQUIRE("bar" == literal(lexer)); + + REQUIRE(1 == *++lexer); // EOF +} + +TEST_CASE("regex_Lexer.realworld_ipv4") +{ + Compiler cc; + cc.parse(R"(| + |Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |IPv4Octet(ref) ::= [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] + |IPv4(ref) ::= {IPv4Octet}(\.{IPv4Octet}){3} + |IPv4Literal ::= {IPv4} + |)"_multiline); + + auto ld = cc.compile(); + auto ls = Lexable { ld, + R"(0.0.0.0 4.2.2.1 10.10.40.199 255.255.255.255)", + [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + + REQUIRE(2 == *lexer); + REQUIRE("0.0.0.0" == literal(lexer)); + + REQUIRE(2 == *++lexer); + REQUIRE("4.2.2.1" == literal(lexer)); + + REQUIRE(2 == *++lexer); + REQUIRE("10.10.40.199" == literal(lexer)); + + REQUIRE(2 == *++lexer); + REQUIRE("255.255.255.255" == literal(lexer)); + + REQUIRE(1 == *++lexer); +} + +enum class RealWorld +{ + Eof = 1, + IPv4, + IPv6 +}; +namespace fmt +{ // it sucks that I've to specify that here +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const RealWorld& v, FormatContext& ctx) + { + switch (v) + { + case RealWorld::Eof: return fmt::format_to(ctx.out(), "Eof"); + case RealWorld::IPv4: return fmt::format_to(ctx.out(), "IPv4"); + case RealWorld::IPv6: return fmt::format_to(ctx.out(), "IPv6"); + default: return fmt::format_to(ctx.out(), "<{}>", static_cast(v)); + } + } +}; +} // namespace fmt + +TEST_CASE("regex_Lexer.realworld_ipv6") +{ + Compiler cc; + cc.parse(R"(| + |Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + | + |IPv4Octet(ref) ::= [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] + |IPv4(ref) ::= {IPv4Octet}(\.{IPv4Octet}){3} + |IPv4Literal ::= {IPv4} + | + |ipv6Part(ref) ::= [[:xdigit:]]{1,4} + |IPv6 ::= {ipv6Part}(:{ipv6Part}){7,7} + | | 
({ipv6Part}:){1,7}: + | | :(:{ipv6Part}){1,7} + | | :: + | | ({ipv6Part}:){1}(:{ipv6Part}){0,6} + | | ({ipv6Part}:){2}(:{ipv6Part}){0,5} + | | ({ipv6Part}:){3}(:{ipv6Part}){0,4} + | | ({ipv6Part}:){4}(:{ipv6Part}){0,3} + | | ({ipv6Part}:){5}(:{ipv6Part}){0,2} + | | ({ipv6Part}:){6}(:{ipv6Part}){0,1} + | | ::[fF]{4}:{IPv4} + )"_multiline); + + static const string TEXT = R"(|0:0:0:0:0:0:0:0 + |1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd + |2001:0db8:85a3:0000:0000:8a2e:0370:7334 + |1234:5678:: + |0:: + |::0 + |:: + |1::3:4:5:6:7:8 + |1::4:5:6:7:8 + |1::5:6:7:8 + |1::8 + |1:2::4:5:6:7:8 + |1:2::5:6:7:8 + |1:2::8 + |::ffff:127.0.0.1 + |::ffff:c000:0280 + |)"_multiline; + + auto ld = cc.compileMulti(); + auto ls = Lexable { ld, TEXT, [](const string& msg) { + INFO(msg); + } }; + auto lexer = begin(ls); + + REQUIRE(RealWorld::IPv6 == *lexer); + REQUIRE("0:0:0:0:0:0:0:0" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("2001:0db8:85a3:0000:0000:8a2e:0370:7334" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1234:5678::" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("0::" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::0" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::3:4:5:6:7:8" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::4:5:6:7:8" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::5:6:7:8" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::8" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1:2::4:5:6:7:8" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1:2::5:6:7:8" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1:2::8" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::ffff:127.0.0.1" == literal(lexer)); + + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::ffff:c000:0280" == literal(lexer)); + + REQUIRE(RealWorld::Eof == *++lexer); +} + +TEST_CASE("regex_Lexer.internal") +{ + REQUIRE("Eof" == fmt::format("{}", LookaheadToken::Eof)); + REQUIRE("abba" == fmt::format("{}", LookaheadToken::ABBA)); + REQUIRE("ab/cd" == fmt::format("{}", LookaheadToken::AB_CD)); + REQUIRE("cd" == fmt::format("{}", LookaheadToken::CD)); + REQUIRE("cdef" == fmt::format("{}", LookaheadToken::CDEF)); + REQUIRE("eol$" == fmt::format("{}", LookaheadToken::EOL_LF)); + REQUIRE("" == fmt::format("{}", LookaheadToken::XAnyLine)); + REQUIRE("<724>" == fmt::format("{}", static_cast(724))); + + REQUIRE("Eof" == fmt::format("{}", RealWorld::Eof)); + REQUIRE("IPv4" == fmt::format("{}", RealWorld::IPv4)); + REQUIRE("IPv6" == fmt::format("{}", RealWorld::IPv6)); + REQUIRE("<724>" == fmt::format("{}", static_cast(724))); +} diff --git a/src/regex_dfa/MultiDFA.cpp b/src/regex_dfa/MultiDFA.cpp new file mode 100644 index 0000000000..208ce7f207 --- /dev/null +++ b/src/regex_dfa/MultiDFA.cpp @@ -0,0 +1,33 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +using namespace std; + +namespace regex_dfa +{ + +MultiDFA constructMultiDFA(map many) +{ + MultiDFA multiDFA {}; + multiDFA.dfa.createStates(1 + many.size()); + multiDFA.dfa.setInitialState(0); + + StateId q0 = 1; + for (pair& p: many) + { + multiDFA.dfa.append(std::move(p.second), q0); + multiDFA.initialStates[p.first] = q0; + multiDFA.dfa.setTransition(0, static_cast(q0), q0); + q0++; + } + + return multiDFA; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/MultiDFA.h b/src/regex_dfa/MultiDFA.h new file mode 100644 index 0000000000..76a30c0907 --- /dev/null +++ b/src/regex_dfa/MultiDFA.h @@ -0,0 +1,29 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include +#include + +namespace regex_dfa +{ + +struct MultiDFA +{ + using InitialStateMap = std::map; + + InitialStateMap initialStates; + DFA dfa; +}; + +MultiDFA constructMultiDFA(std::map many); + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFA.cpp b/src/regex_dfa/NFA.cpp new file mode 100644 index 0000000000..f8674b980a --- /dev/null +++ b/src/regex_dfa/NFA.cpp @@ -0,0 +1,375 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include + +#include +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) 
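// constructMultiDFA() above stitches several compiled DFAs into one automaton: state 0
// becomes a synthetic initial state with one distinct transition symbol per condition,
// and initialStates records where each named sub-DFA begins. A hedged sketch that merges
// two already-compiled DFAs (the header path is an assumption):
#include <regex_dfa/MultiDFA.h>

#include <map>
#include <string>
#include <utility>

regex_dfa::MultiDFA mergeConditions(regex_dfa::DFA initial, regex_dfa::DFA comment)
{
    using regex_dfa::DFA;
    using regex_dfa::MultiDFA;

    std::map<std::string, DFA> perCondition;
    perCondition.emplace("INITIAL", std::move(initial));
    perCondition.emplace("COMMENT", std::move(comment));

    MultiDFA merged = regex_dfa::constructMultiDFA(std::move(perCondition));
    // merged.initialStates.at("INITIAL") / .at("COMMENT") now name the entry states
    // reachable from DFA state 0 via the synthetic transition symbols.
    return merged;
}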
\ + do \ + { \ + } while (0) +#endif + +Alphabet NFA::alphabet() const +{ + Alphabet alphabet; + + for (const TransitionMap& transitions: states_) + { + for (auto const& t: transitions) + { + switch (t.first) + { + case Symbols::Epsilon: break; + default: alphabet.insert(t.first); + } + } + } + + return alphabet; +} + +NFA NFA::clone() const +{ + return *this; +} + +StateId NFA::createState() +{ + states_.emplace_back(); + return states_.size() - 1; +} + +StateIdVec NFA::delta(const StateIdVec& S, Symbol c) const +{ + StateIdVec result; + delta(S, c, &result); + return result; +} + +StateIdVec* NFA::delta(const StateIdVec& S, Symbol c, StateIdVec* result) const +{ + for (StateId s: S) + { + const TransitionMap& transitions = stateTransitions(s); + for (const auto& transition: transitions) + { + if (transition.first == c) + { + for (StateId targetState: transition.second) + { + result->push_back(targetState); + } + } + } + } + + return result; +} + +StateIdVec NFA::epsilonTransitions(StateId s) const +{ + StateIdVec t; + + const TransitionMap& transitions = stateTransitions(s); + for (auto&& [p, q]: transitions) + if (p == Symbols::Epsilon) + t.insert(t.end(), q.begin(), q.end()); + + return t; +} + +StateIdVec NFA::epsilonClosure(const StateIdVec& S) const +{ + StateIdVec eclosure; + epsilonClosure(S, &eclosure); + return eclosure; +} + +void NFA::epsilonClosure(const StateIdVec& S, StateIdVec* eclosure) const +{ + *eclosure = S; + vector availabilityCheck(1 + size(), false); + stack workList; + for (StateId s: S) + { + workList.push(s); + availabilityCheck[s] = true; + } + + while (!workList.empty()) + { + const StateId s = workList.top(); + workList.pop(); + + for (StateId t: epsilonTransitions(s)) + { + if (!availabilityCheck[t]) + { + eclosure->push_back(t); + workList.push(t); + } + } + } + + sort(eclosure->begin(), eclosure->end()); +} + +void NFA::prepareStateIds(StateId baseId) +{ + // adjust transition state IDs + // traverse through each state's transition set + // traverse through each transition in the transition set + // traverse through each element and add BASE_ID + + // for each state's transitions + for (StateId i = 0, e = size(); i != e; ++i) + { + TransitionMap& transitions = states_[i]; + + // for each vector of target-state-id per transition-symbol + for (auto t = transitions.begin(), tE = transitions.end(); t != tE; ++t) + { + StateIdVec& transition = t->second; + + // for each target state ID + for (StateId k = 0, kE = transition.size(); k != kE; ++k) + { + transition[k] += baseId; + } + } + } + + initialState_ += baseId; + acceptState_ += baseId; + + AcceptMap remapped; + for (auto& a: acceptTags_) + remapped[baseId + a.first] = a.second; + acceptTags_ = std::move(remapped); + + BacktrackingMap backtracking; + for (const auto& bt: backtrackStates_) + backtracking[baseId + bt.first] = baseId + bt.second; + backtrackStates_ = std::move(backtracking); +} + +NFA NFA::join(const map& mappings) +{ + if (mappings.size() == 1) + return mappings.begin()->second; + + NFA multi; + + for (size_t i = 0; i <= mappings.size(); ++i) + (void) multi.createState(); + + Symbol transitionSymbol = 0; + for (const auto& mapping: mappings) + { + transitionSymbol++; + + NFA rhs = mapping.second.clone(); + rhs.prepareStateIds(multi.size()); + + multi.states_.reserve(multi.size() + rhs.size()); + multi.states_.insert(multi.states_.end(), rhs.states_.begin(), rhs.states_.end()); + multi.acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + + 
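// NFA::epsilonClosure() above is a standard worklist traversal: seed the stack with S,
// follow epsilon edges, collect every state not seen before, and sort the result. The
// same idea in a generic, self-contained form; the adjacency-list parameter is an
// assumption for illustration, not the NFA's actual storage.
#include <algorithm>
#include <cstddef>
#include <stack>
#include <vector>

using StateId = std::size_t;

// epsilonEdges[s] lists the states reachable from s via a single epsilon transition.
std::vector<StateId> epsilonClosureOf(const std::vector<std::vector<StateId>>& epsilonEdges,
                                      std::vector<StateId> S)
{
    std::vector<bool> seen(epsilonEdges.size(), false);
    std::stack<StateId> work;
    for (StateId s: S)
    {
        work.push(s);
        seen[s] = true;
    }

    std::vector<StateId> closure = std::move(S);
    while (!work.empty())
    {
        const StateId s = work.top();
        work.pop();
        for (StateId t: epsilonEdges[s])
        {
            if (!seen[t])
            {
                seen[t] = true;
                closure.push_back(t);
                work.push(t);
            }
        }
    }

    std::sort(closure.begin(), closure.end());
    return closure;
}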
multi.addTransition(multi.initialState_, transitionSymbol, rhs.initialState_); + multi.backtrackStates_[rhs.acceptState_] = multi.acceptState_; + multi.acceptState_ = rhs.acceptState_; + } + + return multi; +} + +NFA& NFA::lookahead(NFA&& rhs) +{ + if (empty()) + { + *this = std::move(rhs); + backtrackStates_[acceptState_] = initialState_; + } + else + { + rhs.prepareStateIds(states_.size()); + states_.reserve(size() + rhs.size()); + states_.insert(states_.end(), rhs.states_.begin(), rhs.states_.end()); + acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + + addTransition(acceptState_, Symbols::Epsilon, rhs.initialState_); + backtrackStates_[rhs.acceptState_] = acceptState_; + acceptState_ = rhs.acceptState_; + } + + return *this; +} + +NFA& NFA::alternate(NFA&& rhs) +{ + StateId newStart = createState(); + StateId newEnd = createState(); + + rhs.prepareStateIds(states_.size()); + states_.insert(states_.end(), rhs.states_.begin(), rhs.states_.end()); + acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + backtrackStates_.insert(rhs.backtrackStates_.begin(), rhs.backtrackStates_.end()); + + addTransition(newStart, Symbols::Epsilon, initialState_); + addTransition(newStart, Symbols::Epsilon, rhs.initialState_); + + addTransition(acceptState_, Symbols::Epsilon, newEnd); + addTransition(rhs.acceptState_, Symbols::Epsilon, newEnd); + + initialState_ = newStart; + acceptState_ = newEnd; + + return *this; +} + +NFA& NFA::concatenate(NFA&& rhs) +{ + rhs.prepareStateIds(states_.size()); + states_.reserve(size() + rhs.size()); + states_.insert(states_.end(), rhs.states_.begin(), rhs.states_.end()); + acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + backtrackStates_.insert(rhs.backtrackStates_.begin(), rhs.backtrackStates_.end()); + + addTransition(acceptState_, Symbols::Epsilon, rhs.initialState_); + acceptState_ = rhs.acceptState_; + + return *this; +} + +NFA& NFA::optional() +{ + StateId newStart = createState(); + StateId newEnd = createState(); + + addTransition(newStart, Symbols::Epsilon, initialState_); + addTransition(newStart, Symbols::Epsilon, newEnd); + addTransition(acceptState_, Symbols::Epsilon, newEnd); + + initialState_ = newStart; + acceptState_ = newEnd; + + return *this; +} + +NFA& NFA::recurring() +{ + // {0, inf} + StateId newStart = createState(); + StateId newEnd = createState(); + + addTransition(newStart, Symbols::Epsilon, initialState_); + addTransition(newStart, Symbols::Epsilon, newEnd); + + addTransition(acceptState_, Symbols::Epsilon, initialState_); + addTransition(acceptState_, Symbols::Epsilon, newEnd); + + initialState_ = newStart; + acceptState_ = newEnd; + + return *this; +} + +NFA& NFA::positive() +{ + return concatenate(std::move(clone().recurring())); +} + +NFA& NFA::times(unsigned factor) +{ + assert(factor != 0); + + if (factor == 1) + return *this; + + NFA base = clone(); + for (unsigned n = 2; n <= factor; ++n) + concatenate(base.clone()); + + return *this; +} + +NFA& NFA::repeat(unsigned minimum, unsigned maximum) +{ + assert(minimum <= maximum); + + NFA factor = clone(); + + if (minimum != 0) + times(minimum); + + for (unsigned n = minimum + 1; n <= maximum; n++) + alternate(std::move(factor.clone().times(n))); + + if (minimum == 0) + optional(); + + return *this; +} + +void NFA::visit(DotVisitor& v) const +{ + v.start(initialState_); + + // initial state + v.visitNode(initialState_, true, acceptTags_.find(initialState_) != acceptTags_.end()); + + // accepting states + for (pair acceptTag: 
acceptTags_) + if (acceptTag.first != initialState_) + v.visitNode(acceptTag.first, false, true); + + // other states + for (StateId i = 0, e = size(); i != e; ++i) + if (i != initialState_ && acceptTags_.find(i) == acceptTags_.end()) + v.visitNode(i, false, false); + + // transitions + for (StateId sourceState = 0, sE = size(); sourceState != sE; ++sourceState) + { + map> reversed; + for (pair transitions: states_[sourceState]) + for (StateId targetState: transitions.second) + reversed[targetState].push_back(transitions.first /* symbol */); + + for (pair> tr: reversed) + { + StateId targetState = tr.first; + const vector& T = tr.second; + for_each(T.begin(), T.end(), [&](const Symbol t) { v.visitEdge(sourceState, targetState, t); }); + v.endVisitEdge(sourceState, targetState); + } + } + v.end(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFA.h b/src/regex_dfa/NFA.h new file mode 100644 index 0000000000..7380de8333 --- /dev/null +++ b/src/regex_dfa/NFA.h @@ -0,0 +1,221 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class Alphabet; +class DotVisitor; +class DFA; + +/** + * NFA Builder with the Thompson's Construction properties. + * + *
    + *
  • There is exactly one initial state and exactly one accepting state. + *
  • No transition other than the initial transition enters the initial state. + *
  • The accepting state has no leaving edges. + *
  • An ε-transition always connects two states that were (earlier in the construction process) + * the initial state and the accepting state of NFAs for some component REs. + *
  • Each state has at most two entering states and at most two leaving states. + *
+ */ +class NFA +{ + private: + NFA(const NFA& other) = default; + NFA& operator=(const NFA& other) = default; + + public: + //! represent a transition table for a specific state + using TransitionMap = std::map; + + //! defines a set of states within one NFA. the index represents the state Id. + using StateVec = std::vector; + + //! defines a mapping between accept state ID and another (prior) ID to track roll back the input stream + //! to. + using BacktrackingMap = std::map; + + NFA(NFA&&) = default; + NFA& operator=(NFA&&) = default; + + //! Constructs an empty NFA. + NFA(): states_ {}, initialState_ { 0 }, acceptState_ { 0 }, backtrackStates_ {}, acceptTags_ {} {} + + /** + * Constructs an NFA for a single character transition. + * + * *No* acceptState flag is set on the accepting node! + */ + explicit NFA(Symbol value): NFA {} + { + initialState_ = createState(); + acceptState_ = createState(); + addTransition(initialState_, value, acceptState_); + } + + explicit NFA(SymbolSet value): NFA {} + { + initialState_ = createState(); + acceptState_ = createState(); + for (Symbol s: value) + addTransition(initialState_, s, acceptState_); + } + + void addTransition(StateId from, Symbol s, StateId to) { states_[from][s].push_back(to); } + + [[nodiscard]] static NFA join(const std::map& mappings); + + /** + * Traverses all states and edges in this NFA and calls @p visitor for each state & edge. + * + * Use this function to e.g. get a GraphViz dot-file drawn. + */ + void visit(DotVisitor& visitor) const; + + //! Tests whether or not this is an empty NFA. + [[nodiscard]] bool empty() const noexcept { return states_.empty(); } + + //! Retrieves the number of states of this NFA. + [[nodiscard]] size_t size() const noexcept { return states_.size(); } + + //! Retrieves the one and only initial state. This value is nullptr iff the NFA is empty. + [[nodiscard]] StateId initialStateId() const noexcept { return initialState_; } + + //! Retrieves the one and only accept state. This value is nullptr iff the NFA is empty. + [[nodiscard]] StateId acceptStateId() const noexcept { return acceptState_; } + + //! Retrieves the list of states this FA contains. + [[nodiscard]] const StateVec& states() const { return states_; } + StateVec& states() { return states_; } + + //! Retrieves the alphabet of this finite automaton. + [[nodiscard]] Alphabet alphabet() const; + + //! Clones this NFA. + [[nodiscard]] NFA clone() const; + + /** + * Constructs an NFA where @p rhs is following but backtracking to @c acceptState(this) when + * when @p rhs is fully matched. + * + * This resembles the syntax r/s (or r(?=s) in Perl) where r is matched when also s is following. + */ + NFA& lookahead(NFA&& rhs); + + //! Reconstructs this FA to alternate between this FA and the @p other FA. + NFA& alternate(NFA&& other); + + //! Concatenates the right FA's initial state with this FA's accepting state. + NFA& concatenate(NFA&& rhs); + + //! Reconstructs this FA to allow optional input. X -> X? + NFA& optional(); + + //! Reconstructs this FA with the given @p quantifier factor. + NFA& times(unsigned quantifier); + + //! Reconstructs this FA to allow recurring input. X -> X* + NFA& recurring(); + + //! Reconstructs this FA to be recurring at least once. X+ = XX* + NFA& positive(); + + //! Reconstructs this FA to be repeatable between range [minimum, maximum]. + NFA& repeat(unsigned minimum, unsigned maximum); + + //! Retrieves transitions for state with the ID @p id. 
+ [[nodiscard]] TransitionMap const& stateTransitions(StateId id) const { return states_[id]; } + + //! Retrieves all states that can be reached from @p S with one single input Symbol @p c. + [[nodiscard]] StateIdVec delta(const StateIdVec& S, Symbol c) const; + StateIdVec* delta(const StateIdVec& S, Symbol c, StateIdVec* result) const; + + //! Retrieves all states that can be directly or indirectly accessed via epsilon-transitions exclusively. + [[nodiscard]] StateIdVec epsilonClosure(const StateIdVec& S) const; + void epsilonClosure(const StateIdVec& S, StateIdVec* result) const; + + [[nodiscard]] TransitionMap& stateTransitions(StateId s) { return states_[s]; } + + //! Flags given state as accepting-state with given Tag @p acceptTag. + void setAccept(Tag acceptTag) { acceptTags_[acceptState_] = acceptTag; } + + void setAccept(StateId state, Tag tag) { acceptTags_[state] = tag; } + + [[nodiscard]] std::optional acceptTag(StateId s) const + { + if (auto i = acceptTags_.find(s); i != acceptTags_.end()) + return i->second; + + return std::nullopt; + } + + [[nodiscard]] bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } + + /** + * Returns whether or not the StateSet @p Q contains at least one State that is also "accepting". + */ + [[nodiscard]] bool isAnyAccepting(const StateIdVec& Q) const + { + for (StateId q: Q) + if (isAccepting(q)) + return true; + + return false; + } + + [[nodiscard]] const AcceptMap& acceptMap() const noexcept { return acceptTags_; } + [[nodiscard]] AcceptMap& acceptMap() noexcept { return acceptTags_; } + + [[nodiscard]] std::optional backtrack(StateId s) const + { + if (auto i = backtrackStates_.find(s); i != backtrackStates_.end()) + return i->second; + + return std::nullopt; + } + + /** + * Checks if @p Q contains a state that is flagged as backtracking state in the NFA and returns + * the target state within the NFA or @c std::nullopt if not a backtracking state. + */ + [[nodiscard]] std::optional containsBacktrackState(const StateIdVec& Q) const + { + for (StateId q: Q) + if (std::optional t = backtrack(q); t.has_value()) + return *t; + + return std::nullopt; + } + + private: + [[nodiscard]] StateId createState(); + void visit(DotVisitor& v, StateId s, std::unordered_map& registry) const; + void prepareStateIds(StateId baseId); + + //! Retrieves all epsilon-transitions directly connected to State @p s. + [[nodiscard]] StateIdVec epsilonTransitions(StateId s) const; + + private: + StateVec states_; + StateId initialState_; + StateId acceptState_; + BacktrackingMap backtrackStates_; + AcceptMap acceptTags_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFABuilder.cpp b/src/regex_dfa/NFABuilder.cpp new file mode 100644 index 0000000000..0e07d6f342 --- /dev/null +++ b/src/regex_dfa/NFABuilder.cpp @@ -0,0 +1,124 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
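// The combinators declared in NFA.h above compose by mutating and splicing automata in
// place. For reference, a sketch that builds a(b|c)* the same way NFA_test.cpp below
// does (the header path is an assumption):
#include <regex_dfa/NFA.h>

#include <utility>

regex_dfa::NFA buildABCStar()
{
    using regex_dfa::NFA;

    NFA bOrC = std::move(NFA { 'b' }.alternate(NFA { 'c' })); // b|c
    bOrC.recurring();                                         // (b|c)*

    NFA result { 'a' };
    result.concatenate(std::move(bOrC));                      // a(b|c)*
    result.setAccept(/*acceptTag=*/ 1);                       // tag the single accept state
    return result;
}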
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +NFA NFABuilder::construct(const RegExpr& re, Tag tag) +{ + visit(*this, re); + + // fa_.setAccept(acceptState_.value_or(fa_.acceptStateId()), tag); + if (acceptState_) + fa_.setAccept(acceptState_.value(), tag); + else + fa_.setAccept(tag); + + return std::move(fa_); +} + +NFA NFABuilder::construct(const RegExpr& re) +{ + visit(*this, re); + return std::move(fa_); +} + +void NFABuilder::operator()(const LookAheadExpr& lookaheadExpr) +{ + // fa_ = move(construct(lookaheadExpr.leftExpr()).lookahead(construct(lookaheadExpr.rightExpr()))); + NFA lhs = construct(*lookaheadExpr.left); + NFA rhs = construct(*lookaheadExpr.right); + lhs.lookahead(std::move(rhs)); + fa_ = std::move(lhs); +} + +void NFABuilder::operator()(const AlternationExpr& alternationExpr) +{ + NFA lhs = construct(*alternationExpr.left); + NFA rhs = construct(*alternationExpr.right); + lhs.alternate(std::move(rhs)); + fa_ = std::move(lhs); +} + +void NFABuilder::operator()(const ConcatenationExpr& concatenationExpr) +{ + NFA lhs = construct(*concatenationExpr.left); + NFA rhs = construct(*concatenationExpr.right); + lhs.concatenate(std::move(rhs)); + fa_ = std::move(lhs); +} + +void NFABuilder::operator()(const CharacterExpr& characterExpr) +{ + fa_ = NFA { characterExpr.value }; +} + +void NFABuilder::operator()(const CharacterClassExpr& characterClassExpr) +{ + fa_ = NFA { characterClassExpr.symbols }; +} + +void NFABuilder::operator()(const ClosureExpr& closureExpr) +{ + const unsigned xmin = closureExpr.minimumOccurrences; + const unsigned xmax = closureExpr.maximumOccurrences; + constexpr unsigned Infinity = numeric_limits::max(); + + if (xmin == 0 && xmax == 1) + fa_ = std::move(construct(*closureExpr.subExpr).optional()); + else if (xmin == 0 && xmax == Infinity) + fa_ = std::move(construct(*closureExpr.subExpr).recurring()); + else if (xmin == 1 && xmax == Infinity) + fa_ = std::move(construct(*closureExpr.subExpr).positive()); + else if (xmin < xmax) + fa_ = std::move(construct(*closureExpr.subExpr).repeat(xmin, xmax)); + else if (xmin == xmax) + fa_ = std::move(construct(*closureExpr.subExpr).times(xmin)); + else + throw invalid_argument { "closureExpr" }; +} + +void NFABuilder::operator()(const BeginOfLineExpr&) +{ + fa_ = NFA { Symbols::Epsilon }; +} + +void NFABuilder::operator()(const EndOfLineExpr&) +{ + // NFA lhs; + // NFA rhs{'\n'}; + // lhs.lookahead(move(rhs)); + // fa_ = move(lhs); + fa_ = std::move(NFA {}.lookahead(NFA { '\n' })); +} + +void NFABuilder::operator()(const EndOfFileExpr&) +{ + fa_ = NFA { Symbols::EndOfFile }; +} + +void NFABuilder::operator()(const DotExpr&) +{ + // any character except LF + fa_ = NFA { '\t' }; + for (int ch = 32; ch < 127; ++ch) + { + fa_.addTransition(fa_.initialStateId(), ch, fa_.acceptStateId()); + } +} + +void NFABuilder::operator()(const EmptyExpr&) +{ + fa_ = NFA { Symbols::Epsilon }; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFABuilder.h b/src/regex_dfa/NFABuilder.h new file mode 100644 index 0000000000..4ec4892856 --- /dev/null +++ b/src/regex_dfa/NFABuilder.h @@ -0,0 +1,55 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
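NFABuilder is the bridge between the textual pattern and the automaton. A sketch of that pipeline follows; the function name, the include paths and the explicit Tag parameter are illustrative assumptions.

    #include <regex_dfa/NFABuilder.h>
    #include <regex_dfa/RegExpr.h>
    #include <regex_dfa/RegExprParser.h>

    #include <string_view>

    using namespace regex_dfa;

    // Parse a pattern into a RegExpr tree, then lower it into an NFA
    // whose accept state carries the given tag.
    NFA compile(std::string_view pattern, Tag tag)
    {
        RegExpr re = RegExprParser {}.parse(pattern);
        return NFABuilder {}.construct(re, tag);
    }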
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class DFA; + +/*! + * Generates a finite automaton from the given input (a regular expression). + */ +class NFABuilder +{ + public: + explicit NFABuilder(): fa_ {} {} + + [[nodiscard]] NFA construct(const RegExpr& re, Tag tag); + [[nodiscard]] NFA construct(const RegExpr& re); + void operator()(const LookAheadExpr& lookaheadExpr); + void operator()(const ConcatenationExpr& concatenationExpr); + void operator()(const AlternationExpr& alternationExpr); + void operator()(const CharacterExpr& characterExpr); + void operator()(const CharacterClassExpr& characterClassExpr); + void operator()(const ClosureExpr& closureExpr); + void operator()(const BeginOfLineExpr& bolExpr); + void operator()(const EndOfLineExpr& eolExpr); + void operator()(const EndOfFileExpr& eofExpr); + void operator()(const DotExpr& dotExpr); + void operator()(const EmptyExpr& emptyExpr); + + private: + NFA fa_; + std::optional acceptState_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFA_test.cpp b/src/regex_dfa/NFA_test.cpp new file mode 100644 index 0000000000..734e19581f --- /dev/null +++ b/src/regex_dfa/NFA_test.cpp @@ -0,0 +1,85 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include + +using namespace std; +using namespace regex_dfa; + +TEST_CASE("regex_NFA.emptyCtor") +{ + const NFA nfa; + REQUIRE(0 == nfa.size()); + REQUIRE(nfa.empty()); +} + +TEST_CASE("regex_NFA.characterCtor") +{ + const NFA nfa { 'a' }; + REQUIRE(2 == nfa.size()); + REQUIRE(0 == nfa.initialStateId()); + REQUIRE(1 == nfa.acceptStateId()); + REQUIRE(StateIdVec { 1 } == nfa.delta(StateIdVec { 0 }, 'a')); +} + +TEST_CASE("regex_NFA.concatenate") +{ + const NFA ab = std::move(NFA { 'a' }.concatenate(NFA { 'b' })); + REQUIRE(4 == ab.size()); + REQUIRE(0 == ab.initialStateId()); + REQUIRE(3 == ab.acceptStateId()); + + // TODO: check ab.initial == A.initial + // TODO: check A.accept == B.initial + // TODO: check ab.accept == B.accept +} + +TEST_CASE("regex_NFA.alternate") +{ + const NFA ab = std::move(NFA { 'a' }.alternate(NFA { 'b' })); + REQUIRE(6 == ab.size()); + REQUIRE(2 == ab.initialStateId()); + REQUIRE(3 == ab.acceptStateId()); + + // TODO: check acceptState transitions to A and B + // TODO: check A and B's outgoing edges to final acceptState +} + +TEST_CASE("regex_NFA.epsilonClosure") +{ + const NFA nfa { 'a' }; + REQUIRE(0 == nfa.initialStateId()); + REQUIRE(1 == nfa.acceptStateId()); + REQUIRE(StateIdVec { 0 } == nfa.epsilonClosure(StateIdVec { 0 })); + + const NFA abc = + std::move(NFA { 'a' }.concatenate(std::move(NFA { 'b' }.alternate(NFA { 'c' }).recurring()))); + REQUIRE(StateIdVec { 0 } == abc.epsilonClosure(StateIdVec { 0 })); + + const StateIdVec e1 { 1, 2, 4, 6, 8, 9 }; + REQUIRE(e1 == abc.epsilonClosure(StateIdVec { 1 })); +} + +TEST_CASE("regex_NFA.delta") +{ + const NFA nfa { 'a' }; + REQUIRE(0 == nfa.initialStateId()); + REQUIRE(1 == nfa.acceptStateId()); + REQUIRE(StateIdVec { 1 } == nfa.delta(StateIdVec { 0 }, 'a')); +} + +TEST_CASE("regex_NFA.alphabet") +{ + REQUIRE("{}" 
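The epsilonClosure()/delta() pair tested above is all that is needed to run an NFA directly. The sketch below is the classic subset simulation; it assumes Symbol is constructible from char, and the function name is illustrative.

    #include <regex_dfa/NFA.h>
    #include <string_view>

    using namespace regex_dfa;

    // Start from the epsilon-closure of the initial state, step with
    // delta() per input symbol, and accept if any reached state accepts.
    bool matches(const NFA& nfa, std::string_view input)
    {
        StateIdVec current = nfa.epsilonClosure(StateIdVec { nfa.initialStateId() });
        for (char ch: input)
        {
            current = nfa.epsilonClosure(nfa.delta(current, ch));
            if (current.empty())
                return false;
        }
        return nfa.isAnyAccepting(current);
    }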
== NFA {}.alphabet().to_string()); + REQUIRE("{a}" == NFA { 'a' }.alphabet().to_string()); + REQUIRE("{ab}" == NFA { 'a' }.concatenate(NFA { 'b' }).alphabet().to_string()); + REQUIRE("{abc}" == NFA { 'a' }.concatenate(NFA { 'b' }).alternate(NFA { 'c' }).alphabet().to_string()); +} diff --git a/src/regex_dfa/RegExpr.cpp b/src/regex_dfa/RegExpr.cpp new file mode 100644 index 0000000000..b7ba9c70af --- /dev/null +++ b/src/regex_dfa/RegExpr.cpp @@ -0,0 +1,117 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include + +#include +#include +#include + +using namespace std; + +/* + REGULAR EXPRESSION SYNTAX: + -------------------------- + + expr := alternation + alternation := concatenation ('|' concatenation)* + concatenation := closure (closure)* + closure := atom ['*' | '?' | '{' NUM [',' NUM] '}'] + atom := character | characterClass | '(' expr ')' + characterClass := '[' ['^'] characterClassFragment+ ']' + characterClassFragment := character | character '-' character +*/ + +namespace regex_dfa +{ + +auto embrace(const RegExpr& outer, const RegExpr& inner) +{ + if (precedence(outer) > precedence(inner)) + return "(" + to_string(inner) + ")"; + else + return to_string(inner); +} + +std::string to_string(const RegExpr& re) +{ + return visit( + overloaded { + [&](ClosureExpr const& e) { + stringstream sstr; + sstr << embrace(re, *e.subExpr); + if (e.minimumOccurrences == 0 && e.maximumOccurrences == 1) + sstr << '?'; + else if (e.minimumOccurrences == 0 && e.maximumOccurrences == numeric_limits::max()) + sstr << '*'; + else if (e.minimumOccurrences == 1 && e.maximumOccurrences == numeric_limits::max()) + sstr << '+'; + else + sstr << '{' << e.minimumOccurrences << ',' << e.maximumOccurrences << '}'; + return sstr.str(); + }, + [&](const AlternationExpr& e) { return embrace(re, *e.left) + "|" + embrace(re, *e.right); }, + [&](const ConcatenationExpr& e) { return embrace(re, *e.left) + embrace(re, *e.right); }, + [&](const LookAheadExpr& e) { return embrace(re, *e.left) + "/" + embrace(re, *e.right); }, + [](const CharacterExpr& e) { return string(1, e.value); }, + [](EndOfFileExpr) { return string { "<>" }; }, + [](BeginOfLineExpr) { return string { "^" }; }, + [](EndOfLineExpr) { return string { "$" }; }, + [](CharacterClassExpr const& e) { return e.symbols.to_string(); }, + [](DotExpr) { return string { "." 
}; }, + [](EmptyExpr) { return string {}; }, + }, + re); +} + +int precedence(const RegExpr& regex) +{ + return visit(overloaded { + [](const AlternationExpr&) { return 1; }, + [](const BeginOfLineExpr&) { return 4; }, + [](const CharacterClassExpr&) { return 4; }, + [](const CharacterExpr&) { return 4; }, + [](const ClosureExpr&) { return 3; }, + [](const ConcatenationExpr&) { return 2; }, + [](const DotExpr&) { return 4; }, + [](const EmptyExpr&) { return 4; }, + [](const EndOfFileExpr&) { return 4; }, + [](const EndOfLineExpr&) { return 4; }, + [](const LookAheadExpr&) { return 0; }, + }, + regex); +} + +bool containsBeginOfLine(const RegExpr& regex) +{ + return visit(overloaded { + [](const AlternationExpr& e) { + return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); + }, + [](const BeginOfLineExpr&) { return true; }, + [](const CharacterClassExpr&) { return false; }, + [](const CharacterExpr&) { return false; }, + [](const ClosureExpr& e) { return containsBeginOfLine(*e.subExpr); }, + [](const ConcatenationExpr& e) { + return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); + }, + [](const DotExpr&) { return false; }, + [](const EmptyExpr&) { return false; }, + [](const EndOfFileExpr&) { return false; }, + [](const EndOfLineExpr&) { return false; }, + [](const LookAheadExpr& e) { + return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); + }, + }, + regex); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExpr.h b/src/regex_dfa/RegExpr.h new file mode 100644 index 0000000000..02e892baa4 --- /dev/null +++ b/src/regex_dfa/RegExpr.h @@ -0,0 +1,93 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
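precedence() above drives embrace(): to_string() re-inserts parentheses only where a child binds weaker than its parent, so a parse/print round trip normalizes redundant grouping. A small sketch (the second expectation matches a parser test further below in this patch; the first output is derived from the code above):

    #include <regex_dfa/RegExpr.h>
    #include <regex_dfa/RegExprParser.h>
    #include <iostream>

    using namespace regex_dfa;

    int main()
    {
        // Redundant grouping is dropped ...
        std::cout << to_string(RegExprParser {}.parse("(a)|(b)c")) << "\n"; // a|bc
        // ... but grouping that changes precedence is kept.
        std::cout << to_string(RegExprParser {}.parse("(a|b)c")) << "\n";   // (a|b)c
    }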
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +struct AlternationExpr; +struct BeginOfLineExpr; +struct CharacterClassExpr; +struct CharacterExpr; +struct ClosureExpr; +struct ConcatenationExpr; +struct DotExpr; +struct EmptyExpr; +struct EndOfFileExpr; +struct EndOfLineExpr; +struct LookAheadExpr; + +using RegExpr = std::variant; + +struct LookAheadExpr +{ + std::unique_ptr left; + std::unique_ptr right; +}; + +struct AlternationExpr +{ + std::unique_ptr left; + std::unique_ptr right; +}; + +struct ConcatenationExpr +{ + std::unique_ptr left; + std::unique_ptr right; +}; + +struct ClosureExpr +{ + std::unique_ptr subExpr; + unsigned minimumOccurrences { 0 }; + unsigned maximumOccurrences { std::numeric_limits::max() }; +}; + +struct CharacterExpr +{ + Symbol value; +}; +struct CharacterClassExpr +{ + SymbolSet symbols; +}; + +// clang-format off +struct DotExpr {}; +struct BeginOfLineExpr {}; +struct EndOfLineExpr {}; +struct EndOfFileExpr {}; +struct EmptyExpr {}; +// clang-format on + +[[nodiscard]] std::string to_string(const RegExpr& regex); +[[nodiscard]] int precedence(const RegExpr& regex); +[[nodiscard]] bool containsBeginOfLine(const RegExpr& regex); + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExprParser.cpp b/src/regex_dfa/RegExprParser.cpp new file mode 100644 index 0000000000..6c66dcfb0e --- /dev/null +++ b/src/regex_dfa/RegExprParser.cpp @@ -0,0 +1,483 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include + +#include +#include +#include +#include + +using namespace std; + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +/* + REGULAR EXPRESSION SYNTAX: + -------------------------- + + expr := alternation + alternation := concatenation ('|' concatenation)* + concatenation := closure (closure)* + closure := atom ['*' | '?' 
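The variant and node structs above can also be assembled by hand. The sketch builds the tree for a|b directly; it assumes the unique_ptr members (whose element type is elided in the listing as shown) hold RegExpr nodes, which is how RegExpr.cpp dereferences them.

    #include <regex_dfa/RegExpr.h>
    #include <iostream>
    #include <memory>

    using namespace regex_dfa;

    int main()
    {
        // AlternationExpr owns its operands, so the leaves are wrapped
        // into heap-allocated RegExpr nodes.
        RegExpr re = AlternationExpr { std::make_unique<RegExpr>(CharacterExpr { 'a' }),
                                       std::make_unique<RegExpr>(CharacterExpr { 'b' }) };
        std::cout << to_string(re) << "\n"; // a|b
    }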
| '{' NUM [',' NUM] '}'] + atom := character + | '^' + | '$' + | '<>' + | '"' LITERAL '"' + | characterClass + | '(' expr ')' + | EPSILON + characterClass := '[' ['^'] characterClassFragment+ ']' + characterClassFragment := character | character '-' character +*/ + +namespace regex_dfa +{ + +RegExprParser::RegExprParser(): input_ {}, currentChar_ { input_.end() }, line_ { 1 }, column_ { 0 } +{ +} + +int RegExprParser::currentChar() const +{ + if (currentChar_ != input_.end()) + return *currentChar_; + else + return std::char_traits::eof(); +} + +bool RegExprParser::consumeIf(int ch) +{ + if (currentChar() != ch) + return false; + + consume(); + return true; +} + +int RegExprParser::consume() +{ + if (currentChar_ == input_.end()) + return std::char_traits::eof(); + + int ch = *currentChar_; + if (ch == '\n') + { + line_++; + column_ = 1; + } + else + { + column_++; + } + ++currentChar_; + DEBUG("consume: '{}'", (char) ch); + return ch; +} + +void RegExprParser::consume(int expected) +{ + int actual = currentChar(); + consume(); + if (actual != expected) + { + throw UnexpectedToken { line_, column_, actual, expected }; + } +} + +RegExpr RegExprParser::parse(string_view expr, unsigned line, unsigned column) +{ + input_ = expr; + currentChar_ = input_.begin(); + line_ = line; + column_ = column; + + return parseExpr(); +} + +RegExpr RegExprParser::parseExpr() +{ + return parseLookAheadExpr(); +} + +RegExpr RegExprParser::parseLookAheadExpr() +{ + RegExpr lhs = parseAlternation(); + + if (currentChar() == '/') + { + consume(); + RegExpr rhs = parseAlternation(); + lhs = LookAheadExpr { make_unique(std::move(lhs)), make_unique(std::move(rhs)) }; + } + + return lhs; +} + +RegExpr RegExprParser::parseAlternation() +{ + RegExpr lhs = parseConcatenation(); + + while (currentChar() == '|') + { + consume(); + RegExpr rhs = parseConcatenation(); + lhs = AlternationExpr { make_unique(std::move(lhs)), make_unique(std::move(rhs)) }; + } + + return lhs; +} + +RegExpr RegExprParser::parseConcatenation() +{ + // FOLLOW-set, the set of terminal tokens that can occur right after a concatenation + static const string_view follow = "/|)"; + RegExpr lhs = parseClosure(); + + while (!eof() && follow.find(currentChar()) == std::string_view::npos) + { + RegExpr rhs = parseClosure(); + lhs = + ConcatenationExpr { make_unique(std::move(lhs)), make_unique(std::move(rhs)) }; + } + + return lhs; +} + +RegExpr RegExprParser::parseClosure() +{ + RegExpr subExpr = parseAtom(); + + switch (currentChar()) + { + case '?': consume(); return ClosureExpr { make_unique(std::move(subExpr)), 0, 1 }; + case '*': consume(); return ClosureExpr { make_unique(std::move(subExpr)), 0 }; + case '+': consume(); return ClosureExpr { make_unique(std::move(subExpr)), 1 }; + case '{': { + consume(); + unsigned int m = parseInt(); + if (currentChar() == ',') + { + consume(); + unsigned int n = parseInt(); + consume('}'); + return ClosureExpr { make_unique(std::move(subExpr)), m, n }; + } + else + { + consume('}'); + return ClosureExpr { make_unique(std::move(subExpr)), m, m }; + } + } + default: return subExpr; + } +} + +unsigned RegExprParser::parseInt() +{ + unsigned n = 0; + while (isdigit(currentChar())) + { + n *= 10; + n += currentChar() - '0'; + consume(); + } + return n; +} + +RegExpr RegExprParser::parseAtom() +{ + // skip any whitespace (except newlines) + while (!eof() && isspace(currentChar()) && currentChar() != '\n') + consume(); + + switch (currentChar()) + { + case std::char_traits::eof(): // EOF + case ')': return EmptyExpr {}; 
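parseClosure() above maps '?', '*', '+' and '{m,n}' onto the bounds of a ClosureExpr. A sketch reading those bounds back, mirroring the a{2,4} test case later in this patch:

    #include <regex_dfa/RegExpr.h>
    #include <regex_dfa/RegExprParser.h>
    #include <iostream>
    #include <variant>

    using namespace regex_dfa;

    int main()
    {
        RegExpr re = RegExprParser {}.parse("a{2,4}");
        const auto& closure = std::get<ClosureExpr>(re);
        std::cout << closure.minimumOccurrences << ".."
                  << closure.maximumOccurrences << "\n"; // 2..4
    }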
+ case '<': + consume(); + consume('<'); + consume('E'); + consume('O'); + consume('F'); + consume('>'); + consume('>'); + return EndOfFileExpr {}; + case '(': { + consume(); + RegExpr subExpr = parseExpr(); + consume(')'); + return subExpr; + } + case '"': { + consume(); + RegExpr lhs = CharacterExpr { consume() }; + while (!eof() && currentChar() != '"') + { + RegExpr rhs = CharacterExpr { consume() }; + lhs = ConcatenationExpr { make_unique(std::move(lhs)), + make_unique(std::move(rhs)) }; + } + consume('"'); + return lhs; + } + case '[': return parseCharacterClass(); + case '.': consume(); return DotExpr {}; + case '^': consume(); return BeginOfLineExpr {}; + case '$': consume(); return EndOfLineExpr {}; + default: return CharacterExpr { parseSingleCharacter() }; + } +} + +RegExpr RegExprParser::parseCharacterClass() +{ + consume(); // '[' + const bool complement = consumeIf('^'); // TODO + + SymbolSet ss; + parseCharacterClassFragment(ss); + while (!eof() && currentChar() != ']') + parseCharacterClassFragment(ss); + + if (complement) + ss.complement(); + + consume(']'); + return CharacterClassExpr { std::move(ss) }; +} + +void RegExprParser::parseNamedCharacterClass(SymbolSet& ss) +{ + consume('['); + consume(':'); + string token; + while (isalpha(currentChar())) + { + token += static_cast(consume()); + } + consume(':'); + consume(']'); + + static const unordered_map> names = { + { "alnum", + [](SymbolSet& ss) { + for (Symbol c = 'a'; c <= 'z'; c++) + ss.insert(c); + for (Symbol c = 'A'; c <= 'Z'; c++) + ss.insert(c); + for (Symbol c = '0'; c <= '9'; c++) + ss.insert(c); + } }, + { "alpha", + [](SymbolSet& ss) { + for (Symbol c = 'a'; c <= 'z'; c++) + ss.insert(c); + for (Symbol c = 'A'; c <= 'Z'; c++) + ss.insert(c); + } }, + { "blank", + [](SymbolSet& ss) { + ss.insert(' '); + ss.insert('\t'); + } }, + { "cntrl", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (iscntrl(c)) + ss.insert(c); + } }, + { "digit", + [](SymbolSet& ss) { + for (Symbol c = '0'; c <= '9'; c++) + ss.insert(c); + } }, + { "graph", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (isgraph(c)) + ss.insert(c); + } }, + { "lower", + [](SymbolSet& ss) { + for (Symbol c = 'a'; c <= 'z'; c++) + ss.insert(c); + } }, + { "print", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (isprint(c) || c == ' ') + ss.insert(c); + } }, + { "punct", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (ispunct(c)) + ss.insert(c); + } }, + { "space", + [](SymbolSet& ss) { + for (Symbol c: "\f\n\r\t\v") + ss.insert(c); + } }, + { "upper", + [](SymbolSet& ss) { + for (Symbol c = 'A'; c <= 'Z'; c++) + ss.insert(c); + } }, + { "xdigit", + [](SymbolSet& ss) { + for (Symbol c = '0'; c <= '9'; c++) + ss.insert(c); + for (Symbol c = 'a'; c <= 'f'; c++) + ss.insert(c); + for (Symbol c = 'A'; c <= 'F'; c++) + ss.insert(c); + } }, + }; + + if (auto i = names.find(token); i != names.end()) + i->second(ss); + else + throw UnexpectedToken { line_, column_, token, "" }; +} + +Symbol RegExprParser::parseSingleCharacter() +{ + if (currentChar() != '\\') + return consume(); + + consume(); // consumes escape character + switch (currentChar()) + { + case 'a': consume(); return '\a'; + case 'b': consume(); return '\b'; + case 'f': consume(); return '\f'; + case 'n': consume(); return '\n'; + case 'r': consume(); return '\r'; + case 's': consume(); return ' '; + case 't': consume(); return '\t'; + case 'v': consume(); return '\v'; + case 'x': { + consume(); + + char buf[3]; + buf[0] = consume(); + if 
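parseNamedCharacterClass() above expands POSIX-style [:name:] groups into the SymbolSet of a CharacterClassExpr, and plain ranges can be mixed in. A sketch; the printed form is whatever SymbolSet::to_string() produces, which the tests below suggest is range notation such as 0-9a-f.

    #include <regex_dfa/RegExpr.h>
    #include <regex_dfa/RegExprParser.h>
    #include <iostream>
    #include <variant>

    using namespace regex_dfa;

    int main()
    {
        RegExpr re = RegExprParser {}.parse("[[:digit:]a-f]");
        std::cout << std::get<CharacterClassExpr>(re).symbols.to_string() << "\n"; // e.g. 0-9a-f
    }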
(!isxdigit(buf[0])) + throw UnexpectedToken { line_, column_, string(1, buf[0]), "[0-9a-fA-F]" }; + buf[1] = consume(); + if (!isxdigit(buf[1])) + throw UnexpectedToken { line_, column_, string(1, buf[1]), "[0-9a-fA-F]" }; + buf[2] = 0; + + return static_cast(strtoul(buf, nullptr, 16)); + } + case '0': { + const Symbol x0 = consume(); + if (!isdigit(currentChar())) + return '\0'; + + // octal value (\DDD) + char buf[4]; + buf[0] = x0; + buf[1] = consume(); + if (!(buf[1] >= '0' && buf[1] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[1]), "[0-7]" }; + buf[2] = consume(); + if (!(buf[2] >= '0' && buf[2] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[2]), "[0-7]" }; + buf[3] = '\0'; + + return static_cast(strtoul(buf, nullptr, 8)); + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { + // octal value (\DDD) + char buf[4]; + buf[0] = consume(); + buf[1] = consume(); + if (!(buf[1] >= '0' && buf[1] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[1]), "[0-7]" }; + buf[2] = consume(); + if (!(buf[2] >= '0' && buf[2] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[2]), "[0-7]" }; + buf[3] = '\0'; + + return static_cast(strtoul(buf, nullptr, 8)); + } + case '"': + case '$': + case '(': + case ')': + case '*': + case '+': + case ':': + case '?': + case '[': + case '\'': + case '\\': + case ']': + case '^': + case '{': + case '}': + case '.': + case '/': return consume(); + default: { + throw UnexpectedToken { line_, + column_, + fmt::format("'{}'", static_cast(currentChar())), + "" }; + } + } +} + +void RegExprParser::parseCharacterClassFragment(SymbolSet& ss) +{ + // parse [:named:] + if (currentChar() == '[') + { + parseNamedCharacterClass(ss); + return; + } + + // parse single char (A) or range (A-Z) + const Symbol c1 = parseSingleCharacter(); + if (currentChar() != '-') + { + ss.insert(c1); + return; + } + + consume(); // consume '-' + const Symbol c2 = parseSingleCharacter(); + + for (Symbol c_i = c1; c_i <= c2; c_i++) + ss.insert(c_i); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExprParser.h b/src/regex_dfa/RegExprParser.h new file mode 100644 index 0000000000..8484087af8 --- /dev/null +++ b/src/regex_dfa/RegExprParser.h @@ -0,0 +1,96 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include + +#include + +#include +#include + +namespace regex_dfa +{ + +class SymbolSet; + +class RegExprParser +{ + public: + RegExprParser(); + + [[nodiscard]] RegExpr parse(std::string_view expr, unsigned line, unsigned column); + + [[nodiscard]] RegExpr parse(std::string_view expr) { return parse(expr, 1, 1); } + + class UnexpectedToken: public std::runtime_error + { + public: + UnexpectedToken(unsigned int line, unsigned int column, std::string actual, std::string expected): + std::runtime_error { fmt::format( + "[{}:{}] Unexpected token {}. 
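parseSingleCharacter() above decodes \xNN as a two-digit hexadecimal escape and \NNN as a three-digit octal escape. In the sketch below both spellings should denote 'A' (0x41 == 0101 octal); the expected output is derived from the code above, not asserted by this patch.

    #include <regex_dfa/RegExpr.h>
    #include <regex_dfa/RegExprParser.h>
    #include <iostream>
    #include <variant>

    using namespace regex_dfa;

    int main()
    {
        RegExpr re = RegExprParser {}.parse(R"([\x41\101])");
        std::cout << std::get<CharacterClassExpr>(re).symbols.to_string() << "\n"; // A
    }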
Expected {} instead.", line, column, actual, expected) }, + line_ { line }, + column_ { column }, + actual_ { std::move(actual) }, + expected_ { std::move(expected) } + { + } + + UnexpectedToken(unsigned int line, unsigned int column, int actual, int expected): + UnexpectedToken { line, + column, + std::char_traits::eq(actual, std::char_traits::eof()) + ? "EOF" + : fmt::format("{}", static_cast(actual)), + std::string(1, static_cast(expected)) } + { + } + + [[nodiscard]] unsigned int line() const noexcept { return line_; } + [[nodiscard]] unsigned int column() const noexcept { return column_; } + [[nodiscard]] const std::string& actual() const noexcept { return actual_; } + [[nodiscard]] const std::string& expected() const noexcept { return expected_; } + + private: + unsigned int line_; + unsigned int column_; + std::string actual_; + std::string expected_; + }; + + private: + [[nodiscard]] int currentChar() const; + [[nodiscard]] bool eof() const noexcept + { + return std::char_traits::eq(currentChar(), std::char_traits::eof()); + } + [[nodiscard]] bool consumeIf(int ch); + void consume(int ch); + int consume(); + [[nodiscard]] unsigned parseInt(); + + [[nodiscard]] RegExpr parse(); // expr + [[nodiscard]] RegExpr parseExpr(); // lookahead + [[nodiscard]] RegExpr parseLookAheadExpr(); // alternation ('/' alternation)? + [[nodiscard]] RegExpr parseAlternation(); // concatenation ('|' concatenation)* + [[nodiscard]] RegExpr parseConcatenation(); // closure (closure)* + [[nodiscard]] RegExpr parseClosure(); // atom ['*' | '?' | '{' NUM [',' NUM] '}'] + [[nodiscard]] RegExpr parseAtom(); // character | characterClass | '(' expr ')' + [[nodiscard]] RegExpr parseCharacterClass(); // '[' characterClassFragment+ ']' + void parseCharacterClassFragment(SymbolSet& ss); // namedClass | character | character '-' character + void parseNamedCharacterClass(SymbolSet& ss); // '[' ':' NAME ':' ']' + [[nodiscard]] Symbol parseSingleCharacter(); + + private: + std::string_view input_; + std::string_view::iterator currentChar_; + unsigned int line_; + unsigned int column_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExprParser_test.cpp b/src/regex_dfa/RegExprParser_test.cpp new file mode 100644 index 0000000000..e668143206 --- /dev/null +++ b/src/regex_dfa/RegExprParser_test.cpp @@ -0,0 +1,309 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
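UnexpectedToken above carries the 1-based line and column of the offending character, so callers can produce precise diagnostics. A sketch using an unterminated quoted literal, the same input class the tests below exercise:

    #include <regex_dfa/RegExprParser.h>
    #include <iostream>

    using namespace regex_dfa;

    int main()
    {
        try
        {
            [[maybe_unused]] auto re = RegExprParser {}.parse("\"abc"); // unterminated literal
        }
        catch (const RegExprParser::UnexpectedToken& e)
        {
            std::cerr << e.what() << " (at " << e.line() << ':' << e.column() << ")\n";
        }
    }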
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include + +#include + +using namespace std; +using namespace regex_dfa; + +namespace +{ + +RegExpr parseRegExpr(string const& s) +{ + return RegExprParser {}.parse(s); +} + +} // namespace + +TEST_CASE("regex_RegExprParser.namedCharacterClass_graph") +{ + RegExpr re = parseRegExpr("[[:graph:]]"); + REQUIRE(holds_alternative(re)); + CHECK("!-~" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.whitespaces_concatination") +{ + RegExpr re = parseRegExpr("a b"); + REQUIRE(holds_alternative(re)); + CHECK("ab" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.whitespaces_alternation") +{ + RegExpr re = parseRegExpr("a | b"); + REQUIRE(holds_alternative(re)); + CHECK("a|b" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_digit") +{ + RegExpr re = parseRegExpr("[[:digit:]]"); + REQUIRE(holds_alternative(re)); + CHECK("0-9" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_alnum") +{ + RegExpr re = parseRegExpr("[[:alnum:]]"); + REQUIRE(holds_alternative(re)); + CHECK("0-9A-Za-z" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_alpha") +{ + RegExpr re = parseRegExpr("[[:alpha:]]"); + REQUIRE(holds_alternative(re)); + CHECK("A-Za-z" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_blank") +{ + RegExpr re = parseRegExpr("[[:blank:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\t\\s" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_cntrl") +{ + RegExpr re = parseRegExpr("[[:cntrl:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\0-\\x1f\\x7f" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_print") +{ + RegExpr re = parseRegExpr("[[:print:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\s-~" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_punct") +{ + RegExpr re = parseRegExpr("[[:punct:]]"); + REQUIRE(holds_alternative(re)); + CHECK("!-/:-@[-`{-~" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_space") +{ + RegExpr re = parseRegExpr("[[:space:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\0\\t-\\r" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_unknown") +{ + CHECK_THROWS_AS(parseRegExpr("[[:unknown:]]"), RegExprParser::UnexpectedToken); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_upper") +{ + RegExpr re = parseRegExpr("[[:upper:]]"); + REQUIRE(holds_alternative(re)); + CHECK("A-Z" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.namedCharacterClass_mixed") +{ + RegExpr re = parseRegExpr("[[:lower:]0-9]"); + REQUIRE(holds_alternative(re)); + CHECK("0-9a-z" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.characterClass_complement") +{ + RegExpr re = parseRegExpr("[^\\n]"); + REQUIRE(holds_alternative(re)); + CHECK(get(re).symbols.isDot()); + CHECK("." 
== get(re).symbols.to_string()); +} + +TEST_CASE("regex_RegExprParser.escapeSequences_invalid") +{ + CHECK_THROWS_AS(parseRegExpr("[\\z]"), RegExprParser::UnexpectedToken); +} + +TEST_CASE("regex_RegExprParser.escapeSequences_abfnrstv") +{ + CHECK("\\a" == to_string(parseRegExpr("[\\a]"))); + CHECK("\\b" == to_string(parseRegExpr("[\\b]"))); + CHECK("\\f" == to_string(parseRegExpr("[\\f]"))); + CHECK("\\n" == to_string(parseRegExpr("[\\n]"))); + CHECK("\\r" == to_string(parseRegExpr("[\\r]"))); + CHECK("\\s" == to_string(parseRegExpr("[\\s]"))); + CHECK("\\t" == to_string(parseRegExpr("[\\t]"))); + CHECK("\\v" == to_string(parseRegExpr("[\\v]"))); +} + +TEST_CASE("regex_RegExprParser.newline") +{ + RegExpr re = parseRegExpr("\n"); + REQUIRE(holds_alternative(re)); + CHECK('\n' == get(re).value); +} + +TEST_CASE("regex_RegExprParser.escapeSequences_hex") +{ + RegExpr re = parseRegExpr("[\\x20]"); + REQUIRE(holds_alternative(re)); + CHECK("\\s" == get(re).symbols.to_string()); + + CHECK_THROWS_AS(parseRegExpr("[\\xZZ]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\xAZ]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\xZA]"), RegExprParser::UnexpectedToken); +} + +TEST_CASE("regex_RegExprParser.escapeSequences_nul") +{ + RegExpr re = parseRegExpr("[\\0]"); + REQUIRE(holds_alternative(re)); + CHECK("\\0" == get(re).symbols.to_string()); +} + +TEST_CASE("regex_RegExprParser.escapeSequences_octal") +{ + // with leading zero + RegExpr re = parseRegExpr("[\\040]"); + REQUIRE(holds_alternative(re)); + CHECK("\\s" == get(re).symbols.to_string()); + + // with leading non-zero + re = parseRegExpr("[\\172]"); + REQUIRE(holds_alternative(re)); + CHECK("z" == get(re).symbols.to_string()); + + // invalids + CHECK_THROWS_AS(parseRegExpr("[\\822]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\282]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\228]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\082]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\028]"), RegExprParser::UnexpectedToken); +} + +TEST_CASE("regex_RegExprParser.doubleQuote") +{ + // as concatenation character + RegExpr re = parseRegExpr(R"(\")"); + REQUIRE(holds_alternative(re)); + CHECK('"' == get(re).value); + + // as character class + re = parseRegExpr(R"([\"])"); + REQUIRE(holds_alternative(re)); + CHECK(R"(")" == get(re).symbols.to_string()); +} + +TEST_CASE("regex_RegExprParser.dot") +{ + RegExpr re = parseRegExpr("."); + REQUIRE(holds_alternative(re)); + CHECK("." == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.optional") +{ + RegExpr re = parseRegExpr("a?"); + REQUIRE(holds_alternative(re)); + CHECK("a?" 
== to_string(re)); +} + +TEST_CASE("regex_RegExprParser.bol") +{ + RegExpr re = parseRegExpr("^a"); + REQUIRE(holds_alternative(re)); + const ConcatenationExpr& cat = get(re); + + REQUIRE(holds_alternative(*cat.left)); + CHECK("^" == to_string(*cat.left)); + CHECK("a" == to_string(*cat.right)); +} + +TEST_CASE("regex_RegExprParser.eol") +{ + RegExpr re = parseRegExpr("a$"); + REQUIRE(holds_alternative(re)); + const ConcatenationExpr& cat = get(re); + + REQUIRE(holds_alternative(*cat.right)); + CHECK("a$" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.eof") +{ + RegExpr re = parseRegExpr("<>"); + REQUIRE(holds_alternative(re)); + CHECK("<>" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.alternation") +{ + CHECK("a|b" == to_string(parseRegExpr("a|b"))); + CHECK("(a|b)c" == to_string(parseRegExpr("(a|b)c"))); + CHECK("a(b|c)" == to_string(parseRegExpr("a(b|c)"))); +} + +TEST_CASE("regex_RegExprParser.lookahead") +{ + RegExpr re = parseRegExpr("ab/cd"); + REQUIRE(holds_alternative(re)); + CHECK("ab/cd" == to_string(re)); + CHECK("(a/b)|b" == to_string(parseRegExpr("(a/b)|b"))); + CHECK("a|(b/c)" == to_string(parseRegExpr("a|(b/c)"))); +} + +TEST_CASE("regex_RegExprParser.closure") +{ + RegExpr re = parseRegExpr("(abc)*"); + REQUIRE(holds_alternative(re)); + const ClosureExpr& e = get(re); + CHECK(0 == e.minimumOccurrences); + CHECK(numeric_limits::max() == e.maximumOccurrences); + CHECK("(abc)*" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.positive") +{ + auto re = parseRegExpr("(abc)+"); + REQUIRE(holds_alternative(re)); + const ClosureExpr& e = get(re); + CHECK(1 == e.minimumOccurrences); + CHECK(numeric_limits::max() == e.maximumOccurrences); + CHECK("(abc)+" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.closure_range") +{ + auto re = parseRegExpr("a{2,4}"); + REQUIRE(holds_alternative(re)); + const ClosureExpr& e = get(re); + CHECK(2 == e.minimumOccurrences); + CHECK(4 == e.maximumOccurrences); + CHECK("a{2,4}" == to_string(re)); +} + +TEST_CASE("regex_RegExprParser.empty") +{ + auto re = parseRegExpr("(a|)"); + CHECK("a|" == to_string(re)); // grouping '(' & ')' is not preserved as node in the parse tree. +} + +TEST_CASE("regex_RegExprParser.UnexpectedToken_grouping") +{ + CHECK_THROWS_AS(parseRegExpr("(a"), RegExprParser::UnexpectedToken); +} + +TEST_CASE("regex_RegExprParser.UnexpectedToken_literal") +{ + CHECK_THROWS_AS(parseRegExpr("\"a"), RegExprParser::UnexpectedToken); +} diff --git a/src/regex_dfa/Report.cpp b/src/regex_dfa/Report.cpp new file mode 100644 index 0000000000..9f2b9b51cd --- /dev/null +++ b/src/regex_dfa/Report.cpp @@ -0,0 +1,109 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include +#include + +using namespace std; +using namespace regex_dfa; + +// {{{ Message +string Report::Message::to_string() const +{ + switch (type) + { + case Type::Warning: return fmt::format("[{}] {}", sourceLocation, text); + case Type::LinkError: return fmt::format("{}: {}", type, text); + default: return fmt::format("[{}] {}: {}", sourceLocation, type, text); + } +} + +bool Report::Message::operator==(const Message& other) const noexcept +{ + // XXX ignore SourceLocation's filename & end + return type == other.type && sourceLocation.offset == other.sourceLocation.offset && text == other.text; +} +// }}} +// {{{ ConsoleReport +void ConsoleReport::onMessage(Message&& message) +{ + switch (message.type) + { + case Type::Warning: cerr << fmt::format("Warning: {}\n", message); break; + default: cerr << fmt::format("Error: {}\n", message); break; + } +} +// }}} +// {{{ BufferedReport +void BufferedReport::onMessage(Message&& msg) +{ + messages_.emplace_back(std::move(msg)); +} + +void BufferedReport::clear() +{ + messages_.clear(); +} + +string BufferedReport::to_string() const +{ + stringstream sstr; + for (const Message& message: messages_) + { + switch (message.type) + { + case Type::Warning: sstr << "Warning: " << message.to_string() << "\n"; break; + default: sstr << "Error: " << message.to_string() << "\n"; break; + } + } + return sstr.str(); +} + +bool BufferedReport::operator==(const BufferedReport& other) const noexcept +{ + if (size() != other.size()) + return false; + + for (size_t i = 0, e = size(); i != e; ++i) + if (messages_[i] != other.messages_[i]) + return false; + + return true; +} + +bool BufferedReport::contains(const Message& message) const noexcept +{ + for (const Message& m: messages_) + if (m == message) + return true; + + return false; +} + +DifferenceReport difference(const BufferedReport& first, const BufferedReport& second) +{ + DifferenceReport diff; + + for (const Report::Message& m: first) + if (!second.contains(m)) + diff.first.push_back(m); + + for (const Report::Message& m: second) + if (!first.contains(m)) + diff.second.push_back(m); + + return diff; +} + +ostream& operator<<(ostream& os, const BufferedReport& report) +{ + os << report.to_string(); + return os; +} +// }}} diff --git a/src/regex_dfa/Report.h b/src/regex_dfa/Report.h new file mode 100644 index 0000000000..0fc9bc71e7 --- /dev/null +++ b/src/regex_dfa/Report.h @@ -0,0 +1,223 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class Report +{ + public: + enum class Type + { + TokenError, + SyntaxError, + TypeError, + Warning, + LinkError + }; + + struct Message + { + Type type; + SourceLocation sourceLocation; + std::string text; + + Message(Type type, SourceLocation sloc, std::string text): + type { type }, sourceLocation { std::move(sloc) }, text { std::move(text) } + { + } + + [[nodiscard]] std::string to_string() const; + + bool operator==(const Message& other) const noexcept; + bool operator!=(const Message& other) const noexcept { return !(*this == other); } + }; + + using MessageList = std::vector; + using Reporter = std::function; + + explicit Report(Reporter reporter): onReport_ { std::move(reporter) } {} + + template + void tokenError(const SourceLocation& sloc, const std::string& f, Args&&... args) + { + report(Type::TokenError, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void syntaxError(const SourceLocation& sloc, const std::string& f, Args&&... args) + { + report(Type::SyntaxError, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void typeError(const SourceLocation& sloc, const std::string& f, Args&&... args) + { + report(Type::TypeError, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void warning(const SourceLocation& sloc, const std::string& f, Args&&... args) + { + report(Type::Warning, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void linkError(const std::string& f, Args&&... args) + { + report(Type::LinkError, SourceLocation {}, fmt::format(f, std::forward(args)...)); + } + + void report(Type type, SourceLocation sloc, std::string text) + { + if (type != Type::Warning) + errorCount_++; + + if (onReport_) + { + onReport_(Message(type, std::move(sloc), std::move(text))); + } + } + + [[nodiscard]] bool containsFailures() const noexcept { return errorCount_ != 0; } + + private: + size_t errorCount_ = 0; + Reporter onReport_; +}; + +class ConsoleReport: public Report +{ + public: + ConsoleReport(): Report(std::bind(&ConsoleReport::onMessage, this, std::placeholders::_1)) {} + + private: + void onMessage(Message&& msg); +}; + +class BufferedReport: public Report +{ + public: + BufferedReport(): Report(std::bind(&BufferedReport::onMessage, this, std::placeholders::_1)), messages_ {} + { + } + + [[nodiscard]] std::string to_string() const; + + [[nodiscard]] const MessageList& messages() const noexcept { return messages_; } + + void clear(); + [[nodiscard]] size_t size() const noexcept { return messages_.size(); } + [[nodiscard]] const Message& operator[](size_t i) const { return messages_[i]; } + + using iterator = MessageList::iterator; + using const_iterator = MessageList::const_iterator; + + [[nodiscard]] iterator begin() noexcept { return messages_.begin(); } + [[nodiscard]] iterator end() noexcept { return messages_.end(); } + [[nodiscard]] const_iterator begin() const noexcept { return messages_.begin(); } + [[nodiscard]] const_iterator end() const noexcept { return messages_.end(); } + + [[nodiscard]] bool contains(const Message& m) const noexcept; + + [[nodiscard]] bool operator==(const BufferedReport& other) const noexcept; + [[nodiscard]] bool operator!=(const BufferedReport& other) const noexcept { return !(*this == other); } + + private: + void onMessage(Message&& msg); + + private: + MessageList messages_; +}; + 
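BufferedReport above collects diagnostics instead of printing them, which is how test code can assert on emitted messages. A brief sketch; the default-constructed SourceLocation mirrors what linkError() passes, and the message text is an arbitrary example.

    #include <regex_dfa/Report.h>
    #include <iostream>

    using namespace regex_dfa;

    int main()
    {
        BufferedReport report;
        report.report(Report::Type::SyntaxError, SourceLocation {}, "unexpected character '%'");
        if (report.containsFailures())
            std::cout << report.to_string(); // e.g. "Error: [...] SyntaxError: unexpected character '%'"
    }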
+std::ostream& operator<<(std::ostream& os, const BufferedReport& report); + +using DifferenceReport = std::pair; + +DifferenceReport difference(const BufferedReport& first, const BufferedReport& second); + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter: formatter +{ + using Type = regex_dfa::Report::Type; + + static std::string_view to_stringview(Type t) + { + switch (t) + { + case Type::TokenError: return "TokenError"; + case Type::SyntaxError: return "SyntaxError"; + case Type::TypeError: return "TypeError"; + case Type::Warning: return "Warning"; + case Type::LinkError: return "LinkError"; + default: return "???"; + } + } + + template + constexpr auto format(Type v, FormatContext& ctx) + { + return formatter::format(to_stringview(v), ctx); + } +}; +} // namespace fmt + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::SourceLocation& sloc, FormatContext& ctx) + { + return fmt::format_to(ctx.out(), "{} ({}-{})", sloc.filename, sloc.offset, sloc.offset + sloc.count); + } +}; +} // namespace fmt + +namespace fmt +{ +template <> +struct formatter +{ + using Message = regex_dfa::Report::Message; + + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const Message& v, FormatContext& ctx) + { + return fmt::format_to(ctx.out(), "{}", v.to_string()); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/Rule.h b/src/regex_dfa/Rule.h new file mode 100644 index 0000000000..0c97764494 --- /dev/null +++ b/src/regex_dfa/Rule.h @@ -0,0 +1,137 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include // IgnoreTag +#include +#include +#include // Tag + +#include +#include +#include +#include + +namespace regex_dfa +{ + +struct Rule +{ + unsigned int line; + unsigned int column; + Tag tag; + std::vector conditions; + std::string name; + std::string pattern; + std::unique_ptr regexpr = nullptr; + + [[nodiscard]] bool isIgnored() const noexcept { return tag == IgnoreTag; } + + [[nodiscard]] Rule clone() const + { + return regexpr ? Rule { line, + column, + tag, + conditions, + name, + pattern, + std::make_unique(RegExprParser {}.parse(pattern, line, column)) } + : Rule { line, column, tag, conditions, name, pattern, nullptr }; + } + + Rule() = default; + + Rule(unsigned line, + unsigned column, + Tag tag, + std::vector conditions, + std::string name, + std::string pattern, + std::unique_ptr regexpr = nullptr): + line { line }, + column { column }, + tag { tag }, + conditions { std::move(conditions) }, + name { std::move(name) }, + pattern { std::move(pattern) }, + regexpr { std::move(regexpr) } + { + } + + Rule(const Rule& v): + line { v.line }, + column { v.column }, + tag { v.tag }, + conditions { v.conditions }, + name { v.name }, + pattern { v.pattern }, + regexpr { v.regexpr ? std::make_unique(RegExprParser {}.parse(pattern, line, column)) + : nullptr } + { + } + + Rule& operator=(const Rule& v) + { + line = v.line; + column = v.column; + tag = v.tag; + conditions = v.conditions; + name = v.name; + pattern = v.pattern; + regexpr = + v.regexpr ? 
std::make_unique(RegExprParser {}.parse(pattern, line, column)) : nullptr; + return *this; + } + + bool operator<(const Rule& rhs) const noexcept { return tag < rhs.tag; } + bool operator<=(const Rule& rhs) const noexcept { return tag <= rhs.tag; } + bool operator==(const Rule& rhs) const noexcept { return tag == rhs.tag; } + bool operator!=(const Rule& rhs) const noexcept { return tag != rhs.tag; } + bool operator>=(const Rule& rhs) const noexcept { return tag >= rhs.tag; } + bool operator>(const Rule& rhs) const noexcept { return tag > rhs.tag; } +}; + +using RuleList = std::vector; + +inline bool ruleContainsBeginOfLine(const Rule& r) +{ + return containsBeginOfLine(*r.regexpr); +} + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::Rule& v, FormatContext& ctx) + { + if (!v.conditions.empty()) + { + fmt::format_to(ctx.out(), "<"); + for (size_t i = 0; i < v.conditions.size(); ++i) + if (i != 0) + fmt::format_to(ctx.out(), ", {}", v.conditions[i]); + else + fmt::format_to(ctx.out(), "{}", v.conditions[i]); + fmt::format_to(ctx.out(), ">"); + } + if (v.tag == regex_dfa::IgnoreTag) + return fmt::format_to(ctx.out(), "{}({}) ::= {}", v.name, "ignore", v.pattern); + else + return fmt::format_to(ctx.out(), "{}({}) ::= {}", v.name, v.tag, v.pattern); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/RuleParser.cpp b/src/regex_dfa/RuleParser.cpp new file mode 100644 index 0000000000..dda518404b --- /dev/null +++ b/src/regex_dfa/RuleParser.cpp @@ -0,0 +1,378 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include // special tags +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +RuleParser::RuleParser(unique_ptr input, int firstTerminalId): + _stream { std::move(input) }, + _lastParsedRule { nullptr }, + _lastParsedRuleIsRef { false }, + _currentChar { 0 }, + _line { 1 }, + _column { 0 }, + _offset { 0 }, + _nextTag { firstTerminalId } +{ + consumeChar(); +} + +RuleParser::RuleParser(string input, int firstTerminalId): + RuleParser { make_unique(std::move(input)), firstTerminalId } +{ +} + +RuleList RuleParser::parseRules() +{ + RuleList rules; + + for (;;) + { + consumeSpace(); + if (eof()) + { + break; + } + else if (currentChar() == '\n') + { + consumeChar(); + } + else + { + parseRule(rules); + } + } + + // collect all condition labels, find all <*>-conditions, then replace their <*> with {collected + // conditions} + set conditions; + list starRules; + for (Rule& rule: rules) + { + for (const string& condition: rule.conditions) + { + if (condition != "*") + { + conditions.emplace(condition); + } + else + { + rule.conditions.clear(); + starRules.emplace_back(&rule); + } + } + } + for (Rule* rule: starRules) + for (const string& condition: conditions) + rule->conditions.emplace_back(condition); + + return rules; +} + +void RuleParser::parseRule(RuleList& rules) +{ + // Rule ::= RuleConditionList? BasicRule + // | RuleConditionList '{' BasicRule* '}' (LF | EOF)? + // BasicRule ::= TOKEN RuleOptions? SP '::=' SP RegEx SP? 
(LF | EOF) + // RuleOptions ::= '(' RuleOption (',' RuleOption)* + // RuleOption ::= ignore + + consumeSP(); + if (_currentChar == '|' && _lastParsedRule != nullptr) + { + consumeChar(); + consumeSP(); + const string pattern = parseExpression(); + _lastParsedRule->pattern += '|' + pattern; + return; + } + + // finalize ref-rule by surrounding it with round braces + if (_lastParsedRuleIsRef) + _lastParsedRule->pattern = fmt::format("({})", _lastParsedRule->pattern); + + vector conditions = parseRuleConditions(); + consumeSP(); + if (!conditions.empty() && currentChar() == '{') + { + consumeChar(); + consumeAnySP(); // allow whitespace, including LFs + while (!eof() && currentChar() != '}') + { + parseBasicRule(rules, vector(conditions)); + consumeSP(); // part of the next line, allow indentation + } + consumeChar('}'); + consumeSP(); + if (currentChar() == '\n') + consumeChar(); + else if (!eof()) + throw UnexpectedChar { _line, _column, _currentChar, '\n' }; + } + else + { + parseBasicRule(rules, std::move(conditions)); + } +} + +struct TestRuleForName +{ + string name; + bool operator()(const Rule& r) const { return r.name == name; } +}; + +void RuleParser::parseBasicRule(RuleList& rules, vector&& conditions) +{ + const unsigned int beginLine = _line; + const unsigned int beginColumn = _column; + + string token = consumeToken(); + bool ignore = false; + bool ref = false; + if (_currentChar == '(') + { + consumeChar(); + unsigned optionOffset = _offset; + string option = consumeToken(); + consumeChar(')'); + + if (option == "ignore") + ignore = true; + else if (option == "ref") + ref = true; + else + throw InvalidRuleOption { optionOffset, option }; + } + consumeSP(); + consumeAssoc(); + consumeSP(); + const unsigned int line = _line; + const unsigned int column = _column; + const string pattern = parseExpression(); + if (currentChar() == '\n') + consumeChar(); + else if (!eof()) + throw UnexpectedChar { _line, _column, _currentChar, '\n' }; + + const Tag tag = [&] { + if (ignore || ref) + return IgnoreTag; + else if (auto i = find_if(rules.begin(), rules.end(), TestRuleForName { token }); i != rules.end()) + return i->tag; + else + return _nextTag++; + }(); + + if (ref && !conditions.empty()) + throw InvalidRefRuleWithConditions { + beginLine, beginColumn, Rule { line, column, tag, std::move(conditions), token, pattern } + }; + + if (conditions.empty()) + conditions.emplace_back("INITIAL"); + + sort(conditions.begin(), conditions.end()); + + if (!ref) + { + if (auto i = find_if(rules.begin(), rules.end(), TestRuleForName { token }); i != rules.end()) + { + throw DuplicateRule { Rule { line, column, tag, std::move(conditions), token, pattern }, *i }; + } + else + { + rules.emplace_back(Rule { line, column, tag, conditions, token, pattern }); + _lastParsedRule = &rules.back(); + _lastParsedRuleIsRef = false; + } + } + else if (auto i = _refRules.find(token); i != _refRules.end()) + { + throw DuplicateRule { Rule { line, column, tag, std::move(conditions), token, pattern }, i->second }; + } + else + { + // TODO: throw if !conditions.empty(); + _refRules[token] = { line, column, tag, {}, token, pattern }; + _lastParsedRule = &_refRules[token]; + _lastParsedRuleIsRef = true; + } +} + +vector RuleParser::parseRuleConditions() +{ + // RuleConditionList ::= '<' ('*' | TOKEN (',' SP* TOKEN)) '>' + if (currentChar() != '<') + return {}; + + consumeChar(); + + if (currentChar() == '*') + { + consumeChar(); + consumeChar('>'); + return { "*" }; + } + + vector conditions { consumeToken() }; + + while 
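parseRuleConditions() and parseBasicRule() above implement the <condition, ...> prefixes: '*' expands to every condition name seen elsewhere, and rules without a prefix default to the INITIAL condition. A small sketch with made-up rule names:

    #include <regex_dfa/RuleParser.h>
    #include <iostream>

    using namespace regex_dfa;

    int main()
    {
        RuleParser parser { "<String>Char ::= [^\"]\n"
                            "Text ::= [a-z]+\n" };
        for (const auto& rule: parser.parseRules())
        {
            std::cout << rule.name << ':';
            for (const auto& condition: rule.conditions)
                std::cout << ' ' << condition;
            std::cout << '\n'; // prints "Char: String" and "Text: INITIAL"
        }
    }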
(currentChar() == ',') + { + consumeChar(); + consumeSP(); + conditions.emplace_back(consumeToken()); + } + + consumeChar('>'); + + return conditions; +} + +string RuleParser::parseExpression() +{ + // expression ::= " .... " + // | .... + + stringstream sstr; + + size_t i = 0; + size_t lastGraph = 0; + while (!eof() && _currentChar != '\n') + { + if (isgraph(_currentChar)) + lastGraph = i + 1; + i++; + sstr << consumeChar(); + } + string pattern = sstr.str().substr(0, lastGraph); // skips trailing spaces + + // replace all occurrences of {ref} + for (const pair& ref: _refRules) + { + const Rule& rule = ref.second; + const string name = fmt::format("{{{}}}", rule.name); + // for (size_t i = 0; (i = pattern.find(name, i)) != string::npos; i += rule.pattern.size()) { + // pattern.replace(i, name.size(), rule.pattern); + // } + size_t i = 0; + while ((i = pattern.find(name, i)) != string::npos) + { + pattern.replace(i, name.size(), rule.pattern); + i += rule.pattern.size(); + } + } + + return pattern; +} + +// skips space until LF or EOF +void RuleParser::consumeSpace() +{ + for (;;) + { + switch (_currentChar) + { + case ' ': + case '\t': + case '\r': consumeChar(); break; + case '#': + while (!eof() && _currentChar != '\n') + { + consumeChar(); + } + break; + default: return; + } + } +} + +char RuleParser::currentChar() const noexcept +{ + return _currentChar; +} + +char RuleParser::consumeChar(char ch) +{ + if (_currentChar != ch) + throw UnexpectedChar { _line, _column, _currentChar, ch }; + + return consumeChar(); +} + +char RuleParser::consumeChar() +{ + char t = _currentChar; + + _currentChar = _stream->get(); + if (!_stream->eof()) + { + _offset++; + if (t == '\n') + { + _line++; + _column = 1; + } + else + { + _column++; + } + } + + return t; +} + +bool RuleParser::eof() const noexcept +{ + return std::char_traits::eq(_currentChar, std::char_traits::eof()) || _stream->eof(); +} + +string RuleParser::consumeToken() +{ + stringstream sstr; + + if (!isalpha(_currentChar) || _currentChar == '_') + throw UnexpectedToken { _offset, _currentChar, "Token" }; + + do + sstr << consumeChar(); + while (isalnum(_currentChar) || _currentChar == '_'); + + return sstr.str(); +} + +void RuleParser::consumeAnySP() +{ + while (_currentChar == ' ' || _currentChar == '\t' || _currentChar == '\n') + consumeChar(); +} + +void RuleParser::consumeSP() +{ + while (_currentChar == ' ' || _currentChar == '\t') + consumeChar(); +} + +void RuleParser::consumeAssoc() +{ + consumeChar(':'); + consumeChar(':'); + consumeChar('='); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/RuleParser.h b/src/regex_dfa/RuleParser.h new file mode 100644 index 0000000000..393e26ec8c --- /dev/null +++ b/src/regex_dfa/RuleParser.h @@ -0,0 +1,187 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
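parseExpression() above substitutes {Name} references with the (parenthesized) pattern of rules marked (ref); such rules are not emitted as tokens themselves. A sketch with illustrative rule names:

    #include <regex_dfa/RuleParser.h>
    #include <iostream>

    using namespace regex_dfa;

    int main()
    {
        RuleParser parser { "Digit(ref) ::= [0-9]\n"
                            "Number ::= {Digit}+\n" };
        for (const auto& rule: parser.parseRules())
            std::cout << rule.name << " ::= " << rule.pattern << "\n"; // Number ::= ([0-9])+
    }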
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class RuleParser +{ + public: + explicit RuleParser(std::unique_ptr input, int firstTerminalId = FirstUserTag); + explicit RuleParser(std::string input, int firstTerminalId = FirstUserTag); + + RuleList parseRules(); + + class UnexpectedChar; + class UnexpectedToken; + class InvalidRuleOption; + class InvalidRefRuleWithConditions; + class DuplicateRule; + + private: + void parseRule(RuleList& rules); + std::vector parseRuleConditions(); + void parseBasicRule(RuleList& rules, std::vector&& conditions); + std::string parseExpression(); + + private: + std::string consumeToken(); + void consumeAnySP(); + void consumeSP(); + void consumeAssoc(); + void consumeSpace(); + [[nodiscard]] char currentChar() const noexcept; + char consumeChar(char ch); + char consumeChar(); + [[nodiscard]] bool eof() const noexcept; + [[nodiscard]] std::string replaceRefs(const std::string& pattern); + + private: + std::unique_ptr _stream; + std::map _refRules; + Rule* _lastParsedRule; + bool _lastParsedRuleIsRef; + char _currentChar; + unsigned int _line; + unsigned int _column; + unsigned int _offset; + int _nextTag; +}; + +class RuleParser::InvalidRefRuleWithConditions: public std::runtime_error +{ + public: + InvalidRefRuleWithConditions(unsigned line, unsigned column, Rule rule): + std::runtime_error { fmt::format( + "{}:{}: Invalid rule \"{}\". Reference rules must not be labelled with conditions.", + line, + column, + rule.name) }, + _rule { std::move(rule) } + { + } + + [[nodiscard]] Rule const& rule() const noexcept { return _rule; } + + private: + Rule _rule; +}; + +class RuleParser::DuplicateRule: public std::runtime_error +{ + public: + DuplicateRule(Rule&& duplicate, const Rule& other): + std::runtime_error { fmt::format( + "{}:{}: Duplicated rule definition with name \"{}\", previously defined in {}:{}.", + duplicate.line, + duplicate.column, + duplicate.name, + other.line, + other.column) }, + _duplicate { std::move(duplicate) }, + _other { other } + { + } + + [[nodiscard]] Rule const& duplicate() const noexcept { return _duplicate; } + [[nodiscard]] Rule const& other() const noexcept { return _other; } + + private: + Rule _duplicate; + Rule const& _other; +}; + +class RuleParser::UnexpectedToken: public std::runtime_error +{ + public: + UnexpectedToken(unsigned offset, char actual, std::string expected): + std::runtime_error { fmt::format( + "{}: Unexpected token {}, expected <{}> instead.", offset, actual, expected) }, + _offset { offset }, + _actual { actual }, + _expected { std::move(expected) } + { + } + + [[nodiscard]] unsigned offset() const noexcept { return _offset; } + [[nodiscard]] char actual() const noexcept { return _actual; } + [[nodiscard]] const std::string& expected() const noexcept { return _expected; } + + private: + unsigned _offset; + char _actual; + std::string _expected; +}; + +class RuleParser::UnexpectedChar: public std::runtime_error +{ + public: + UnexpectedChar(unsigned int line, unsigned int column, char actual, char expected): + std::runtime_error { fmt::format("[{}:{}] Unexpected char {}, expected {} instead.", + line, + column, + quoted(actual), + quoted(expected)) }, + _line { line }, + _column { column }, + _actual { actual }, + _expected { expected } + { + } + + [[nodiscard]] unsigned int line() const noexcept { return _line; } + [[nodiscard]] unsigned int 
column() const noexcept { return _column; } + [[nodiscard]] char actual() const noexcept { return _actual; } + [[nodiscard]] char expected() const noexcept { return _expected; } + + private: + static std::string quoted(char ch) + { + if (std::char_traits::eq(ch, std::char_traits::eof())) + return "<>"; + else + return fmt::format("'{}'", static_cast(ch)); + } + + private: + unsigned int _line; + unsigned int _column; + char _actual; + char _expected; +}; + +class RuleParser::InvalidRuleOption: public std::runtime_error +{ + public: + InvalidRuleOption(unsigned offset, std::string option): + std::runtime_error { fmt::format("{}: Invalid rule option \"{}\".", offset, option) }, + _offset { offset }, + _option { option } + { + } + + [[nodiscard]] unsigned offset() const noexcept { return _offset; } + [[nodiscard]] const std::string& option() const noexcept { return _option; } + + private: + unsigned _offset; + std::string _option; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/RuleParser_test.cpp b/src/regex_dfa/RuleParser_test.cpp new file mode 100644 index 0000000000..aae7fdc58f --- /dev/null +++ b/src/regex_dfa/RuleParser_test.cpp @@ -0,0 +1,247 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include +#include + +using namespace regex_dfa; + +TEST_CASE("regex_RuleParser.simple") +{ + RuleParser rp { "main ::= blah\n" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK("blah" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.whitespaces") +{ + RuleParser rp { "main ::= a\n\t| b | c\n" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK("a|b | c" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.rule_at_eof") +{ + RuleParser rp { "main ::= blah" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK("blah" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.simple_trailing_spaces") +{ + RuleParser rp { "main ::= blah\n " }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK("blah" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.quotedPattern") +{ + RuleParser rp { "main ::= \"blah\"" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK("\"blah\"" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.multiQuotedPattern") +{ + RuleParser rp { R"(rule ::= "b"la"h")" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK(R"("b"la"h")" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.doubleQuote") +{ + RuleParser rp { R"(rule ::= \")" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK(R"(\")" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.spaceRule") +{ + RuleParser rp { R"(rule ::= [ \n\t]+)" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK(R"([ \n\t]+)" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.stringRule") +{ + RuleParser rp { R"(rule ::= \"[^\"]*\")" }; + RuleList rules = rp.parseRules(); + REQUIRE(1 == rules.size()); + CHECK(R"(\"[^\"]*\")" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.ref") +{ + RuleParser rp { R"( + Foo(ref) ::= foo + Bar(ref) ::= bar + FooBar ::= {Foo}_{Bar} + )" }; + RuleList rules = rp.parseRules(); + 
REQUIRE(1 == rules.size()); + CHECK("(foo)_(bar)" == rules[0].pattern); +} + +TEST_CASE("regex_RuleParser.ref_duplicated") +{ + RuleParser rp { R"( + Foo(ref) ::= foo + Foo(ref) ::= bar + FooBar ::= {Foo} + )" }; + CHECK_THROWS_AS(rp.parseRules(), RuleParser::DuplicateRule); +} + +TEST_CASE("regex_RuleParser.multiline_alt") +{ + RuleParser rp { R"( + Rule1 ::= foo + | bar + Rule2(ref) ::= fnord + | hard + Rule3 ::= {Rule2} + | {Rule2} + )" }; + RuleList rules = rp.parseRules(); + REQUIRE(2 == rules.size()); + CHECK("foo|bar" == rules[0].pattern); + CHECK("(fnord|hard)|(fnord|hard)" == rules[1].pattern); +} + +TEST_CASE("regex_RuleParser.condition1") +{ + RuleParser rp { R"( + Rule1 ::= foo + Rule2 ::= bar + )" }; + RuleList rules = rp.parseRules(); + + REQUIRE(2 == rules.size()); + CHECK("foo" == rules[0].pattern); + CHECK("bar" == rules[1].pattern); + + REQUIRE(1 == rules[0].conditions.size()); + CHECK("foo" == rules[0].conditions[0]); + + REQUIRE(1 == rules[1].conditions.size()); + CHECK("bar" == rules[1].conditions[0]); +} + +TEST_CASE("regex_RuleParser.condition2") +{ + RuleParser rp { R"( + Rule1 ::= foo + Rule2 ::= bar + )" }; + RuleList rules = rp.parseRules(); + + REQUIRE(2 == rules.size()); + CHECK("foo" == rules[0].pattern); + CHECK("bar" == rules[1].pattern); + + REQUIRE(1 == rules[0].conditions.size()); + CHECK("foo" == rules[0].conditions[0]); + + REQUIRE(2 == rules[1].conditions.size()); + // in sorted order + CHECK("bar" == rules[1].conditions[0]); + CHECK("foo" == rules[1].conditions[1]); +} + +TEST_CASE("regex_RuleParser.conditional_star") +{ + RuleParser rp { R"( + Zero ::= zero + One ::= one + Two ::= two + <*>Tri ::= tri + )" }; + RuleList rules = rp.parseRules(); + + REQUIRE(4 == rules.size()); + + CHECK("zero" == rules[0].pattern); + REQUIRE(1 == rules[0].conditions.size()); + CHECK("INITIAL" == rules[0].conditions[0]); + + CHECK("one" == rules[1].pattern); + REQUIRE(1 == rules[1].conditions.size()); + CHECK("one" == rules[1].conditions[0]); + + CHECK("two" == rules[2].pattern); + REQUIRE(1 == rules[2].conditions.size()); + CHECK("two" == rules[2].conditions[0]); + + CHECK("tri" == rules[3].pattern); + REQUIRE(3 == rules[3].conditions.size()); + CHECK("INITIAL" == rules[3].conditions[0]); + CHECK("one" == rules[3].conditions[1]); + CHECK("two" == rules[3].conditions[2]); +} + +TEST_CASE("regex_RuleParser.grouped_conditions") +{ + RuleParser rp { R"( + Rule1 ::= foo + { + Rule2 ::= bar + } + )" }; + RuleList rules = rp.parseRules(); + + REQUIRE(2 == rules.size()); + CHECK("foo" == rules[0].pattern); + CHECK("bar" == rules[1].pattern); + + REQUIRE(1 == rules[1].conditions.size()); + CHECK("blah" == rules[1].conditions[0]); +} + +TEST_CASE("regex_RuleParser.InvalidRefRuleWithConditions") +{ + CHECK_THROWS_AS(RuleParser { "main(ref) ::= blah\n" }.parseRules(), + RuleParser::InvalidRefRuleWithConditions); +} + +TEST_CASE("regex_RuleParser.InvalidRuleOption") +{ + CHECK_THROWS_AS(RuleParser { "A(invalid) ::= a\n" }.parseRules(), RuleParser::InvalidRuleOption); +} + +TEST_CASE("regex_RuleParser.DuplicateRule") +{ + RuleParser rp { R"( + foo ::= abc + foo ::= def + )" }; + CHECK_THROWS_AS(rp.parseRules(), RuleParser::DuplicateRule); +} + +TEST_CASE("regex_RuleParser.UnexpectedChar") +{ + CHECK_THROWS_AS(RuleParser { "A :=" }.parseRules(), RuleParser::UnexpectedChar); + CHECK_THROWS_AS(RuleParser { " A ::= a" }.parseRules(), RuleParser::UnexpectedToken); + CHECK_THROWS_AS(RuleParser { "<> A ::= a" }.parseRules(), RuleParser::UnexpectedToken); + CHECK_THROWS_AS(RuleParser { " 
::= a" }.parseRules(), RuleParser::UnexpectedToken); +} diff --git a/src/regex_dfa/SourceLocation.cpp b/src/regex_dfa/SourceLocation.cpp new file mode 100644 index 0000000000..c9e6cd8267 --- /dev/null +++ b/src/regex_dfa/SourceLocation.cpp @@ -0,0 +1,27 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; + +namespace regex_dfa +{ + +string SourceLocation::source() const // TODO +{ + string code; + ifstream ifs(filename); + ifs.seekg(offset, ifs.beg); + code.resize(count); + ifs.read(&code[0], count); + return code; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/SourceLocation.h b/src/regex_dfa/SourceLocation.h new file mode 100644 index 0000000000..c69d7f7487 --- /dev/null +++ b/src/regex_dfa/SourceLocation.h @@ -0,0 +1,40 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +namespace regex_dfa +{ + +struct SourceLocation +{ + std::string filename; + size_t offset; + size_t count; + + [[nodiscard]] long long int compare(const SourceLocation& other) const noexcept + { + if (filename == other.filename) + return (long) offset - (long) other.offset; + else if (filename < other.filename) + return -1; + else + return 1; + } + + [[nodiscard]] std::string source() const; + + bool operator==(const SourceLocation& other) const noexcept { return compare(other) == 0; } + bool operator<=(const SourceLocation& other) const noexcept { return compare(other) <= 0; } + bool operator>=(const SourceLocation& other) const noexcept { return compare(other) >= 0; } + bool operator<(const SourceLocation& other) const noexcept { return compare(other) < 0; } + bool operator>(const SourceLocation& other) const noexcept { return compare(other) > 0; } +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/State.cpp b/src/regex_dfa/State.cpp new file mode 100644 index 0000000000..76eaa27f26 --- /dev/null +++ b/src/regex_dfa/State.cpp @@ -0,0 +1,37 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; + +namespace regex_dfa +{ + +string to_string(const StateIdVec& S, string_view stateLabelPrefix) +{ + StateIdVec names = S; + sort(names.begin(), names.end()); + + stringstream sstr; + sstr << "{"; + int i = 0; + for (StateId name: names) + { + if (i) + sstr << ", "; + sstr << stateLabelPrefix << name; + i++; + } + sstr << "}"; + + return sstr.str(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/State.h b/src/regex_dfa/State.h new file mode 100644 index 0000000000..975dd8851e --- /dev/null +++ b/src/regex_dfa/State.h @@ -0,0 +1,53 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +using Tag = int; +using StateId = size_t; +using StateIdVec = std::vector; + +using AcceptMap = std::map; + +/** + * Returns a human readable string of @p S, such as "{n0, n1, n2}". + */ +[[nodiscard]] std::string to_string(const StateIdVec& S, std::string_view stateLabelPrefix = "n"); + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::StateIdVec& v, FormatContext& ctx) + { + return fmt::format_to(ctx.out(), "{}", regex_dfa::to_string(v)); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/State_test.cpp b/src/regex_dfa/State_test.cpp new file mode 100644 index 0000000000..4cb9074f1e --- /dev/null +++ b/src/regex_dfa/State_test.cpp @@ -0,0 +1,18 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include + +TEST_CASE("regex_State.to_string") +{ + regex_dfa::StateIdVec v { 1, 2, 3 }; + CHECK("{n1, n2, n3}" == fmt::format("{}", v)); +} diff --git a/src/regex_dfa/Symbols.cpp b/src/regex_dfa/Symbols.cpp new file mode 100644 index 0000000000..630670740b --- /dev/null +++ b/src/regex_dfa/Symbols.cpp @@ -0,0 +1,184 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
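to_string() above copies and sorts the state ids before printing and accepts an optional label prefix (defaulting to "n"); the fmt::formatter specialization in State.h simply delegates to it. A short sketch, not part of the patch; the <regex_dfa/...> include path mirrors the convention used elsewhere in this change.

// Usage sketch for regex_dfa::to_string(StateIdVec) and its fmt formatter.
#include <regex_dfa/State.h>

#include <fmt/format.h>

#include <iostream>

int main()
{
    regex_dfa::StateIdVec const states { 3, 1, 2 };

    std::cout << regex_dfa::to_string(states) << "\n";      // {n1, n2, n3}
    std::cout << regex_dfa::to_string(states, "q") << "\n"; // {q1, q2, q3}
    std::cout << fmt::format("{}", states) << "\n";         // {n1, n2, n3}
}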
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; + +namespace regex_dfa +{ + +string prettySymbol(Symbol input) +{ + switch (input) + { + case Symbols::Error: return "<>"; + case Symbols::BeginOfLine: return "<>"; + case Symbols::EndOfLine: return "<>"; + case Symbols::EndOfFile: return "<>"; + case Symbols::Epsilon: return "ε"; + case '\a': return "\\a"; + case '\b': return "\\b"; + case '\f': return "\\f"; + case '\n': return "\\n"; + case '\r': return "\\r"; + case ' ': return "\\s"; + case '\t': return "\\t"; + case '\v': return "\\v"; + case '\0': return "\\0"; + case '.': return "\\."; // so we can distinguish from dot-operator + default: + if (isprint(input)) + { + return fmt::format("{}", (char) input); + } + else + { + return fmt::format("\\x{:02x}", input); + } + } +} + +string prettyCharRange(Symbol ymin, Symbol ymax) +{ + assert(ymin <= ymax); + + stringstream sstr; + switch (ymax - ymin) + { + case 0: sstr << prettySymbol(ymin); break; + case 1: sstr << prettySymbol(ymin) << prettySymbol(ymin + 1); break; + case 2: sstr << prettySymbol(ymin) << prettySymbol(ymin + 1) << prettySymbol(ymax); break; + default: sstr << prettySymbol(ymin) << '-' << prettySymbol(ymax); break; + } + return sstr.str(); +} + +string groupCharacterClassRanges(const vector& syms) +{ + // {1,3,5,a,b,c,d,e,f,z] + // -> + // {{1}, {3}, {5}, {a-f}, {z}} + + stringstream sstr; + Symbol ymin = '\0'; + Symbol ymax = ymin; + int k = 0; + + for (size_t i = 0, e = syms.size(); i != e; ++i) + { + if (!syms[i]) + continue; + + const Symbol c = (Symbol) i; + if (c == ymax + 1) + { // range growing + ymax = c; + } + else + { // gap found + if (k) + { + sstr << prettyCharRange(ymin, ymax); + } + ymin = ymax = c; + } + k++; + } + sstr << prettyCharRange(ymin, ymax); + + return sstr.str(); +} + +string groupCharacterClassRanges(vector chars) +{ + // we took a copy in tgroup here, so I can sort() later + sort(chars.begin(), chars.end()); + + if (chars.size() == 1) + return prettySymbol(chars.front()); + + // {1,3,5,a,b,c,d,e,f,z] + // -> + // "123a-fz" + + stringstream sstr; + Symbol ymin = 0; + Symbol ymax = ymin; + int i = 0; + + for (Symbol c: chars) + { + if (c == ymax + 1) + { // range growing + ymax = c; + } + else + { // gap found + if (i) + { + sstr << prettyCharRange(ymin, ymax); + } + ymin = ymax = c; + } + i++; + } + sstr << prettyCharRange(ymin, ymax); + + return sstr.str(); +} + +SymbolSet::SymbolSet(DotMode): set_(256, true), size_ { 255 }, hash_ { 2166136261 } +{ + set_[(size_t) '\n'] = false; + for (Symbol s: *this) + { + hash_ = (hash_ * 16777619) ^ s; + } +} + +bool SymbolSet::isDot() const noexcept +{ + static SymbolSet dot(SymbolSet::Dot); + return *this == dot; +} + +string SymbolSet::to_string() const +{ + if (isDot()) + return "."; + + return groupCharacterClassRanges(set_); +} + +void SymbolSet::complement() +{ + // flip bits + for (size_t i = 0, e = set_.size(); i != e; ++i) + { + set_[i] = !set_[i]; + } + + // flip size + size_ = set_.size() - size_; + + recalculateHash(); +} + +void SymbolSet::recalculateHash() +{ + // recalculate hash + hash_ = 2166136261; + for (Symbol s: *this) + { + hash_ = (hash_ * 16777619) ^ s; + } +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/Symbols.h b/src/regex_dfa/Symbols.h new file mode 100644 index 0000000000..bb8a5488e1 --- /dev/null +++ b/src/regex_dfa/Symbols.h @@ -0,0 +1,208 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 
2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +//! input symbol as used for transitions +using Symbol = int; + +[[nodiscard]] std::string prettySymbol(Symbol input); +[[nodiscard]] std::string prettyCharRange(Symbol ymin, Symbol ymax); +[[nodiscard]] std::string groupCharacterClassRanges(const std::vector& syms); +[[nodiscard]] std::string groupCharacterClassRanges(std::vector syms); + +// new way of wrapping up Symbols +struct Symbols +{ + // NOLINTBEGIN(readability-identifier-naming) + constexpr static Symbol Epsilon = -1; + constexpr static Symbol Error = -2; + constexpr static Symbol BeginOfLine = -3; + constexpr static Symbol EndOfLine = -4; + constexpr static Symbol EndOfFile = -5; + constexpr static Symbol Character(char ch) { return Symbol(ch); } + // NOLINTEND(readability-identifier-naming) + + [[nodiscard]] constexpr static bool isSpecial(Symbol s) + { + switch (s) + { + case Symbols::EndOfFile: + case Symbols::EndOfLine: + case Symbols::BeginOfLine: + case Symbols::Epsilon: + case Symbols::Error: return true; + default: return false; + } + } +}; + +/** + * Represents a set of symbols. + */ +class SymbolSet +{ + public: + enum DotMode + { + Dot + }; + + explicit SymbolSet(DotMode); + SymbolSet(): set_(256, false), size_ { 0 }, hash_ { 2166136261 } {} + + explicit SymbolSet(std::initializer_list list): SymbolSet() + { + std::for_each(list.begin(), list.end(), [this](Symbol s) { insert(s); }); + } + + [[nodiscard]] bool empty() const noexcept { return size_ == 0; } + [[nodiscard]] size_t size() const noexcept { return size_; } + + //! Transforms into the complement set. + void complement(); + + //! Inserts given Symbol @p s into this set. + void insert(Symbol s) + { + if (!contains(s)) + { + set_[s] = true; + hash_ = (hash_ * 16777619) ^ s; + size_++; + } + } + + //! Inserts a range of Simples between [a, b]. + void insert(const std::pair& range) + { + for (Symbol s = range.first; s <= range.second; ++s) + { + insert(s); + } + } + + //! @returns whether or not given Symbol @p s is in this set. + [[nodiscard]] bool contains(Symbol s) const + { + assert(s >= 0 && s <= 255 && "Only ASCII allowed."); + return set_[(size_t) s]; + } + + //! Tests whether or not this SymbolSet can be represented as dot (.), i.e. all but \n. + [[nodiscard]] bool isDot() const noexcept; + + //! 
@returns a human readable representation of this set + [[nodiscard]] std::string to_string() const; + + bool operator==(const SymbolSet& rhs) const noexcept { return hash_ == rhs.hash_ && set_ == rhs.set_; } + bool operator!=(const SymbolSet& rhs) const noexcept { return !(*this == rhs); } + + class const_iterator // NOLINT(readability-identifier-naming) + { // {{{ + public: + const_iterator(std::vector::const_iterator beg, + std::vector::const_iterator end, + size_t n): + beg_ { beg }, end_ { end }, offset_ { n } + { + while (beg_ != end_ && !*beg_) + { + ++beg_; + ++offset_; + } + } + + Symbol operator*() const { return static_cast(offset_); } + + const_iterator& operator++(int) + { + do + { + ++beg_; + ++offset_; + } while (beg_ != end_ && !*beg_); + return *this; + } + + const_iterator& operator++() + { + do + { + beg_++; + offset_++; + } while (beg_ != end_ && !*beg_); + return *this; + } + + bool operator==(const const_iterator& rhs) const noexcept { return beg_ == rhs.beg_; } + bool operator!=(const const_iterator& rhs) const noexcept { return beg_ != rhs.beg_; } + + private: + std::vector::const_iterator beg_; + std::vector::const_iterator end_; + size_t offset_; + }; // }}} + + [[nodiscard]] const_iterator begin() const { return const_iterator(set_.begin(), set_.end(), 0); } + [[nodiscard]] const_iterator end() const { return const_iterator(set_.end(), set_.end(), set_.size()); } + + [[nodiscard]] size_t hash() const noexcept { return hash_; } + + private: + void recalculateHash(); + + private: + // XXX we chose vector as it is an optimized bit vector + std::vector set_; + size_t size_; + size_t hash_; +}; + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::SymbolSet& v, FormatContext& ctx) + { + return fmt::format_to(ctx.out(), "{}", v.to_string()); + } +}; +} // namespace fmt + +namespace std +{ +template <> +struct hash +{ + size_t operator()(const regex_dfa::SymbolSet& set) const { return set.hash(); } +}; +} // namespace std diff --git a/src/regex_dfa/Symbols_test.cpp b/src/regex_dfa/Symbols_test.cpp new file mode 100644 index 0000000000..3374865b1b --- /dev/null +++ b/src/regex_dfa/Symbols_test.cpp @@ -0,0 +1,112 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; +using regex_dfa::SymbolSet; + +TEST_CASE("regex_SymbolSet.s0") +{ + SymbolSet s0; + REQUIRE(0 == s0.size()); // NOLINT(readability-container-size-empty) + REQUIRE(s0.empty()); +} + +TEST_CASE("regex_SymbolSet.s1") +{ + SymbolSet s1; + + // first add + s1.insert('a'); + CHECK(1 == s1.size()); + REQUIRE_FALSE(s1.empty()); + + // overwrite + s1.insert('a'); + CHECK(1 == s1.size()); + REQUIRE_FALSE(s1.empty()); +} + +TEST_CASE("regex_SymbolSet.initializer_list") +{ + SymbolSet a { 'a' }; + CHECK(1 == a.size()); + CHECK(a.contains('a')); + + SymbolSet s2 { 'a', 'b', 'b', 'c' }; + CHECK(3 == s2.size()); + CHECK("abc" == s2.to_string()); +} + +TEST_CASE("regex_SymbolSet.dot") +{ + SymbolSet dot(SymbolSet::Dot); + REQUIRE(!dot.contains('\n')); + CHECK(dot.contains('\0')); + CHECK(dot.contains(' ')); + CHECK(dot.isDot()); + CHECK("." 
== dot.to_string()); +} + +TEST_CASE("regex_SymbolSet.complement") +{ + SymbolSet s; + s.insert('\n'); + CHECK("\\n" == s.to_string()); + s.complement(); + CHECK("." == s.to_string()); +} + +TEST_CASE("regex_SymbolSet.range") +{ + SymbolSet r; + r.insert(make_pair('a', 'f')); + + CHECK(6 == r.size()); + CHECK("a-f" == r.to_string()); + + r.insert(make_pair('0', '9')); + CHECK(16 == r.size()); + CHECK("0-9a-f" == r.to_string()); +} + +TEST_CASE("regex_SymbolSet.fmt_format") +{ + SymbolSet s; + s.insert(make_pair('0', '9')); + s.insert(make_pair('a', 'f')); + + CHECK("0-9a-f" == fmt::format("{}", s)); +} + +TEST_CASE("regex_SymbolSet.hash_map") +{ + SymbolSet s0; + SymbolSet s1 { 'a' }; + SymbolSet s2 { 'a', 'b' }; + + unordered_map map; + map[s0] = 0; + map[s1] = 1; + map[s2] = 2; + + CHECK(0 == map[s0]); + CHECK(1 == map[s1]); + CHECK(2 == map[s2]); +} + +TEST_CASE("regex_SymbolSet.compare") +{ + SymbolSet s1 { 'a', 'b' }; + SymbolSet s2 { 'a', 'b' }; + SymbolSet s3 { 'a', 'c' }; + REQUIRE(s1 == s2); + REQUIRE(s1 != s3); +} diff --git a/src/regex_dfa/TransitionMap-inl.h b/src/regex_dfa/TransitionMap-inl.h new file mode 100644 index 0000000000..df949ecab7 --- /dev/null +++ b/src/regex_dfa/TransitionMap-inl.h @@ -0,0 +1,49 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include + +namespace regex_dfa +{ + +inline void TransitionMap::define(StateId currentState, Symbol charCat, StateId nextState) +{ + mapping_[currentState][charCat] = nextState; +} + +inline StateId TransitionMap::apply(StateId currentState, Symbol charCat) const +{ + if (auto i = mapping_.find(currentState); i != mapping_.end()) + if (auto k = i->second.find(charCat); k != i->second.end()) + return k->second; + + return ErrorState; +} + +inline std::vector TransitionMap::states() const +{ + std::vector v; + v.reserve(mapping_.size()); + for (const auto& i: mapping_) + v.push_back(i.first); + std::sort(v.begin(), v.end()); + return v; +} + +inline std::map TransitionMap::map(StateId inputState) const +{ + std::map m; + if (auto mapping = mapping_.find(inputState); mapping != mapping_.end()) + for (const auto& i: mapping->second) + m[i.first] = i.second; + return m; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/TransitionMap.h b/src/regex_dfa/TransitionMap.h new file mode 100644 index 0000000000..5b0693748b --- /dev/null +++ b/src/regex_dfa/TransitionMap.h @@ -0,0 +1,66 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include +#include + +namespace regex_dfa +{ + +using CharCatId = int; + +constexpr CharCatId ErrorCharCat = static_cast(-1); + +/** + * Represents an error-state, such as invalid input character or unexpected EOF. + */ +constexpr StateId ErrorState { 808080 }; // static_cast(-1); + +/** + * Transition mapping API to map the input (currentState, charCat) to (newState). 
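The TransitionMap declared just below is the DFA's transition table: define() records (state, symbol) -> state edges, apply() looks them up and returns ErrorState for anything undefined, and states() lists every state with outgoing transitions. A minimal sketch of driving that API, not part of the patch; the state numbers and symbols are arbitrary.

// Usage sketch for regex_dfa::TransitionMap with a two-edge toy automaton.
#include <regex_dfa/TransitionMap.h>

#include <iostream>
#include <string>

int main()
{
    using namespace regex_dfa;

    TransitionMap tm;
    tm.define(0, 'a', 1); // state 0 --a--> state 1
    tm.define(1, 'b', 2); // state 1 --b--> state 2

    StateId s = 0;
    for (char ch: std::string("ab"))
        s = tm.apply(s, ch);

    std::cout << s << "\n";                                // 2: reached via 'a' then 'b'
    std::cout << (tm.apply(0, 'x') == ErrorState) << "\n"; // 1: undefined transition
    std::cout << tm.states().size() << "\n";               // 2: states with outgoing edges
}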
+ */
+class TransitionMap
+{
+  public:
+    using Container = std::map<StateId, std::map<Symbol, StateId>>;
+
+    TransitionMap(): mapping_ {} {}
+
+    TransitionMap(Container mapping): mapping_ { std::move(mapping) } {}
+
+    /**
+     * Defines a new mapping for (currentState, charCat) to (nextState).
+     */
+    void define(StateId currentState, Symbol charCat, StateId nextState);
+
+    /**
+     * Retrieves the next state for the input (currentState, charCat).
+     *
+     * @returns the transition from (currentState, charCat) to (nextState) or ErrorState if not defined.
+     */
+    [[nodiscard]] StateId apply(StateId currentState, Symbol charCat) const;
+
+    /**
+     * Retrieves a list of all available states.
+     */
+    [[nodiscard]] std::vector<StateId> states() const;
+
+    /**
+     * Retrieves a map of all transitions from given state @p inputState.
+     */
+    [[nodiscard]] std::map<Symbol, StateId> map(StateId inputState) const;
+
+  private:
+    Container mapping_;
+};
+
+} // namespace regex_dfa
+
+#include <regex_dfa/TransitionMap-inl.h>
diff --git a/src/regex_dfa/klex_test.cpp b/src/regex_dfa/klex_test.cpp
new file mode 100644
index 0000000000..17f2164e24
--- /dev/null
+++ b/src/regex_dfa/klex_test.cpp
@@ -0,0 +1,13 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2009-2018 Christian Parpart
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of
+// the License at: http://opensource.org/licenses/MIT
+
+#include <regex_dfa/util/testing.h>
+
+int main(int argc, const char* argv[])
+{
+    return regex_dfa::util::testing::main(argc, argv);
+}
diff --git a/src/regex_dfa/regex_dfa_test.cpp b/src/regex_dfa/regex_dfa_test.cpp
new file mode 100644
index 0000000000..45742d7742
--- /dev/null
+++ b/src/regex_dfa/regex_dfa_test.cpp
@@ -0,0 +1,25 @@
+/**
+ * This file is part of the "libterminal" project
+ * Copyright (c) 2019-2020 Christian Parpart
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define CATCH_CONFIG_RUNNER
+#include <catch2/catch.hpp>
+
+int main(int argc, char const* argv[])
+{
+    int const result = Catch::Session().run(argc, argv);
+
+    // avoid the external console closing immediately on VSCode/Windows
+    // system("pause");
+
+    return result;
+}
diff --git a/src/regex_dfa/util/iterator-detail.h b/src/regex_dfa/util/iterator-detail.h
new file mode 100644
index 0000000000..948beabf06
--- /dev/null
+++ b/src/regex_dfa/util/iterator-detail.h
@@ -0,0 +1,177 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2018 Christian Parpart
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License.
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +namespace regex_dfa::util::detail +{ + +template +struct reversed +{ + const Container container; + + auto begin() { return container.crbegin(); } + auto end() { return container.crend(); } +}; + +template +struct indexed +{ + Container& container; + + struct iterator + { + typename Container::iterator iter; + std::size_t index = 0; + + iterator& operator++() + { + ++iter; + ++index; + return *this; + } + + iterator& operator++(int) + { + ++*this; + return *this; + } + + auto operator*() const { return std::make_pair(index, *iter); } + + bool operator==(const iterator& rhs) const noexcept { return iter == rhs.iter; } + bool operator!=(const iterator& rhs) const noexcept { return iter != rhs.iter; } + }; + + struct const_iterator + { + typename Container::const_iterator iter; + std::size_t index = 0; + + const_iterator& operator++() + { + ++iter; + ++index; + return *this; + } + + const_iterator& operator++(int) + { + ++*this; + return *this; + } + + auto operator*() const { return std::make_pair(index, *iter); } + + bool operator==(const const_iterator& rhs) const noexcept { return iter == rhs.iter; } + bool operator!=(const const_iterator& rhs) const noexcept { return iter != rhs.iter; } + }; + + auto begin() const + { + if constexpr (std::is_const::value) + return const_iterator { container.cbegin() }; + else + return iterator { container.begin() }; + } + + auto end() const + { + if constexpr (std::is_const::value) + return const_iterator { container.cend() }; + else + return iterator { container.end() }; + } +}; + +template +struct filter +{ + Container& container; + Lambda proc; + + struct iterator + { + using iterator_category = std::forward_iterator_tag; + using value_type = typename Container::value_type; + using difference_type = long; + using pointer = value_type*; + using reference = value_type&; + + typename Container::iterator i; + typename Container::iterator e; + Lambda filter; + + auto operator*() const { return *i; } + + iterator& operator++() + { + ++i; + while (i != e && !filter(*i)) + ++i; + return *this; + } + + iterator& operator++(int) { return ++*this; } + + bool operator==(const iterator& rhs) const noexcept { return i == rhs.i; } + bool operator!=(const iterator& rhs) const noexcept { return !(*this == rhs); } + }; + + struct const_iterator + { + typename Container::const_iterator i; + typename Container::const_iterator e; + Lambda filter; + + auto operator*() const { return *i; } + + const_iterator& operator++() + { + ++i; + while (i != e && !filter(*i)) + ++i; + return *this; + } + + const_iterator& operator++(int) { return ++*this; } + + bool operator==(const const_iterator& rhs) const noexcept { return i == rhs.i; } + bool operator!=(const const_iterator& rhs) const noexcept { return !(*this == rhs); } + }; + + auto begin() const + { + if constexpr (std::is_const::value) + { + auto i = const_iterator { std::cbegin(container), std::cend(container), proc }; + while (i != end() && !proc(*i)) + ++i; + return i; + } + else + { + auto i = iterator { std::begin(container), std::end(container), proc }; + while (i != end() && !proc(*i)) + ++i; + return i; + } + } + + auto end() const + { + if constexpr (std::is_const::value) + return const_iterator { std::cend(container), std::cend(container), proc }; + else + return iterator { std::end(container), std::end(container), proc }; + } +}; + +} // namespace regex_dfa::util::detail diff --git 
a/src/regex_dfa/util/iterator.h b/src/regex_dfa/util/iterator.h new file mode 100644 index 0000000000..81c95838d9 --- /dev/null +++ b/src/regex_dfa/util/iterator.h @@ -0,0 +1,109 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa::util +{ + +template +inline auto reversed(Container&& c) +{ + if constexpr (std::is_reference::value) + return detail::reversed { std::forward(c) }; + else + return detail::reversed { std::forward(c) }; +} + +template +inline auto indexed(const Container& c) +{ + return typename std::add_const>::type { c }; +} + +template +inline auto indexed(Container& c) +{ + return detail::indexed { c }; +} + +template +inline auto translate(const Container& container, Lambda mapfn) +{ + using namespace std; + using T = decltype(mapfn(*begin(container))); + + vector out; + out.reserve(distance(begin(container), end(container))); + transform(begin(container), end(container), back_inserter(out), std::move(mapfn)); + + return out; +} + +template +inline std::string join(const Container& container, const std::string& separator = ", ") +{ + std::stringstream out; + + for (const auto&& [i, v]: indexed(container)) + if (i) + out << separator << v; + else + out << v; + + return out.str(); +} + +template +inline auto filter(std::initializer_list&& c, Lambda proc) +{ + return typename std::add_const, Lambda>>::type { c, proc }; +} + +template +inline auto filter(const Container& c, Lambda proc) +{ + return typename std::add_const>::type { c, proc }; +} + +template +inline auto filter(Container& c, Lambda proc) +{ + return detail::filter { c, proc }; +} + +/** + * Finds the last occurence of a given element satisfying @p test. + * + * @returns the iterator representing the last item satisfying @p test or @p end if none found. + */ +template +auto find_last(const Container& container, Test test) -> decltype(std::cbegin(container)) +{ + auto begin = std::cbegin(container); + auto end = std::cend(container); + + for (auto i = std::prev(end); i != begin; --i) + if (test(*i)) + return i; + + if (test(*begin)) + return begin; + else + return end; +} + +} // namespace regex_dfa::util diff --git a/src/regex_dfa/util/iterator_test.cpp b/src/regex_dfa/util/iterator_test.cpp new file mode 100644 index 0000000000..aa41e5a5dd --- /dev/null +++ b/src/regex_dfa/util/iterator_test.cpp @@ -0,0 +1,182 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
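The helpers in regex_dfa/util/iterator.h compose: filter() yields a lazily filtered view, translate() maps a container into a std::vector of the mapped type, join() stringifies with a separator (", " by default), and indexed() pairs each element with its index. A combined sketch, not part of the patch.

// Usage sketch combining filter(), translate(), join() and indexed() from regex_dfa::util.
#include <regex_dfa/util/iterator.h>

#include <iostream>
#include <string>
#include <vector>

int main()
{
    using namespace regex_dfa::util;

    const std::vector<int> numbers { 1, 2, 3, 4, 5 };

    // keep the odd values only
    std::vector<int> odds;
    for (int n: filter(numbers, [](int x) { return x % 2 != 0; }))
        odds.push_back(n);

    // stringify and join them: prints "1, 3, 5"
    std::cout << join(translate(odds, [](int n) { return std::to_string(n); })) << "\n";

    // enumerate with indices: prints "0: 1", "1: 2", ...
    for (const auto&& [index, value]: indexed(numbers))
        std::cout << index << ": " << value << "\n";
}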
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include + +#include +#include +#include +#include + +using namespace std; +using namespace regex_dfa::util; + +TEST_CASE("util_iterator_reversed.empty") +{ + const vector v; + auto x = reversed(v); + auto i = begin(x); + REQUIRE(i == end(x)); +} + +TEST_CASE("util_iterator_reversed.one") +{ + const vector v { 1 }; + auto x = reversed(v); + auto i = begin(x); + REQUIRE(1 == *i); + i++; + REQUIRE(i == end(x)); +} + +TEST_CASE("util_iterator_reversed.many") +{ + const vector v { 1, 2, 3 }; + auto x = reversed(v); + auto i = begin(x); + REQUIRE(3 == *i); + i++; + REQUIRE(2 == *i); + i++; + REQUIRE(1 == *i); + i++; + REQUIRE(i == end(x)); +} + +TEST_CASE("util_iterator_indexed.many_const") +{ + const vector v { 10, 20, 30 }; + const auto x = indexed(v); + static_assert(is_const::value); + auto i = begin(x); + + REQUIRE(0 == (*i).first); + REQUIRE(10 == (*i).second); + i++; + + REQUIRE(1 == (*i).first); + REQUIRE(20 == (*i).second); + i++; + + REQUIRE(2 == (*i).first); + REQUIRE(30 == (*i).second); + i++; + + REQUIRE(i == end(x)); +} + +TEST_CASE("util_iterator_indexed.many") +{ + vector v { "zero", "one", "two" }; + auto x = indexed(v); + auto i = begin(x); + + REQUIRE(0 == (*i).first); + REQUIRE("zero" == (*i).second); + i++; + + REQUIRE(1 == (*i).first); + REQUIRE("one" == (*i).second); + i++; + + REQUIRE(2 == (*i).first); + REQUIRE("two" == (*i).second); + i++; + + REQUIRE(i == end(x)); +} + +TEST_CASE("util_iterator_indexed.range_based_for_loop") +{ + INFO("const:"); + const vector v1 { 10, 20, 30 }; + for (const auto&& [index, value]: indexed(v1)) + INFO(fmt::format("index {}, value {}", index, value)); + + INFO("non-const:"); + vector v2 { 10, 20, 30 }; + for (const auto&& [index, value]: indexed(v2)) + INFO(fmt::format("index {}, value {}", index, value)); +} + +TEST_CASE("util_iterator_filter.for_range") +{ + const vector nums = { 1, 2, 3, 4 }; + vector odds; + for (const int i: filter(nums, [](int x) { return x % 2 != 0; })) + odds.push_back(i); + + REQUIRE(2 == odds.size()); + REQUIRE(1 == odds[0]); + CHECK(3 == odds[1]); +} + +TEST_CASE("util_iterator_filter.count_proc_invocations") +{ + static const array numbers = { 1, 2, 3, 4 }; + int count = 0; + auto counter = [&](int) { + ++count; + return true; + }; + const auto f = filter(numbers, counter); + for_each(begin(f), end(f), [](int) {}); + REQUIRE(4 == count); +} + +TEST_CASE("util_iterator_filter.for_range_initializer_list") +{ + static const array numbers = { 1, 2, 3, 4 }; + vector odds; + auto f_odd = [&](int x) { + INFO(fmt::format("f_odd: x={0}", x)); + return x % 2 != 0; + }; + for (const int i: filter(numbers, f_odd)) + odds.push_back(i); + + REQUIRE(2 == odds.size()); + CHECK(1 == odds[0]); + CHECK(3 == odds[1]); +} + +TEST_CASE("util_iterator_translate.vector") +{ + const vector in { 1, 2, 3, 4 }; + const vector out = translate(in, [](auto i) -> int { return int(i * 2); }); + + for (const auto&& [i, v]: indexed(out)) + INFO(fmt::format("out[{}] = {}", i, v)); + + REQUIRE(4 == out.size()); + + CHECK(2 == out[0]); + CHECK(4 == out[1]); + CHECK(6 == out[2]); + CHECK(8 == out[3]); +} + +TEST_CASE("util_iterator_translate.chain_translate_join") +{ + const vector in { 1, 2, 3, 4 }; + const string out { join(translate(in, [](int i) -> string { return to_string(i); }), ", ") }; + + REQUIRE("1, 2, 3, 4" == out); +} + +TEST_CASE("util_iterator.find_last") +{ + const vector v { 1, 2, 3, 4 }; + const auto i = find_last(v, [](int 
i) { return i % 2 != 0; }); // find last odd value -> 3 + + REQUIRE(i != end(v)); + REQUIRE(3 == *i); +} diff --git a/src/regex_dfa/util/literals.h b/src/regex_dfa/util/literals.h new file mode 100644 index 0000000000..9a1f9bc698 --- /dev/null +++ b/src/regex_dfa/util/literals.h @@ -0,0 +1,75 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include +#include +#include + +namespace regex_dfa::util::literals +{ + +/** + * Strips a multiline string's indentation prefix. + * + * Example: + * \code + * string s = R"(|line one + * |line two + * |line three + * )"_multiline; + * fmt::print(s); + * \endcode + * + * This prints three lines: @c "line one\nline two\nline three\n" + */ +inline std::string operator""_multiline(const char* text, size_t /*size*/) +{ + if (!*text) + return {}; + + enum class State + { + LineData, + SkipUntilPrefix, + }; + + constexpr char LF = '\n'; + State state = State::LineData; + std::stringstream sstr; + char sep = *text++; + + while (*text) + { + switch (state) + { + case State::LineData: + if (*text == LF) + { + state = State::SkipUntilPrefix; + sstr << *text++; + } + else + sstr << *text++; + break; + case State::SkipUntilPrefix: + if (*text == sep) + { + state = State::LineData; + text++; + } + else + text++; + break; + } + } + + return sstr.str(); +} + +} // namespace regex_dfa::util::literals diff --git a/src/vtbackend/CMakeLists.txt b/src/vtbackend/CMakeLists.txt index 4ad0ab1cf1..fc00e9b5f7 100644 --- a/src/vtbackend/CMakeLists.txt +++ b/src/vtbackend/CMakeLists.txt @@ -103,6 +103,8 @@ target_link_libraries(vtbackend PUBLIC fmt::fmt-header-only range-v3::range-v3 ${LIBUNICODE_LIBS} + regex_dfa + ctre::ctre vtparser vtpty ) diff --git a/src/vtbackend/Settings.h b/src/vtbackend/Settings.h index cfc77865f3..a907ca9c03 100644 --- a/src/vtbackend/Settings.h +++ b/src/vtbackend/Settings.h @@ -65,6 +65,8 @@ struct Settings bool highlightDoubleClickedWord = true; // TODO: ^^^ make also use of it. probably rename to how VScode has named it. + std::string urlPattern = R"((https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])"; + struct PrimaryScreen { bool allowReflowOnResize = true; diff --git a/src/vtbackend/TerminalState.cpp b/src/vtbackend/TerminalState.cpp index d4df1a1cc7..b850815e5c 100644 --- a/src/vtbackend/TerminalState.cpp +++ b/src/vtbackend/TerminalState.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace terminal { @@ -16,6 +18,7 @@ TerminalState::TerminalState(Terminal& terminal): te->discardImage(*image); } }, hyperlinks { HyperlinkCache { 1024 } }, + urlPattern { regex_dfa::RegExprParser {}.parse(settings.urlPattern) }, sequencer { terminal }, parser { std::ref(sequencer) }, viCommands { terminal }, diff --git a/src/vtbackend/TerminalState.h b/src/vtbackend/TerminalState.h index 70635e4b7b..7940465c25 100644 --- a/src/vtbackend/TerminalState.h +++ b/src/vtbackend/TerminalState.h @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -26,6 +28,7 @@ #include #include #include +#include #include #include @@ -193,6 +196,7 @@ struct TerminalState // Hyperlink related // HyperlinkStorage hyperlinks {}; + regex_dfa::RegExpr urlPattern; std::string windowTitle {}; std::stack savedWindowTitles {};
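For context on the vtbackend wiring above: the profile's urlPattern string is compiled once, at TerminalState construction, via regex_dfa::RegExprParser{}.parse(). The sketch below mirrors that call in isolation; the <regex_dfa/RegExprParser.h> header name and the exception-based error handling are assumptions, since only the parse() expression itself appears in this patch.

// Minimal sketch: parse the default urlPattern the way TerminalState does.
// The header path and the caught exception type are assumptions.
#include <regex_dfa/RegExprParser.h>

#include <exception>
#include <iostream>
#include <string>

int main()
{
    std::string const urlPattern =
        R"((https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])";

    try
    {
        auto const expr = regex_dfa::RegExprParser {}.parse(urlPattern);
        (void) expr; // TerminalState stores this result as TerminalState::urlPattern
        std::cout << "urlPattern parsed successfully.\n";
    }
    catch (std::exception const& e)
    {
        std::cerr << "Failed to parse urlPattern: " << e.what() << "\n";
    }
}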