From cbf1aad87a40db8d937e9524ebbaeff9e2ac6079 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Mon, 10 Apr 2023 01:24:54 +0200 Subject: [PATCH 1/5] WIP: first bits for URL detection Signed-off-by: Christian Parpart --- src/contour/Config.h | 2 ++ src/contour/TerminalSession.cpp | 1 + src/vtbackend/Settings.h | 2 ++ src/vtbackend/TerminalState.cpp | 3 +++ src/vtbackend/TerminalState.h | 2 ++ 5 files changed, 10 insertions(+) diff --git a/src/contour/Config.h b/src/contour/Config.h index 5c9fc948f6..619f82528a 100644 --- a/src/contour/Config.h +++ b/src/contour/Config.h @@ -180,6 +180,8 @@ struct TerminalProfile bool highlightDoubleClickedWord = true; terminal::StatusDisplayType initialStatusDisplayType = terminal::StatusDisplayType::None; + std::string urlPattern = R"((https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])"; + terminal::Opacity backgroundOpacity; // value between 0 (fully transparent) and 0xFF (fully visible). bool backgroundBlur; // On Windows 10, this will enable Acrylic Backdrop. diff --git a/src/contour/TerminalSession.cpp b/src/contour/TerminalSession.cpp index f7905c623c..a4e5df9581 100644 --- a/src/contour/TerminalSession.cpp +++ b/src/contour/TerminalSession.cpp @@ -122,6 +122,7 @@ namespace settings.primaryScreen.allowReflowOnResize = config.reflowOnResize; settings.highlightDoubleClickedWord = profile.highlightDoubleClickedWord; settings.highlightTimeout = profile.highlightTimeout; + settings.urlPattern = profile.urlPattern; return settings; } diff --git a/src/vtbackend/Settings.h b/src/vtbackend/Settings.h index cfc77865f3..a907ca9c03 100644 --- a/src/vtbackend/Settings.h +++ b/src/vtbackend/Settings.h @@ -65,6 +65,8 @@ struct Settings bool highlightDoubleClickedWord = true; // TODO: ^^^ make also use of it. probably rename to how VScode has named it. 
+ std::string urlPattern = R"((https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|])"; + struct PrimaryScreen { bool allowReflowOnResize = true; diff --git a/src/vtbackend/TerminalState.cpp b/src/vtbackend/TerminalState.cpp index d4df1a1cc7..7b1fb76a8d 100644 --- a/src/vtbackend/TerminalState.cpp +++ b/src/vtbackend/TerminalState.cpp @@ -16,6 +16,9 @@ TerminalState::TerminalState(Terminal& terminal): te->discardImage(*image); } }, hyperlinks { HyperlinkCache { 1024 } }, + urlPattern { settings.urlPattern, + std::regex_constants::ECMAScript | std::regex_constants::optimize + | std::regex_constants::icase }, sequencer { terminal }, parser { std::ref(sequencer) }, viCommands { terminal }, diff --git a/src/vtbackend/TerminalState.h b/src/vtbackend/TerminalState.h index 70635e4b7b..cc875dbcfa 100644 --- a/src/vtbackend/TerminalState.h +++ b/src/vtbackend/TerminalState.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -193,6 +194,7 @@ struct TerminalState // Hyperlink related // HyperlinkStorage hyperlinks {}; + std::regex urlPattern; std::string windowTitle {}; std::stack savedWindowTitles {}; From fb571c9f67ad1dfaf64ef267ef1f2921736eacef Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Mon, 10 Apr 2023 01:24:33 +0200 Subject: [PATCH 2/5] Adds regex_dfa library (repurposed from klex project) Signed-off-by: Christian Parpart --- .clang-format | 2 + src/CMakeLists.txt | 1 + src/regex_dfa/Alphabet.cpp | 56 +++ src/regex_dfa/Alphabet.h | 60 +++ src/regex_dfa/CMakeLists.txt | 41 ++ src/regex_dfa/CharStream.h | 67 +++ src/regex_dfa/Compiler.cpp | 191 +++++++++ src/regex_dfa/Compiler.h | 104 +++++ src/regex_dfa/DFA.cpp | 158 +++++++ src/regex_dfa/DFA.h | 170 ++++++++ src/regex_dfa/DFABuilder.cpp | 220 ++++++++++ src/regex_dfa/DFABuilder.h | 63 +++ src/regex_dfa/DFABuilder_test.cpp | 33 ++ src/regex_dfa/DFAMinimizer.cpp | 277 ++++++++++++ src/regex_dfa/DFAMinimizer.h | 65 +++ src/regex_dfa/DotVisitor.h | 29 ++ 
src/regex_dfa/DotWriter.cpp | 115 +++++ src/regex_dfa/DotWriter.h | 84 ++++ src/regex_dfa/DotWriter_test.cpp | 69 +++ src/regex_dfa/Lexable.h | 590 ++++++++++++++++++++++++++ src/regex_dfa/Lexer-inl.h | 343 +++++++++++++++ src/regex_dfa/Lexer.h | 284 +++++++++++++ src/regex_dfa/LexerDef.h | 87 ++++ src/regex_dfa/Lexer_test.cpp | 600 ++++++++++++++++++++++++++ src/regex_dfa/MultiDFA.cpp | 33 ++ src/regex_dfa/MultiDFA.h | 29 ++ src/regex_dfa/NFA.cpp | 375 ++++++++++++++++ src/regex_dfa/NFA.h | 222 ++++++++++ src/regex_dfa/NFABuilder.cpp | 124 ++++++ src/regex_dfa/NFABuilder.h | 55 +++ src/regex_dfa/NFA_test.cpp | 84 ++++ src/regex_dfa/RegExpr.cpp | 117 +++++ src/regex_dfa/RegExpr.h | 102 +++++ src/regex_dfa/RegExprParser.cpp | 481 +++++++++++++++++++++ src/regex_dfa/RegExprParser.h | 91 ++++ src/regex_dfa/RegExprParser_test.cpp | 299 +++++++++++++ src/regex_dfa/Report.cpp | 109 +++++ src/regex_dfa/Report.h | 223 ++++++++++ src/regex_dfa/Rule.h | 137 ++++++ src/regex_dfa/RuleParser.cpp | 379 +++++++++++++++++ src/regex_dfa/RuleParser.h | 187 ++++++++ src/regex_dfa/RuleParser_test.cpp | 247 +++++++++++ src/regex_dfa/SourceLocation.cpp | 27 ++ src/regex_dfa/SourceLocation.h | 40 ++ src/regex_dfa/State.cpp | 37 ++ src/regex_dfa/State.h | 53 +++ src/regex_dfa/State_test.cpp | 18 + src/regex_dfa/Symbols.cpp | 184 ++++++++ src/regex_dfa/Symbols.h | 206 +++++++++ src/regex_dfa/Symbols_test.cpp | 112 +++++ src/regex_dfa/TransitionMap-inl.h | 49 +++ src/regex_dfa/TransitionMap.h | 66 +++ src/regex_dfa/klex_test.cpp | 13 + src/regex_dfa/util/AnsiColor.h | 153 +++++++ src/regex_dfa/util/Flags.cpp | 578 +++++++++++++++++++++++++ src/regex_dfa/util/Flags.h | 171 ++++++++ src/regex_dfa/util/IntVector.h | 40 ++ src/regex_dfa/util/UnboxedRange.h | 94 +++++ src/regex_dfa/util/iterator-detail.h | 169 ++++++++ src/regex_dfa/util/iterator.h | 106 +++++ src/regex_dfa/util/iterator_test.cpp | 179 ++++++++ src/regex_dfa/util/literals.h | 73 ++++ src/regex_dfa/util/overloaded.h | 21 + 
src/regex_dfa/util/testing.cpp | 610 +++++++++++++++++++++++++++ src/regex_dfa/util/testing.h | 425 +++++++++++++++++++ 65 files changed, 10427 insertions(+) create mode 100644 src/regex_dfa/Alphabet.cpp create mode 100644 src/regex_dfa/Alphabet.h create mode 100644 src/regex_dfa/CMakeLists.txt create mode 100644 src/regex_dfa/CharStream.h create mode 100644 src/regex_dfa/Compiler.cpp create mode 100644 src/regex_dfa/Compiler.h create mode 100644 src/regex_dfa/DFA.cpp create mode 100644 src/regex_dfa/DFA.h create mode 100644 src/regex_dfa/DFABuilder.cpp create mode 100644 src/regex_dfa/DFABuilder.h create mode 100644 src/regex_dfa/DFABuilder_test.cpp create mode 100644 src/regex_dfa/DFAMinimizer.cpp create mode 100644 src/regex_dfa/DFAMinimizer.h create mode 100644 src/regex_dfa/DotVisitor.h create mode 100644 src/regex_dfa/DotWriter.cpp create mode 100644 src/regex_dfa/DotWriter.h create mode 100644 src/regex_dfa/DotWriter_test.cpp create mode 100644 src/regex_dfa/Lexable.h create mode 100644 src/regex_dfa/Lexer-inl.h create mode 100644 src/regex_dfa/Lexer.h create mode 100644 src/regex_dfa/LexerDef.h create mode 100644 src/regex_dfa/Lexer_test.cpp create mode 100644 src/regex_dfa/MultiDFA.cpp create mode 100644 src/regex_dfa/MultiDFA.h create mode 100644 src/regex_dfa/NFA.cpp create mode 100644 src/regex_dfa/NFA.h create mode 100644 src/regex_dfa/NFABuilder.cpp create mode 100644 src/regex_dfa/NFABuilder.h create mode 100644 src/regex_dfa/NFA_test.cpp create mode 100644 src/regex_dfa/RegExpr.cpp create mode 100644 src/regex_dfa/RegExpr.h create mode 100644 src/regex_dfa/RegExprParser.cpp create mode 100644 src/regex_dfa/RegExprParser.h create mode 100644 src/regex_dfa/RegExprParser_test.cpp create mode 100644 src/regex_dfa/Report.cpp create mode 100644 src/regex_dfa/Report.h create mode 100644 src/regex_dfa/Rule.h create mode 100644 src/regex_dfa/RuleParser.cpp create mode 100644 src/regex_dfa/RuleParser.h create mode 100644 src/regex_dfa/RuleParser_test.cpp 
create mode 100644 src/regex_dfa/SourceLocation.cpp create mode 100644 src/regex_dfa/SourceLocation.h create mode 100644 src/regex_dfa/State.cpp create mode 100644 src/regex_dfa/State.h create mode 100644 src/regex_dfa/State_test.cpp create mode 100644 src/regex_dfa/Symbols.cpp create mode 100644 src/regex_dfa/Symbols.h create mode 100644 src/regex_dfa/Symbols_test.cpp create mode 100644 src/regex_dfa/TransitionMap-inl.h create mode 100644 src/regex_dfa/TransitionMap.h create mode 100644 src/regex_dfa/klex_test.cpp create mode 100644 src/regex_dfa/util/AnsiColor.h create mode 100644 src/regex_dfa/util/Flags.cpp create mode 100644 src/regex_dfa/util/Flags.h create mode 100644 src/regex_dfa/util/IntVector.h create mode 100644 src/regex_dfa/util/UnboxedRange.h create mode 100644 src/regex_dfa/util/iterator-detail.h create mode 100644 src/regex_dfa/util/iterator.h create mode 100644 src/regex_dfa/util/iterator_test.cpp create mode 100644 src/regex_dfa/util/literals.h create mode 100644 src/regex_dfa/util/overloaded.h create mode 100644 src/regex_dfa/util/testing.cpp create mode 100644 src/regex_dfa/util/testing.h diff --git a/.clang-format b/.clang-format index bd35115950..9c0b3a8142 100644 --- a/.clang-format +++ b/.clang-format @@ -77,6 +77,8 @@ IncludeCategories: Priority: 3 - Regex: '^<(vtrasterizer)/' Priority: 4 + - Regex: '^<(regex_dfa)/' + Priority: 5 - Regex: '^<(text_shaper)/' Priority: 5 - Regex: '^<(crispy)/' diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 89c65121bc..f945b3715c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,7 @@ endif() include(PedanticCompiler) add_subdirectory(crispy) +add_subdirectory(regex_dfa) add_subdirectory(text_shaper) add_subdirectory(vtpty) add_subdirectory(vtparser) diff --git a/src/regex_dfa/Alphabet.cpp b/src/regex_dfa/Alphabet.cpp new file mode 100644 index 0000000000..704c59c7f6 --- /dev/null +++ b/src/regex_dfa/Alphabet.cpp @@ -0,0 +1,56 @@ +// This file is part of the "klex" project, 
http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +void Alphabet::insert(Symbol ch) +{ + if (alphabet_.find(ch) == alphabet_.end()) + { + DEBUG("Alphabet: insert '{:}'", prettySymbol(ch)); + alphabet_.insert(ch); + } +} + +string Alphabet::to_string() const +{ + stringstream sstr; + + sstr << '{'; + + for (Symbol c: alphabet_) + sstr << prettySymbol(c); + + sstr << '}'; + + return sstr.str(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/Alphabet.h b/src/regex_dfa/Alphabet.h new file mode 100644 index 0000000000..eb6e7bf6df --- /dev/null +++ b/src/regex_dfa/Alphabet.h @@ -0,0 +1,60 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include + +namespace regex_dfa +{ + +/** + * Represents the alphabet of a finite automaton or regular expression. 
+ */ +class Alphabet +{ + public: + using set_type = std::set; + using iterator = set_type::iterator; + + size_t size() const noexcept { return alphabet_.size(); } + + void insert(Symbol ch); + + std::string to_string() const; + + const iterator begin() const { return alphabet_.begin(); } + const iterator end() const { return alphabet_.end(); } + + private: + set_type alphabet_; +}; + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::Alphabet& v, FormatContext& ctx) + { + return format_to(ctx.out(), "{}", v.to_string()); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/CMakeLists.txt b/src/regex_dfa/CMakeLists.txt new file mode 100644 index 0000000000..3b4c594f96 --- /dev/null +++ b/src/regex_dfa/CMakeLists.txt @@ -0,0 +1,41 @@ +add_library(regex_dfa STATIC + Alphabet.cpp + Compiler.cpp + DFA.cpp + DFABuilder.cpp + DFAMinimizer.cpp + DotWriter.cpp + MultiDFA.cpp + NFA.cpp + NFABuilder.cpp + RegExpr.cpp + RegExprParser.cpp + RuleParser.cpp + State.cpp + Symbols.cpp + Report.cpp + SourceLocation.cpp +) + +target_include_directories(regex_dfa PUBLIC ${PROJECT_SOURCE_DIR}/src ${CMAKE_SOURCE_DIR}/src) +target_link_libraries(regex_dfa PUBLIC fmt::fmt-header-only) + +# ---------------------------------------------------------------------------- +if(TESTS) + add_executable(regex_dfa_test + regex_dfa_test.cpp + DFABuilder_test.cpp + DotWriter_test.cpp + Lexer_test.cpp + NFA_test.cpp + RegExprParser_test.cpp + RuleParser_test.cpp + State_test.cpp + Symbols_test.cpp + util/iterator_test.cpp + util/testing.cpp + ) + + target_link_libraries(regex_dfa_test PUBLIC regex_dfa) + target_link_libraries(regex_dfa_test PUBLIC fmt::fmt-header-only) +endif(TESTS) diff --git a/src/regex_dfa/CharStream.h b/src/regex_dfa/CharStream.h new file mode 100644 index 0000000000..79f087eec6 --- /dev/null +++ 
b/src/regex_dfa/CharStream.h @@ -0,0 +1,67 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +namespace regex_dfa +{ + +class CharStream +{ + public: + virtual ~CharStream() = default; + + [[nodiscard]] virtual bool isEof() const noexcept = 0; + virtual char get() = 0; + virtual void rollback(int count) = 0; + virtual void rewind() = 0; +}; + +class StringStream: public CharStream +{ + public: + explicit StringStream(std::string&& s): source_ { std::move(s) } {} + + [[nodiscard]] bool isEof() const noexcept override { return pos_ >= source_.size(); } + char get() override { return source_[pos_++]; } + void rollback(int count) override { pos_ -= count; } + void rewind() override { pos_ = 0; } + + private: + std::string source_; + size_t pos_ = 0; +}; + +class StandardStream: public CharStream +{ + public: + explicit StandardStream(std::istream* source); + + [[nodiscard]] bool isEof() const noexcept override { return !source_->good(); } + char get() override { return static_cast(source_->get()); } + + void rollback(int count) override + { + source_->clear(); + source_->seekg(-count, std::ios::cur); + } + + void rewind() override + { + source_->clear(); + source_->seekg(initialOffset_, std::ios::beg); + } + + private: + std::istream* source_; + std::streamoff initialOffset_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/Compiler.cpp b/src/regex_dfa/Compiler.cpp new file mode 100644 index 0000000000..42e7dca814 --- /dev/null +++ b/src/regex_dfa/Compiler.cpp @@ -0,0 +1,191 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the 
"License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace std; + +namespace regex_dfa +{ + +void Compiler::parse(string text) +{ + parse(make_unique(move(text))); +} + +void Compiler::parse(unique_ptr stream) +{ + declareAll(RuleParser { move(stream) }.parseRules()); +} + +void Compiler::declareAll(RuleList rules) +{ + rules_.reserve(rules_.size() + rules.size()); + + // populate RegExpr + for (Rule& rule: rules) + rule.regexpr = make_unique(RegExprParser {}.parse(rule.pattern, rule.line, rule.column)); + + containsBeginOfLine_ = any_of(rules.begin(), rules.end(), ruleContainsBeginOfLine); + + if (containsBeginOfLine_) + { + // We have at least one BOL-rule. + for (Rule& rule: rules) + { + if (!regex_dfa::containsBeginOfLine(*rule.regexpr)) + { + NFA nfa = NFABuilder {}.construct(*rule.regexpr, rule.tag); + for (const string& condition: rule.conditions) + { + NFA& fa = fa_[condition]; + if (fa.empty()) + fa = nfa.clone(); + else + fa.alternate(nfa.clone()); + } + declare(rule); + } + declare(rule, "_0"); // BOL + } + } + else + { + // No BOL-rules present, just declare them then. + for (Rule& rule: rules) + declare(rule); + } + + for (Rule& rule: rules) + { + if (auto i = names_.find(rule.tag); i != names_.end() && i->first != rule.tag) + // Can actually only happen on "ignore" attributed rule count > 1. 
+ names_[rule.tag] = fmt::format("{}, {}", i->second, rule.name); + else + names_[rule.tag] = rule.name; + + rules_.emplace_back(move(rule)); + } +} + +size_t Compiler::size() const +{ + size_t result = 0; + for (const pair& fa: fa_) + result += fa.second.size(); + return result; +} + +void Compiler::declare(const Rule& rule, const string& conditionSuffix) +{ + NFA nfa = NFABuilder {}.construct(*rule.regexpr, rule.tag); + + for (const string& condition: rule.conditions) + { + NFA& fa = fa_[condition + conditionSuffix]; + + if (fa.empty()) + fa = nfa.clone(); + else + fa.alternate(nfa.clone()); + } +} + +// const map& Compiler::automata() const { +// return fa_; +// } + +MultiDFA Compiler::compileMultiDFA(OvershadowMap* overshadows) +{ + map dfaMap; + for (const auto& fa: fa_) + dfaMap[fa.first] = DFABuilder { fa.second.clone() }.construct(overshadows); + + return constructMultiDFA(move(dfaMap)); +} + +DFA Compiler::compileDFA(OvershadowMap* overshadows) +{ + assert((!containsBeginOfLine_ && fa_.size() == 1) || (containsBeginOfLine_ && fa_.size() == 2)); + return DFABuilder { fa_.begin()->second.clone() }.construct(overshadows); +} + +DFA Compiler::compileMinimalDFA() +{ + return DFAMinimizer { compileDFA() }.constructDFA(); +} + +LexerDef Compiler::compile() +{ + return generateTables(compileMinimalDFA(), containsBeginOfLine_, move(names_)); +} + +LexerDef Compiler::compileMulti(OvershadowMap* overshadows) +{ + MultiDFA multiDFA = compileMultiDFA(overshadows); + multiDFA = DFAMinimizer { multiDFA }.constructMultiDFA(); + return generateTables(multiDFA, containsBeginOfLine_, names()); +} + +LexerDef Compiler::generateTables(const DFA& dfa, bool requiresBeginOfLine, const map& names) +{ + const Alphabet alphabet = dfa.alphabet(); + TransitionMap transitionMap; + + for (StateId state = 0, sE = dfa.lastState(); state <= sE; ++state) + for (Symbol c: alphabet) + if (optional nextState = dfa.delta(state, c); nextState.has_value()) + transitionMap.define(state, c, 
nextState.value()); + + map acceptStates; + for (StateId s: dfa.acceptStates()) + acceptStates.emplace(s, *dfa.acceptTag(s)); + + // TODO: many initial states ! + return LexerDef { { { "INITIAL", dfa.initialState() } }, + requiresBeginOfLine, + move(transitionMap), + move(acceptStates), + dfa.backtracking(), + move(names) }; +} + +LexerDef Compiler::generateTables(const MultiDFA& multiDFA, + bool requiresBeginOfLine, + const map& names) +{ + const Alphabet alphabet = multiDFA.dfa.alphabet(); + TransitionMap transitionMap; + + for (StateId state = 0, sE = multiDFA.dfa.lastState(); state <= sE; ++state) + for (const Symbol c: alphabet) + if (optional nextState = multiDFA.dfa.delta(state, c); nextState.has_value()) + transitionMap.define(state, c, nextState.value()); + + map acceptStates; + for (StateId s: multiDFA.dfa.acceptStates()) + acceptStates.emplace(s, *multiDFA.dfa.acceptTag(s)); + + // TODO: many initial states ! + return LexerDef { multiDFA.initialStates, requiresBeginOfLine, move(transitionMap), + move(acceptStates), multiDFA.dfa.backtracking(), move(names) }; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/Compiler.h b/src/regex_dfa/Compiler.h new file mode 100644 index 0000000000..88d2160b81 --- /dev/null +++ b/src/regex_dfa/Compiler.h @@ -0,0 +1,104 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +struct MultiDFA; + +/** + * Top-Level API for compiling lexical patterns into table definitions for Lexer. 
+ * + * @see Lexer + */ +class Compiler +{ + public: + using TagNameMap = std::map; + using OvershadowMap = DFABuilder::OvershadowMap; + using AutomataMap = std::map; + + Compiler(): rules_ {}, containsBeginOfLine_ { false }, fa_ {}, names_ {} {} + + /** + * Parses a @p stream of textual rule definitions to construct their internal data structures. + */ + void parse(std::unique_ptr stream); + void parse(std::string text); + + /** + * Parses a list of @p rules to construct their internal data structures. + */ + void declareAll(RuleList rules); + + const RuleList& rules() const noexcept { return rules_; } + const TagNameMap& names() const noexcept { return names_; } + size_t size() const; + + /** + * Compiles all previously parsed rules into a DFA. + */ + DFA compileDFA(OvershadowMap* overshadows = nullptr); + MultiDFA compileMultiDFA(OvershadowMap* overshadows = nullptr); + + /** + * Compiles all previously parsed rules into a minimal DFA. + */ + DFA compileMinimalDFA(); + + /** + * Compiles all previously parsed rules into a suitable data structure for Lexer. + * + * @see Lexer + */ + LexerDef compile(); + + /** + * Compiles all previously parsed rules into a suitable data structure for Lexer, taking care of + * multiple conditions as well as begin-of-line. + */ + LexerDef compileMulti(OvershadowMap* overshadows = nullptr); + + /** + * Translates the given DFA @p dfa with a given TagNameMap @p names into trivial table mappings. + * + * @see Lexer + */ + static LexerDef generateTables(const DFA& dfa, bool requiresBeginOfLine, const TagNameMap& names); + static LexerDef generateTables(const MultiDFA& dfa, bool requiresBeginOfLine, const TagNameMap& names); + + const std::map& automata() const { return fa_; } + + bool containsBeginOfLine() const noexcept { return containsBeginOfLine_; } + + private: + /** + * Parses a single @p rule to construct its internal data structures.
+ */ + void declare(const Rule& rule, const std::string& conditionSuffix = ""); + + private: + RuleList rules_; + bool containsBeginOfLine_; + AutomataMap fa_; + TagNameMap names_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFA.cpp b/src/regex_dfa/DFA.cpp new file mode 100644 index 0000000000..e0ee1f12d1 --- /dev/null +++ b/src/regex_dfa/DFA.cpp @@ -0,0 +1,158 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include +#include +#include +#include +#include + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +using namespace std; + +namespace regex_dfa +{ + +Alphabet DFA::alphabet() const +{ + Alphabet alphabet; + for (const State& state: states_) + for (const pair& t: state.transitions) + alphabet.insert(t.first); + + return alphabet; +} + +vector DFA::acceptStates() const +{ + vector states; + states.reserve(acceptTags_.size()); + for_each(begin(acceptTags_), end(acceptTags_), [&](const pair& s) { + states.push_back(s.first); + }); + return states; +} + +// -------------------------------------------------------------------------- + +void DFA::createStates(size_t count) +{ + states_.resize(states_.size() + count); +} + +void DFA::setInitialState(StateId s) +{ + // TODO: assert (s is having no predecessors) + initialState_ = s; +} + +void DFA::setTransition(StateId from, Symbol symbol, StateId to) +{ + // if (auto i = states_[from].transitions.find(symbol); i != states_[from].transitions.end()) + // fmt::print("overwriting transition! 
{} --({})--> {} (new: {})\n", from, prettySymbol(symbol), + // i->second, to); + + // XXX assert(s.transitions.find(symbol) == s.transitions.end()); + states_[from].transitions[symbol] = to; +} + +void DFA::removeTransition(StateId from, Symbol symbol) +{ + State& s = states_[from]; + if (auto i = s.transitions.find(symbol); i != s.transitions.end()) + s.transitions.erase(i); +} + +StateId DFA::append(DFA&& other, StateId q0) +{ + assert(other.initialState() == 0); + + other.prepareStateIds(states_.size(), q0); + + states_.reserve(size() + other.size() - 1); + states_[q0] = other.states_[0]; + states_.insert(states_.end(), next(other.states_.begin()), other.states_.end()); + backtrackStates_.insert(other.backtrackStates_.begin(), other.backtrackStates_.end()); + acceptTags_.insert(other.acceptTags_.begin(), other.acceptTags_.end()); + + return other.initialState(); +} + +void DFA::prepareStateIds(StateId baseId, StateId q0) +{ + // adjust transition state IDs + // traverse through each state's transition set + // traverse through each transition in the transition set + // traverse through each element and add BASE_ID + + auto transformId = [baseId, q0, this](StateId s) -> StateId { + // we subtract 1, because we already have a slot for q0 elsewhere (pre-allocated) + return s != initialState_ ? 
baseId + s - 1 : q0; + }; + + // for each state's transitions + for (State& state: states_) + for (pair& t: state.transitions) + t.second = transformId(t.second); + + AcceptMap remapped; + for (auto& a: acceptTags_) + remapped[transformId(a.first)] = a.second; + acceptTags_ = move(remapped); + + BacktrackingMap backtracking; + for (const auto& bt: backtrackStates_) + backtracking[transformId(bt.first)] = transformId(bt.second); + backtrackStates_ = move(backtracking); + + initialState_ = q0; +} + +void DFA::visit(DotVisitor& v) const +{ + v.start(initialState_); + + // STATE: initial + v.visitNode(initialState_, true, isAccepting(initialState_)); + + // STATE: accepting + for (StateId s: acceptStates()) + if (s != initialState_) + v.visitNode(s, false, true); + + // STATE: any other (all ids in [0, size()), so the highest-numbered state is not skipped) + for (StateId s = 0, sE = size(); s != sE; ++s) + if (s != initialState_ && !isAccepting(s)) + v.visitNode(s, false, false); + + // TRANSITIONS + for (StateId s = 0, sE = size(); s != sE; ++s) + { + const TransitionMap& T = states_[s].transitions; + for_each(T.begin(), T.end(), [&](const auto& t) { v.visitEdge(s, t.second, t.first); }); + for_each(T.begin(), T.end(), [&](const auto& t) { v.endVisitEdge(s, t.second); }); + } + v.end(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFA.h b/src/regex_dfa/DFA.h new file mode 100644 index 0000000000..ceb82c4018 --- /dev/null +++ b/src/regex_dfa/DFA.h @@ -0,0 +1,170 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace regex_dfa +{ + +class NFA; +class DFABuilder; +class DotVisitor; + +/** + * Represents a deterministic finite automaton.
+ */ +class DFA +{ + public: + using TransitionMap = std::map; + struct State + { + // std::vector states; + TransitionMap transitions; + }; + using StateVec = std::vector; + + //! Maps an accept state ID to another (prior) state ID that the input stream is rolled back + //! to. + using BacktrackingMap = std::map; + + DFA(const DFA& other) = delete; + DFA& operator=(const DFA& other) = delete; + DFA(DFA&&) = default; + DFA& operator=(DFA&&) = default; + ~DFA() = default; + + DFA(): states_ {}, initialState_ { 0 }, backtrackStates_ {}, acceptTags_ {} {} + + [[nodiscard]] bool empty() const noexcept { return states_.empty(); } + [[nodiscard]] size_t size() const noexcept { return states_.size(); } + + [[nodiscard]] StateId lastState() const noexcept + { + assert(!empty()); + return states_.size() - 1; + } + + //! Retrieves the alphabet of this finite automaton. + Alphabet alphabet() const; + + //! Retrieves the initial state. + StateId initialState() const { return initialState_; } + + //! Retrieves the list of available states. + const StateVec& states() const { return states_; } + StateVec& states() { return states_; } + + StateIdVec stateIds() const + { + StateIdVec v; + v.reserve(states_.size()); + for (size_t i = 0, e = states_.size(); i != e; ++i) + v.push_back(i); // funny, I know + return v; + } + + //! Retrieves the list of accepting states. + std::vector acceptStates() const; + + /** + * Traverses all states and edges in this DFA and calls @p visitor for each state & edge. + * + * Use this function to e.g. get a GraphViz dot-file drawn.
+ */ + void visit(DotVisitor& visitor) const; + + void createStates(size_t count); + + void setInitialState(StateId state); + + const TransitionMap& stateTransitions(StateId id) const + { + return states_[static_cast(id)].transitions; + } + + // {{{ backtracking (for lookahead) + void setBacktrack(StateId from, StateId to) { backtrackStates_[from] = to; } + + std::optional backtrack(StateId acceptState) const + { + if (auto i = backtrackStates_.find(acceptState); i != backtrackStates_.end()) + return i->second; + + return std::nullopt; + } + + const BacktrackingMap& backtracking() const noexcept { return backtrackStates_; } + // }}} + + //! Flags given state as accepting-state with given Tag @p acceptTag. + void setAccept(StateId state, Tag acceptTag) { acceptTags_[state] = acceptTag; } + + bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } + + std::optional acceptTag(StateId s) const + { + if (auto i = acceptTags_.find(s); i != acceptTags_.end()) + return i->second; + + return std::nullopt; + } + + std::optional delta(StateId state, Symbol symbol) const + { + const auto& T = states_[state].transitions; + if (auto i = T.find(symbol); i != T.end()) + return i->second; + + return std::nullopt; + } + + void setTransition(StateId from, Symbol symbol, StateId to); + void removeTransition(StateId from, Symbol symbol); + + StateIdVec nonAcceptStates() const + { + StateIdVec result; + result.reserve( + std::abs(static_cast(states_.size()) - static_cast(acceptTags_.size()))); + + for (StateId s = 0, sE = size(); s != sE; ++s) + if (!isAccepting(s)) + result.push_back(s); + + return result; + } + + bool isAcceptor(Tag t) const + { + for (const std::pair& p: acceptTags_) + if (p.second == t) + return true; + + return false; + } + + StateId append(DFA&& other, StateId q0); + + private: + void prepareStateIds(StateId baseId, StateId q0); + + private: + StateVec states_; + StateId initialState_; + BacktrackingMap backtrackStates_; + AcceptMap 
acceptTags_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFABuilder.cpp b/src/regex_dfa/DFABuilder.cpp new file mode 100644 index 0000000000..1c2ef725cf --- /dev/null +++ b/src/regex_dfa/DFABuilder.cpp @@ -0,0 +1,220 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +struct DFABuilder::TransitionTable +{ // {{{ + void insert(StateId q, Symbol c, StateId t); + unordered_map> transitions; +}; + +inline void DFABuilder::TransitionTable::insert(StateId q, Symbol c, StateId t) +{ + transitions[q][c] = t; +} +// }}} + +/* DFA construction visualization + REGEX: a(b|c)* + + NFA: n0 --(a)--> n1 --> n2 -----------------------------------> "n7" + \ ^ + \---> n3 <------------------------ / + \ \ \ / + \ \----> n4 --(b)--> n5 --> n6 + \ ^ + \----> n8 --(c)--> n9 ---/ + + DFA: + <--- + d0 --(a)--> "d1" ----(b)--> "d2"--(b) + \ |^ + \ (c)||(b) + \ v| + \--(c)--> "d3"--(c) + <--- + + + TABLE: + + set | DFA | NFA | + name | state | state | 'a' | 'b' | 'c' + -------------------------------------------------------------------------------------------------------- + q0 | d0 | {n0} | {n1,n2,n3,n4,n7,n8} | -none- | -none- + q1 | d1 | {n1,n2,n3,n4,n7,n8} | -none- | {n3,n4,n5,n6,n7,n8} | {n3,n4,n6,n7,n8,n9} + q2 | d2 | {n3,n4,n5,n6,n7,n8} | -none- | q2 | q3 + q3 | d3 | {n3,n4,n6,n7,n8,n9} | -none- | q2 | q3 +*/ + +DFA DFABuilder::construct(OvershadowMap* overshadows) +{ 
+ const StateIdVec q_0 = nfa_.epsilonClosure({ nfa_.initialStateId() }); + vector Q = { q_0 }; // resulting states + deque workList = { q_0 }; + TransitionTable T; + + const Alphabet alphabet = nfa_.alphabet(); + + StateIdVec eclosure; + StateIdVec delta; + while (!workList.empty()) + { + const StateIdVec q = + move(workList.front()); // each set q represents a valid configuration from the NFA + workList.pop_front(); + const StateId q_i = *configurationNumber(Q, q); + + for (Symbol c: alphabet) + { + nfa_.epsilonClosure(*nfa_.delta(q, c, &delta), &eclosure); + if (!eclosure.empty()) + { + if (optional t_i = configurationNumber(Q, eclosure); t_i.has_value()) + T.insert(q_i, c, *t_i); // T[q][c] = eclosure; + else + { + Q.emplace_back(eclosure); + t_i = StateId { Q.size() - 1 }; // equal to configurationNumber(Q, eclosure); + T.insert(q_i, c, *t_i); // T[q][c] = eclosure; + workList.emplace_back(move(eclosure)); + } + eclosure.clear(); + } + delta.clear(); + } + } + + // Q now contains all the valid configurations and T all transitions between them + return constructDFA(Q, T, overshadows); +} + +DFA DFABuilder::constructDFA(const vector& Q, + const TransitionTable& T, + OvershadowMap* overshadows) const +{ + DFA dfa; + dfa.createStates(Q.size()); + + // build remaps table (used as cache for quickly finding DFA StateIds from NFA StateIds) + unordered_map remaps; + for_each(begin(Q), end(Q), [q_i = StateId { 0 }, &remaps](StateIdVec const& q) mutable { + for_each(begin(q), end(q), [&](StateId s) { remaps[s] = q_i; }); + q_i++; + }); + + // map q_i to d_i and flag accepting states + map overshadowing; + StateId q_i = 0; + for (const StateIdVec& q: Q) + { + // d_i represents the corresponding state in the DFA for all states of q from the NFA + const StateId d_i = q_i; + // cerr << fmt::format("map q{} to d{} for {} states, {}.\n", q_i, d_i->id(), q.size(), + // to_string(q, "d")); + + // if q contains an accepting state, then d is an accepting state in the DFA + if 
(nfa_.isAnyAccepting(q)) + { + optional tag = determineTag(q, &overshadowing); + assert(tag.has_value() && "DFA accept state merged from input states with different tags."); + // DEBUG("determineTag: q{} tag {} from {}.", q_i, *tag, q); + dfa.setAccept(d_i, *tag); + } + + if (optional bt = nfa_.containsBacktrackState(q); bt.has_value()) + { + // TODO: verify: must not contain more than one backtracking mapping + assert(dfa.isAccepting(d_i)); + dfa.setBacktrack(d_i, remaps[*bt]); + } + + q_i++; + } + + // observe mapping from q_i to d_i + for (auto const& [q_i, branch]: T.transitions) + for (auto const [c, t_i]: branch) + dfa.setTransition(q_i, c, t_i); + + // q_0 becomes d_0 (initial state) + dfa.setInitialState(0); + + if (overshadows) + { + // check if tag is an acceptor in NFA but not in DFA, hence, it was overshadowed by another rule + for (const pair a: nfa_.acceptMap()) + { + const Tag tag = a.second; + if (!dfa.isAcceptor(tag)) + if (auto i = overshadowing.find(tag); i != overshadowing.end()) + overshadows->emplace_back(tag, i->second); + } + } + + return dfa; +} + +optional DFABuilder::configurationNumber(const vector& Q, const StateIdVec& t) +{ + if (auto i = find(begin(Q), end(Q), t); i != end(Q)) + return distance(begin(Q), i); + else + return nullopt; +} + +optional DFABuilder::determineTag(const StateIdVec& qn, map* overshadows) const +{ + deque tags; + + for (StateId s: qn) + if (optional t = nfa_.acceptTag(s); t.has_value()) + tags.push_back(*t); + + if (tags.empty()) + return nullopt; + + sort(begin(tags), end(tags)); + + optional lowestTag = tags.front(); + tags.erase(begin(tags)); + + for (Tag tag: tags) + (*overshadows)[tag] = *lowestTag; // {tag} is overshadowed by {lowestTag} + + return lowestTag; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFABuilder.h b/src/regex_dfa/DFABuilder.h new file mode 100644 index 0000000000..6f3eb6138e --- /dev/null +++ b/src/regex_dfa/DFABuilder.h @@ -0,0 +1,63 @@ +// This file is part of the "klex" 
project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include +#include +#include + +namespace regex_dfa +{ + +class DFA; +class State; + +class DFABuilder +{ + public: + //! Map of rules that shows which rule is overshadowed by which other rule; each entry is (overshadowed tag, overshadowing tag). + using OvershadowMap = std::vector>; + + explicit DFABuilder(NFA&& nfa): nfa_ { std::move(nfa) } {} + + /** + * Constructs a DFA out of the NFA. + * + * @param overshadows if not nullptr, it will be used to store semantic information about + * which rule tags have been overshadowed by which. + */ + DFA construct(OvershadowMap* overshadows = nullptr); + + private: + struct TransitionTable; + + DFA constructDFA(const std::vector& Q, + const TransitionTable& T, + OvershadowMap* overshadows) const; + + /** + * Finds @p t in @p Q and returns its offset (aka configuration number) or std::nullopt if not found. + */ + static std::optional configurationNumber(const std::vector& Q, const StateIdVec& t); + + /** + * Determines the tag to use for the deterministic state representing @p q from the non-deterministic FA + * nfa_.
+ * + * @param q the set of states that reflect a single state in the DFA equal to the input FA + * + * @returns the determined tag or std::nullopt if none + */ + std::optional determineTag(const StateIdVec& q, std::map* overshadows) const; + + private: + const NFA nfa_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFABuilder_test.cpp b/src/regex_dfa/DFABuilder_test.cpp new file mode 100644 index 0000000000..5ff7a6a9a1 --- /dev/null +++ b/src/regex_dfa/DFABuilder_test.cpp @@ -0,0 +1,33 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include +#include + +#include +#include + +#include + +using namespace regex_dfa; + +TEST(regex_DFABuilder, shadowing) +{ + Compiler cc; + cc.parse(std::make_unique(R"( + Identifier ::= [a-z][a-z0-9]* + TrueLiteral ::= "true" + )")); + // rule 2 is overshadowed by rule 1 + Compiler::OvershadowMap overshadows; + DFA dfa = cc.compileDFA(&overshadows); + ASSERT_EQ(1, overshadows.size()); + EXPECT_EQ(2, overshadows[0].first); // overshadowee + EXPECT_EQ(1, overshadows[0].second); // overshadower +} diff --git a/src/regex_dfa/DFAMinimizer.cpp b/src/regex_dfa/DFAMinimizer.cpp new file mode 100644 index 0000000000..f2c37af674 --- /dev/null +++ b/src/regex_dfa/DFAMinimizer.cpp @@ -0,0 +1,277 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +DFAMinimizer::DFAMinimizer(const DFA& dfa): + dfa_ { dfa }, + initialStates_ { { "INITIAL", dfa.initialState() } }, + alphabet_ { dfa_.alphabet() }, + targetStateIdMap_ {} +{ +} + +DFAMinimizer::DFAMinimizer(const MultiDFA& multiDFA): + dfa_ { multiDFA.dfa }, + initialStates_ { multiDFA.initialStates }, + alphabet_ { dfa_.alphabet() }, + targetStateIdMap_ {} +{ +} + +/** + * Tests whether or not StateId @p s is an initial state in any of the DFAs of the MultiDFA. + */ +bool DFAMinimizer::isMultiInitialState(StateId s) const +{ + return any_of(initialStates_.begin(), initialStates_.end(), [s](const auto& p) { return p.second == s; }); +} + +/** + * Tests whether any s in S is the initial state in the DFA that is to be minimized. 
+ */ +bool DFAMinimizer::containsInitialState(const StateIdVec& S) const +{ + return any_of(S.begin(), S.end(), [this](StateId s) { return s == dfa_.initialState(); }); +} + +DFAMinimizer::PartitionVec::iterator DFAMinimizer::findGroup(StateId s) +{ + return find_if(begin(T), end(T), [&](StateIdVec& group) { + return dfa_.acceptTag(group.front()) == dfa_.acceptTag(s); + }); +} + +int DFAMinimizer::partitionId(StateId s) const +{ + auto i = + find_if(P.begin(), P.end(), [s](const auto& p) { return find(p.begin(), p.end(), s) != p.end(); }); + assert(i != P.end() && "State ID must be present in any of the partition sets."); + return static_cast(distance(P.begin(), i)); +} + +DFAMinimizer::PartitionVec DFAMinimizer::split(const StateIdVec& S) const +{ + for (Symbol c: alphabet_) + { + // if c splits S into s_1 and s_2 + // that is, phi(s_1, c) and phi(s_2, c) reside in two different p_i's (partitions) + // then return {s_1, s_2} + + map t_i; + for (StateId s: S) + { + if (const optional t = dfa_.delta(s, c); t.has_value()) + t_i[partitionId(*t)].push_back(s); + else + t_i[-1].push_back(s); // -1 collects the states that have no transition on c + } + if (t_i.size() > 1) + { + DEBUG("split: {} on character '{}' into {} sets", to_string(S), (char) c, t_i.size()); + PartitionVec result; + for (auto& t: t_i) // non-const reference, so that move() below really moves instead of copying + { + DEBUG(" partition {}: {}", t.first, t.second); // log before t.second is moved from + result.emplace_back(move(t.second)); + } + return result; + } + + assert(t_i.size() == 1); + + // t_i's only element thus is a reconstruction of S. 
+ assert(t_i.begin()->second == S); + + for (StateId s: S) // NOTE(review): result and main are discarded after every iteration, so this loop has no effect + { + PartitionVec result; + StateIdVec main; + + if (isMultiInitialState(s)) + result.emplace_back(StateIdVec { s }); + else + main.emplace_back(s); + + if (!main.empty()) + result.emplace_back(move(main)); + } + } + + DEBUG("split: no split needed for {}", to_string(S)); + return { S }; +} + +void DFAMinimizer::dumpGroups(const PartitionVec& T) +{ + DEBUG("dumping groups ({})", T.size()); + int groupNr = 0; + for (const auto& t: T) + { + stringstream sstr; + sstr << "{"; + for (size_t i = 0, e = t.size(); i != e; ++i) + { + if (i) + sstr << ", "; + sstr << "n" << t[i]; + } + sstr << "}"; + DEBUG("group {}: {}", groupNr, sstr.str()); + groupNr++; + } +} + +DFA DFAMinimizer::constructDFA() +{ + constructPartitions(); + return constructFromPartitions(P); +} + +MultiDFA DFAMinimizer::constructMultiDFA() +{ + constructPartitions(); + DFA dfamin = constructFromPartitions(P); + + // patch initialStates and the master-initial-state's transition symbol + MultiDFA::InitialStateMap initialStates; + for (const auto& p: initialStates_) // bind to the map's own element type; no per-iteration temporary + dfamin.removeTransition(dfamin.initialState(), static_cast(p.second)); + + for (const auto& p: initialStates_) + { + const StateId t = targetStateId(p.second); + initialStates[p.first] = t; + dfamin.setTransition(dfamin.initialState(), static_cast(t), t); + } + + return MultiDFA { move(initialStates), move(dfamin) }; +} + +void DFAMinimizer::constructPartitions() +{ + // group all accept states by their tag + for (StateId s: dfa_.acceptStates()) + { + if (auto group = findGroup(s); group != T.end()) + group->push_back(s); + else + T.push_back({ s }); + } + + // add another group for all non-accept states (skipped when empty: split() asserts on an empty set) + if (auto nonAccepting = dfa_.nonAcceptStates(); !nonAccepting.empty()) + T.emplace_back(move(nonAccepting)); + + dumpGroups(T); + + while (P != T) + { + swap(P, T); + T.clear(); + + for (StateIdVec& p: P) + T.splice(T.end(), split(p)); + } + + // build up cache to quickly get target state ID from input DFA's state ID + 
targetStateIdMap_ = [&]() { + unordered_map remaps; + StateId p_i = 0; + for (const StateIdVec& p: P) + { + for (StateId s: p) + remaps[s] = p_i; + + p_i++; + } + return remaps; + }(); +} + +DFA DFAMinimizer::constructFromPartitions(const PartitionVec& P) const +{ + DEBUG("minimization terminated with {} unique partition sets", P.size()); + + // instantiate states + DFA dfamin; + dfamin.createStates(P.size()); + StateId p_i = 0; + for (const StateIdVec& p: P) + { + const StateId s = *p.begin(); + const StateId q = p_i; + DEBUG("Creating p{}: {} {}", + p_i, + dfa_.isAccepting(s) ? "accepting" : "rejecting", + containsInitialState(p) ? "initial" : ""); + if (optional tag = dfa_.acceptTag(s); tag.has_value()) + dfamin.setAccept(q, *tag); + + if (containsInitialState(p)) + dfamin.setInitialState(q); + + if (optional bt = containsBacktrackState(p); bt.has_value()) + dfamin.setBacktrack(p_i, targetStateId(*bt)); + + p_i++; + } + + // setup transitions + p_i = 0; + for (const StateIdVec& p: P) + { + const StateId s = *p.begin(); + for (const auto& transition: dfa_.stateTransitions(s)) + { + const StateId t_i = targetStateId(transition.second); // cached lookup of the target partition; avoids the linear search in partitionId() + DEBUG("map p{} --({})--> p{}", p_i, prettySymbol(transition.first), t_i); + dfamin.setTransition(p_i, transition.first, t_i); + } + p_i++; + } + + return dfamin; +} + +optional DFAMinimizer::containsBacktrackState(const StateIdVec& Q) const +{ + for (StateId q: Q) + if (optional t = dfa_.backtrack(q); t.has_value()) + return *t; + + return nullopt; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DFAMinimizer.h b/src/regex_dfa/DFAMinimizer.h new file mode 100644 index 0000000000..40647044c3 --- /dev/null +++ b/src/regex_dfa/DFAMinimizer.h @@ -0,0 +1,65 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class DFA; + +class DFAMinimizer +{ + public: + explicit DFAMinimizer(const DFA& dfa); // keeps a reference to dfa (see dfa_); dfa must outlive this object + explicit DFAMinimizer(const MultiDFA& multiDFA); // references multiDFA.dfa and copies multiDFA.initialStates + + DFA constructDFA(); // runs constructPartitions() and builds the DFA from the resulting partitions + MultiDFA constructMultiDFA(); // same as constructDFA(), then re-maps the initial states to their partitions + + private: + using PartitionVec = std::list; + + void constructPartitions(); + StateIdVec nonAcceptStates() const; // NOTE(review): no definition in DFAMinimizer.cpp; constructPartitions() calls dfa_.nonAcceptStates() + bool containsInitialState(const StateIdVec& S) const; + bool isMultiInitialState(StateId s) const; + PartitionVec::iterator findGroup(StateId s); + int partitionId(StateId s) const; // index of the set in P that contains s (linear search) + PartitionVec split(const StateIdVec& S) const; + DFA constructFromPartitions(const PartitionVec& P) const; + std::optional containsBacktrackState(const StateIdVec& Q) const; + + static void dumpGroups(const PartitionVec& T); + + StateId targetStateId(StateId oldId) const + { + auto i = targetStateIdMap_.find(oldId); + assert(i != targetStateIdMap_.end()); + return i->second; + } + + private: + const DFA& dfa_; // the input DFA; not owned + const MultiDFA::InitialStateMap initialStates_; + const Alphabet alphabet_; + PartitionVec T; // partition set currently being built by constructPartitions() + PartitionVec P; // partition set of the previous refinement round; equals T once refinement has finished + std::unordered_map targetStateIdMap_; // input DFA state id -> index of its set in P; filled by constructPartitions() +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotVisitor.h b/src/regex_dfa/DotVisitor.h new file mode 100644 index 0000000000..6eb4a62cc1 --- /dev/null +++ b/src/regex_dfa/DotVisitor.h @@ -0,0 +1,29 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +#include + +namespace regex_dfa +{ + +class DotVisitor // callback interface; implemented by DotWriter +{ + public: + virtual ~DotVisitor() = default; + + virtual void start(StateId initialState) = 0; + virtual void visitNode(StateId number, bool start, bool accept) = 0; + virtual void visitEdge(StateId from, StateId to, Symbol s) = 0; + virtual void endVisitEdge(StateId from, StateId to) = 0; + virtual void end() = 0; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotWriter.cpp b/src/regex_dfa/DotWriter.cpp new file mode 100644 index 0000000000..c5883a93ea --- /dev/null +++ b/src/regex_dfa/DotWriter.cpp @@ -0,0 +1,115 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include + +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +template +static string escapeString(const StringType& str) +{ + string escaped; // appended to directly; plain concatenation needs no stream + for (char ch: str) + { + // \t\n\r is already converted to escape sequence + switch (ch) + { + case '\\': escaped += "\\\\"; break; + case '"': escaped += "\\\""; break; + default: escaped += ch; break; + } + } + return escaped; +} + +void DotWriter::start(StateId initialState) +{ + initialState_ = initialState; + stream_ << "digraph {\n"; + stream_ << " rankdir=LR;\n"; + // stream_ << " label=\"" << escapeString("FA" /*TODO*/) << "\";\n"; +} + +void DotWriter::visitNode(StateId number, bool start, bool accept) +{ + if (start) + { + const string_view shape = accept ? 
"doublecircle" : "circle"; + stream_ << " \"\" [shape=plaintext];\n"; + stream_ << " node [shape=" << shape << ",color=red];\n"; + stream_ << " \"\" -> " << stateLabelPrefix_ << number << ";\n"; + stream_ << " node [color=black];\n"; + } + else if (accept) + { + stream_ << " node [shape=doublecircle]; " << stateLabelPrefix_ << number << ";\n"; + stream_ << " node [shape=circle,color=black];\n"; + } + else + { + // stream_ << stateLabelPrefix_ << number << ";\n"; + } +} + +void DotWriter::visitEdge(StateId from, StateId to, Symbol s) +{ + transitionGroups_[to].push_back(s); +} + +void DotWriter::endVisitEdge(StateId from, StateId to) +{ + auto& tgroup = transitionGroups_[to]; + if (!tgroup.empty()) + { + if (from == initialState_ && initialStates_ != nullptr) + { + for (Symbol s: tgroup) + { + const string label = [this, s]() { + for (const auto& p: *initialStates_) + if (p.second == static_cast(s)) + return fmt::format("<{}>", p.first); + return prettySymbol(s); + }(); + stream_ << fmt::format(" {}{} -> {}{} [label=\"{}\"];\n", + stateLabelPrefix_, + from, + stateLabelPrefix_, + to, + escapeString(label)); + } + } + else + { + string label = groupCharacterClassRanges(move(tgroup)); + stream_ << fmt::format(" {}{} -> {}{} [label=\"{}\"];\n", + stateLabelPrefix_, + from, + stateLabelPrefix_, + to, + escapeString(label)); + } + tgroup.clear(); + } +} + +void DotWriter::end() +{ + stream_ << "}\n"; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotWriter.h b/src/regex_dfa/DotWriter.h new file mode 100644 index 0000000000..66fa177ec5 --- /dev/null +++ b/src/regex_dfa/DotWriter.h @@ -0,0 +1,84 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class DotWriter: public DotVisitor +{ + public: + DotWriter(std::ostream& os, std::string stateLabelPrefix): // writes to os; the stream is not owned + ownedStream_ {}, + stream_ { os }, + stateLabelPrefix_ { std::move(stateLabelPrefix) }, + transitionGroups_ {}, + initialStates_ { nullptr }, + initialState_ { 0 } + { + } + + DotWriter(const std::string& filename, std::string stateLabelPrefix): // creates and owns the output stream for filename + ownedStream_ { std::make_unique(filename) }, + stream_ { *ownedStream_ }, + stateLabelPrefix_ { std::move(stateLabelPrefix) }, + transitionGroups_ {}, + initialStates_ { nullptr }, + initialState_ { 0 } + { + } + + DotWriter(std::ostream& os, std::string stateLabelPrefix, const MultiDFA::InitialStateMap& initialStates): + ownedStream_ {}, + stream_ { os }, + stateLabelPrefix_ { std::move(stateLabelPrefix) }, + transitionGroups_ {}, + initialStates_ { &initialStates }, + initialState_ { 0 } + { + } + + DotWriter(const std::string& filename, + std::string stateLabelPrefix, + const MultiDFA::InitialStateMap& initialStates): + ownedStream_ { std::make_unique(filename) }, + stream_ { *ownedStream_ }, + stateLabelPrefix_ { std::move(stateLabelPrefix) }, + transitionGroups_ {}, + initialStates_ { &initialStates }, + initialState_ { 0 } + { + } + + public: + void start(StateId initialState) override; + void visitNode(StateId number, bool start, bool accept) override; + void visitEdge(StateId from, StateId to, Symbol s) override; + void endVisitEdge(StateId from, StateId to) override; + void end() override; + + private: + std::unique_ptr ownedStream_; // set only by the filename constructors + std::ostream& stream_; // refers to *ownedStream_ or to the caller's stream + std::string stateLabelPrefix_; + std::map /*transition symbols*/> transitionGroups_; + const MultiDFA::InitialStateMap* initialStates_; // not owned; points to the caller's map, which must outlive this writer (may be nullptr) + StateId initialState_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/DotWriter_test.cpp 
b/src/regex_dfa/DotWriter_test.cpp new file mode 100644 index 0000000000..78b24eff74 --- /dev/null +++ b/src/regex_dfa/DotWriter_test.cpp @@ -0,0 +1,69 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include + +using namespace std; +using namespace regex_dfa; + +TEST(regex_DotWriter, simple) +{ + stringstream sstr; + DotWriter dw(sstr, "n"); + + dw.start(0); + dw.visitNode(0, true, true); + dw.visitEdge(0, 1, 'a'); + dw.endVisitEdge(0, 1); + + dw.visitNode(1, false, true); + dw.visitEdge(1, 1, 'b'); + dw.visitEdge(1, 1, '\r'); + dw.visitEdge(1, 1, '\n'); + dw.visitEdge(1, 1, '\t'); + dw.visitEdge(1, 1, ' '); + dw.endVisitEdge(1, 1); + dw.end(); + + log(sstr.str()); + ASSERT_TRUE(!sstr.str().empty()); + // just make sure it processes +} + +TEST(regex_DotWriter, multidfa_simple) +{ + stringstream sstr; + const MultiDFA::InitialStateMap mis { { "foo", 1 }, { "bar", 2 } }; + DotWriter dw(sstr, "n", mis); + + dw.start(0); + dw.visitNode(0, true, false); + dw.visitNode(1, false, true); + dw.visitNode(2, false, true); + + dw.visitEdge(0, 1, 0x01); + dw.endVisitEdge(0, 1); + + dw.visitEdge(0, 2, 0x02); + dw.endVisitEdge(0, 2); + + dw.visitEdge(1, 1, 'a'); + dw.endVisitEdge(1, 1); + + dw.visitEdge(2, 2, 'a'); + dw.endVisitEdge(2, 2); + + dw.end(); + + log(sstr.str()); + ASSERT_TRUE(!sstr.str().empty()); + // just make sure it processes +} diff --git a/src/regex_dfa/Lexable.h b/src/regex_dfa/Lexable.h new file mode 100644 index 0000000000..f9988522df --- /dev/null +++ b/src/regex_dfa/Lexable.h @@ -0,0 +1,590 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the 
"License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include // TokenInfo: TODO: remove that header/API (inline TokenInfo here then) +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +//! Runtime exception that is getting thrown when a word could not be recognized. +struct LexerError: public std::runtime_error +{ + explicit LexerError(unsigned int _offset): + std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", _offset) }, + offset { _offset } + { + } + + unsigned int offset; +}; + +template +class LexerIterator +{ + public: + using TokenInfo = regex_dfa::TokenInfo; + using TraceFn = std::function; + + using difference_type = long; + using value_type = TokenInfo; + using pointer = TokenInfo*; + using reference = TokenInfo&; + using iterator_category = std::forward_iterator_tag; + + enum class Eof + { + EofMark + }; + + /** + * Initializes a LexerIterator that purely marks the end of a lexically analyzed stream. + */ + explicit LexerIterator(Eof); + + /** + * Initializes a LexerIterator for a given source to be analyzed with given lexer definition . + */ + LexerIterator(const LexerDef& ld, std::istream& source, TraceFn trace = TraceFn {}); + + /** + * Retrieves the default DFA machine that is used to recognize words. + */ + Machine defaultMachine() const noexcept; + + /** + * Sets the active deterministic finite automaton to use for recognizing words. + * + * @param machine the DFA machine to use for recognizing words. + * @return the previous Machine state. 
+ */ + Machine setMachine(Machine machine); + + const TokenInfo& operator*() const noexcept { return currentToken_; } + auto offset() const noexcept { return currentToken_.offset; } + auto literal() const noexcept -> const std::string& { return currentToken_.literal; } + auto token() const noexcept { return currentToken_.token; } + auto name() const noexcept { return name(token()); } + + bool operator==(const LexerIterator& rhs) const noexcept; + bool operator!=(const LexerIterator& rhs) const noexcept; + + LexerIterator& operator++(); + LexerIterator& operator++(int); + + private: + void recognize(); + Token recognizeOne(); + + // --------------------------------------------------------------------------------- + // state helpers + + static constexpr StateId BadState = std::numeric_limits::max(); + + StateId getInitialState() const noexcept; + bool isAcceptState(StateId state) const; + + /** + * Retrieves the next state for given input state and input symbol. + * + * @param currentState the current State the DFA is in to. + * @param inputSymbol the input symbol that is used for transitioning from current state to the next + * state. + * @returns the next state to transition to. + */ + StateId delta(StateId currentState, Symbol inputSymbol) const; + + // --------------------------------------------------------------------------------- + // stream helpers + + int currentChar() const noexcept { return currentChar_; } + bool eof() const noexcept { return !source_->good(); } + Symbol nextChar(); + void rollback(); + + // --------------------------------------------------------------------------------- + // debugging helpers + + template + void tracef(const char* msg, Args&&... 
args) const; + + const std::string& name(Token t) const; + + std::string toString(const std::deque& stack); + Token token(StateId s) const; + static std::string stateName(StateId s); + + private: + const LexerDef* def_ = nullptr; + const TraceFn trace_; + std::istream* source_ = nullptr; + int eof_ = 0; // 0=No, 1=EOF_INIT, 2=EOF_FINAL + + TokenInfo currentToken_; + Machine initialStateId_ = def_ ? defaultMachine() : Machine {}; + unsigned offset_ = 0; + bool isBeginOfLine_ = true; + int currentChar_ = -1; + std::vector buffered_; +}; + +template +inline Token token(const LexerIterator& it) +{ + return it.token(); +} + +template +inline size_t offset(const LexerIterator& it) +{ + return it.offset(); +} + +template +inline const std::string& literal(const LexerIterator& it) +{ + return it.literal(); +} + +/** + * @brief Holds a lexically analyzable stream of characters with a Lexer definition. + */ +template +class Lexable +{ + public: + using TraceFn = std::function; + using iterator = LexerIterator; + using value_type = TokenInfo; + + Lexable(const LexerDef& ld, std::istream& src, TraceFn trace = TraceFn {}): + def_ { ld }, source_ { &src }, initialOffset_ { source_->tellg() }, trace_ { std::move(trace) } + { + if constexpr (!RequiresBeginOfLine) + if (def_.containsBeginOfLineStates) + throw std::invalid_argument { + "LexerDef contains a grammar that requires begin-of-line handling, but this Lexer has " + "begin-of-line support disabled." 
+ }; + } + + Lexable(const LexerDef& ld, const std::string& src, TraceFn trace = TraceFn {}): + Lexable { ld, std::make_unique(src), std::move(trace) } + { + } + + Lexable(const LexerDef& ld, std::unique_ptr&& src, TraceFn trace = TraceFn {}): + Lexable(ld, *src, std::move(trace)) + { + ownedSource_ = std::move(src); + } + + auto begin() const + { + source_->clear(); + source_->seekg(initialOffset_, std::ios::beg); + return iterator { def_, *source_, trace_ }; + } + + auto end() const { return iterator { iterator::Eof::EofMark }; } + + private: + const LexerDef& def_; + std::unique_ptr ownedSource_; + std::istream* source_; + std::streamoff initialOffset_; + TraceFn trace_; +}; + +template +inline auto begin(const Lexable& ls) +{ + return ls.begin(); +} + +template +inline auto end(const Lexable& ls) +{ + return ls.end(); +} + +// {{{ LexerIterator: impl +template +LexerIterator::LexerIterator(Eof): eof_ { 2 } +{ +} + +template +LexerIterator::LexerIterator(const LexerDef& ld, + std::istream& source, + TraceFn trace): + def_ { &ld }, trace_ { trace }, source_ { &source } +{ + recognize(); +} + +template +Machine LexerIterator::defaultMachine() const noexcept +{ + auto i = def_->initialStates.find("INITIAL"); + assert(i != def_->initialStates.end()); + return static_cast(i->second); +} + +template +Machine LexerIterator::setMachine(Machine machine) +{ + return initialStateId_ = static_cast(machine); +} + +template +bool LexerIterator::operator==( + const LexerIterator& rhs) const noexcept +{ + return offset_ == rhs.offset_ || (eof_ == 2 && rhs.eof_ == 2); +} + +template +bool LexerIterator::operator!=( + const LexerIterator& rhs) const noexcept +{ + return !(*this == rhs); +} + +template +LexerIterator& LexerIterator::operator++() +{ + if (eof()) + eof_++; + + recognize(); + return *this; +} + +template +LexerIterator& LexerIterator::operator++(int) +{ + if (eof()) + eof_++; + + recognize(); + return *this; +} + +template +inline void LexerIterator::recognize() +{ + 
for (;;) + if (Token tag = recognizeOne(); static_cast(tag) != IgnoreTag) + return; +} + +template +inline Token LexerIterator::recognizeOne() +{ + // init + currentToken_.offset = offset_; + currentToken_.literal.clear(); + + StateId state = getInitialState(); + std::deque stack; + stack.push_back(BadState); + + if constexpr (Trace) + tracef("recognize: startState {}, offset {} {}", + stateName(state), + offset_, + isBeginOfLine_ ? "BOL" : "no-BOL"); + + // advance + while (state != ErrorState) + { + Symbol ch = nextChar(); // one of: input character, ERROR or EOF + currentToken_.literal.push_back(ch); + + // we do not stack.clear() stack if isAcceptState(state) as we need this information iff + // lookahead is required. Otherwise we could clear here (for space savings) + + stack.push_back(state); + state = delta(state, ch); + } + + // backtrack to last (right-most) accept state + while (state != BadState && !isAcceptState(state)) + { + if constexpr (Trace) + tracef("recognize: backtrack: current state {} {}; stack: {}", + stateName(state), + isAcceptState(state) ? 
"accepting" : "non-accepting", + toString(stack)); + + state = stack.back(); + stack.pop_back(); + if (!currentToken_.literal.empty()) + { + rollback(); + currentToken_.literal.resize(currentToken_.literal.size() - 1); + } + } + + // backtrack to right-most non-lookahead position in input stream + if (auto i = def_->backtrackingStates.find(state); i != def_->backtrackingStates.end()) + { + const StateId tmp = state; + const StateId backtrackState = i->second; + if constexpr (Trace) + tracef("recognize: backtracking from {} to {}; stack: {}", + stateName(state), + stateName(backtrackState), + toString(stack)); + while (!stack.empty() && state != backtrackState) + { + state = stack.back(); + stack.pop_back(); + if constexpr (Trace) + tracef("recognize: backtrack: state {}", stateName(state)); + if (!currentToken_.literal.empty()) + { + rollback(); + currentToken_.literal.resize(currentToken_.literal.size() - 1); + } + } + state = tmp; + } + + if constexpr (Trace) + tracef("recognize: final state {} {} {} {}-{} {} [currentChar: {}]", + stateName(state), + isAcceptState(state) ? "accepting" : "non-accepting", + isAcceptState(state) ? 
name(token(state)) : std::string(),
+               currentToken_.offset,
+               offset_,
+               quotedString(currentToken_.literal),
+               quoted(currentChar_));
+
+    if (!isAcceptState(state))
+        throw LexerError { offset_ };
+
+    auto i = def_->acceptStates.find(state);
+    assert(i != def_->acceptStates.end() && "Accept state hit, but no tag assigned.");
+
+    // Only a consumed character can change the begin-of-line condition. Calling back() on an
+    // empty literal (zero-width match) would be undefined, so in that case the previous
+    // begin-of-line state is kept as is.
+    if (!currentToken_.literal.empty())
+        isBeginOfLine_ = currentToken_.literal.back() == '\n';
+
+    return currentToken_.token = static_cast(i->second);
+}
+
+// Selects the state to start recognition from: the configured initial state, or the state
+// directly following it when the lexer is at the beginning of a line and the definition
+// contains begin-of-line states.
+template
+inline StateId LexerIterator::getInitialState() const noexcept
+{
+    if constexpr (RequiresBeginOfLine)
+        if (isBeginOfLine_ && def_->containsBeginOfLineStates)
+            return static_cast(initialStateId_) + 1;
+
+    return static_cast(initialStateId_);
+}
+
+// Tests whether the given state has an entry in the definition's accept-state table.
+template
+inline bool LexerIterator::isAcceptState(StateId id) const
+{
+    return def_->acceptStates.find(id) != def_->acceptStates.end();
+}
+
+// Applies the transition table to (currentState, inputSymbol) and returns the resulting state;
+// the transition is traced when tracing is compiled in.
+template
+StateId LexerIterator::delta(StateId currentState,
+                             Symbol inputSymbol) const
+{
+    const StateId nextState = def_->transitions.apply(currentState, inputSymbol);
+    if constexpr (Trace)
+    {
+        if (isAcceptState(nextState))
+            tracef("recognize: state {:>4} --{:-^7}--> {:<6} (accepting: {})",
+                   stateName(currentState),
+                   prettySymbol(inputSymbol),
+                   stateName(nextState),
+                   name(token(nextState)));
+        else
+            tracef("recognize: state {:>4} --{:-^7}--> {:<6}",
+                   stateName(currentState),
+                   prettySymbol(inputSymbol),
+                   stateName(nextState));
+    }
+
+    return nextState;
+}
+
+// Fetches the next input symbol: rolled-back symbols are replayed first (last in, first out),
+// then the source stream is read. Yields Symbols::EndOfFile once the stream is exhausted or bad.
+template
+inline Symbol LexerIterator::nextChar()
+{
+    if (!buffered_.empty())
+    {
+        int ch = buffered_.back();
+        currentChar_ = ch;
+        buffered_.resize(buffered_.size() - 1);
+        if constexpr (Trace)
+            tracef("Lexer:{}: advance '{}'", offset_, prettySymbol(ch));
+        offset_++;
+        return ch;
+    }
+
+    if (!source_->good())
+    { // EOF or I/O error
+        // Keep currentChar_ in sync with the symbol handed out, as every other path does;
+        // otherwise the final trace in recognizeOne() reports a stale character.
+        currentChar_ = Symbols::EndOfFile;
+        if constexpr (Trace)
+            tracef("Lexer:{}: advance '{}'", offset_, "EOF");
+        return Symbols::EndOfFile;
+    }
+
+    int ch = source_->get();
+    if (ch < 0)
+    {
+        // NOTE(review): offset_ is advanced here although rollback() does not rewind it for
+        // the EOF symbol -- confirm that this asymmetry is intended.
+        currentChar_ = Symbols::EndOfFile;
+        offset_++;
+        if constexpr
(Trace) + tracef("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + return currentChar_; + } + + currentChar_ = ch; + if constexpr (Trace) + tracef("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + offset_++; + return ch; +} + +template +inline void LexerIterator::rollback() +{ + currentChar_ = currentToken_.literal.back(); + if (currentToken_.literal.back() != -1) + { + offset_--; + buffered_.push_back(currentToken_.literal.back()); + } +} + +// ================================================================================= + +template +template +inline void LexerIterator::tracef(const char* msg, + Args&&... args) const +{ + if constexpr (Trace) + if (trace_) + trace_(fmt::format(msg, std::forward(args)...)); +} + +template +inline const std::string& LexerIterator::name(Token t) const +{ + auto i = def_->tagNames.find(static_cast(t)); + assert(i != def_->tagNames.end()); + return i->second; +} + +template +inline std::string LexerIterator::toString( + const std::deque& stack) +{ + std::stringstream sstr; + sstr << "{"; + int i = 0; + for (const auto s: stack) + { + if (i) + sstr << ","; + sstr << stateName(s); + i++; + } + + sstr << "}"; + return sstr.str(); +} + +template +Token LexerIterator::token(StateId s) const +{ + auto i = def_->acceptStates.find(s); + assert(i != def_->acceptStates.end()); + return static_cast(i->second); +} + +template +inline std::string LexerIterator::stateName(StateId s) +{ + switch (s) + { + case BadState: return "Bad"; + case ErrorState: return "Error"; + default: return fmt::format("n{}", std::to_string(s)); + } +} +// }}} + +} // namespace regex_dfa + +namespace std +{ +template +struct iterator_traits> +{ + using iterator = regex_dfa::LexerIterator; + + using difference_type = typename iterator::difference_type; + using value_type = typename iterator::value_type; + using pointer = typename iterator::pointer; + using reference = typename iterator::reference; + using iterator_category = typename 
iterator::iterator_category; +}; +} // namespace std + +namespace fmt +{ +template +struct formatter> +{ + using TokenInfo = regex_dfa::TokenInfo; + using LexerIterator = regex_dfa::LexerIterator; + + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const LexerIterator& v, FormatContext& ctx) + { + return format_to(ctx.out(), "{} ({})", v.literal(), v.name()); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/Lexer-inl.h b/src/regex_dfa/Lexer-inl.h new file mode 100644 index 0000000000..fbc1521c99 --- /dev/null +++ b/src/regex_dfa/Lexer-inl.h @@ -0,0 +1,343 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +static inline std::string quoted(char ch) +{ + if (ch < 0) + return "<>"; + if (ch == '\n') + return "\\n"; + if (ch == ' ') + return "\\s"; + return fmt::format("{}", ch); +} + +static inline std::string quotedString(const std::string& s) +{ + std::stringstream sstr; + sstr << std::quoted(s); + return sstr.str(); +} + +template +inline Lexer::Lexer(const LexerDef& info, DebugLogger logger): + def_ { info }, + debug_ { logger }, + initialStateId_ { defaultMachine() }, + word_ {}, + ownedStream_ {}, + stream_ { nullptr }, + oldOffset_ { 0 }, + offset_ { 0 }, + fileSize_ { 0 }, + isBeginOfLine_ { true }, + token_ { 0 } +{ + if constexpr (!RequiresBeginOfLine) + if (def_.containsBeginOfLineStates) + throw std::invalid_argument { + "LexerDef contains a grammar that requires begin-of-line handling, but this Lexer has " + "begin-of-line support disabled." 
+ }; +} + +template +inline Lexer::Lexer(const LexerDef& info, + std::unique_ptr stream, + DebugLogger logger): + Lexer { info, std::move(logger) } +{ + reset(std::move(stream)); +} + +template +inline Lexer::Lexer(const LexerDef& info, + std::istream& stream, + DebugLogger logger): + Lexer { info, std::move(logger) } +{ + stream_ = &stream; + fileSize_ = getFileSize(); +} + +template +inline Lexer::Lexer(const LexerDef& info, + std::string input, + DebugLogger logger): + Lexer { info, std::move(logger) } +{ + reset(std::make_unique(std::move(input))); +} + +template +inline void Lexer::reset(std::unique_ptr stream) +{ + ownedStream_ = std::move(stream); + stream_ = ownedStream_.get(); + oldOffset_ = 0; + offset_ = 0; + isBeginOfLine_ = true; + fileSize_ = getFileSize(); +} + +template +inline void Lexer::reset(const std::string& text) +{ + reset(std::make_unique(text)); +} + +template +inline size_t Lexer::getFileSize() +{ + std::streamoff oldpos = stream_->tellg(); + stream_->seekg(0, stream_->end); + + std::streamoff theSize = stream_->tellg(); + stream_->seekg(oldpos, stream_->beg); + + return static_cast(theSize); +} + +template +inline std::string Lexer::stateName(StateId s, + const std::string_view& n) +{ + switch (s) + { + case BadState: return "Bad"; + case ErrorState: return "Error"; + default: return fmt::format("{}{}", n, std::to_string(s)); + } +} + +template +inline std::string Lexer::toString( + const std::deque& stack) +{ + std::stringstream sstr; + sstr << "{"; + int i = 0; + for (const auto s: stack) + { + if (i) + sstr << ","; + sstr << stateName(s); + i++; + } + + sstr << "}"; + return sstr.str(); +} + +template +inline auto Lexer::recognize() -> TokenInfo +{ + for (;;) + if (Token tag = recognizeOne(); static_cast(tag) != IgnoreTag) + return TokenInfo { tag, word_, oldOffset_ }; +} + +template +inline StateId Lexer::getInitialState() const noexcept +{ + if constexpr (RequiresBeginOfLine) + { + if (isBeginOfLine_ && 
def_.containsBeginOfLineStates) + { + return static_cast(initialStateId_) + 1; + } + } + + return static_cast(initialStateId_); +} + +template +inline Token Lexer::recognizeOne() +{ + // init + oldOffset_ = offset_; + word_.clear(); + StateId state = getInitialState(); + std::deque stack; + stack.push_back(BadState); + + if constexpr (Debug) + debugf("recognize: startState {}, offset {} {}", + stateName(state), + offset_, + isBeginOfLine_ ? "BOL" : "no-BOL"); + + // advance + while (state != ErrorState) + { + Symbol ch = nextChar(); // one of: input character, ERROR or EOF + word_.push_back(ch); + + // we do not stack.clear() stack if isAcceptState(state) as we need this information iff + // lookahead is required. Otherwise we could clear here (for space savings) + + stack.push_back(state); + state = delta(state, ch); + } + + // backtrack to last (right-most) accept state + while (state != BadState && !isAcceptState(state)) + { + if constexpr (Debug) + debugf("recognize: backtrack: current state {} {}; stack: {}", + stateName(state), + isAcceptState(state) ? 
"accepting" : "non-accepting", + toString(stack)); + + state = stack.back(); + stack.pop_back(); + if (!word_.empty()) + { + rollback(); + word_.resize(word_.size() - 1); + } + } + + // backtrack to right-most non-lookahead position in input stream + if (auto i = def_.backtrackingStates.find(state); i != def_.backtrackingStates.end()) + { + const StateId tmp = state; + const StateId backtrackState = i->second; + if constexpr (Debug) + debugf("recognize: backtracking from {} to {}; stack: {}", + stateName(state), + stateName(backtrackState), + toString(stack)); + while (!stack.empty() && state != backtrackState) + { + state = stack.back(); + stack.pop_back(); + if constexpr (Debug) + debugf("recognize: backtrack: state {}", stateName(state)); + if (!word_.empty()) + { + rollback(); + word_.resize(word_.size() - 1); + } + } + state = tmp; + } + + if constexpr (Debug) + debugf("recognize: final state {} {} {} {}-{} {} [currentChar: {}]", + stateName(state), + isAcceptState(state) ? "accepting" : "non-accepting", + isAcceptState(state) ? 
name(token(state)) : std::string(),
+               oldOffset_,
+               offset_,
+               quotedString(word_),
+               quoted(currentChar_));
+
+    if (!isAcceptState(state))
+        throw LexerError { offset_ };
+
+    auto i = def_.acceptStates.find(state);
+    assert(i != def_.acceptStates.end() && "Accept state hit, but no tag assigned.");
+
+    // Only a consumed character can change the begin-of-line condition. Calling back() on an
+    // empty word (zero-width match) would be undefined, so in that case the previous
+    // begin-of-line state is kept as is.
+    if (!word_.empty())
+        isBeginOfLine_ = word_.back() == '\n';
+
+    return token_ = static_cast(i->second);
+}
+
+// Applies the transition table to (currentState, inputSymbol) and returns the resulting state;
+// the transition is logged when debugging is compiled in.
+template
+inline StateId Lexer::delta(StateId currentState,
+                            Symbol inputSymbol) const
+{
+    const StateId nextState = def_.transitions.apply(currentState, inputSymbol);
+    if constexpr (Debug)
+    {
+        if (isAcceptState(nextState))
+        {
+            debugf("recognize: state {:>4} --{:-^7}--> {:<6} (accepting: {})",
+                   stateName(currentState),
+                   prettySymbol(inputSymbol),
+                   stateName(nextState),
+                   name(token(nextState)));
+        }
+        else
+        {
+            debugf("recognize: state {:>4} --{:-^7}--> {:<6}",
+                   stateName(currentState),
+                   prettySymbol(inputSymbol),
+                   stateName(nextState));
+        }
+    }
+
+    return nextState;
+}
+
+// Tests whether the given state has an entry in the definition's accept-state table.
+template
+inline bool Lexer::isAcceptState(StateId id) const
+{
+    return def_.acceptStates.find(id) != def_.acceptStates.end();
+}
+
+// Fetches the next input symbol: rolled-back symbols are replayed first (last in, first out),
+// then the stream is read. Yields Symbols::EndOfFile once the stream is exhausted or bad.
+template
+inline Symbol Lexer::nextChar()
+{
+    if (!buffered_.empty())
+    {
+        int ch = buffered_.back();
+        currentChar_ = ch;
+        buffered_.pop_back();
+        if constexpr (Debug)
+            debugf("Lexer:{}: advance '{}'", offset_, prettySymbol(ch));
+        offset_++;
+        return ch;
+    }
+
+    if (!stream_->good())
+    { // EOF or I/O error
+        // Keep currentChar_ in sync with the symbol handed out, as every other path does;
+        // it is otherwise left stale (or never written at all when the stream starts out bad).
+        currentChar_ = Symbols::EndOfFile;
+        if constexpr (Debug)
+            debugf("Lexer:{}: advance '{}'", offset_, "EOF");
+        return Symbols::EndOfFile;
+    }
+
+    int ch = stream_->get();
+    if (ch < 0)
+    {
+        // NOTE(review): offset_ is advanced here although rollback() does not rewind it for
+        // the EOF symbol -- confirm that this asymmetry is intended.
+        currentChar_ = Symbols::EndOfFile;
+        offset_++;
+        if constexpr (Debug)
+            debugf("Lexer:{}: advance '{}'", offset_, prettySymbol(ch));
+        return currentChar_;
+    }
+
+    currentChar_ = ch;
+    if constexpr (Debug)
+        debugf("Lexer:{}: advance '{}'", offset_, prettySymbol(ch));
+    offset_++;
+    return ch;
+}
+
+template
+inline void Lexer::rollback()
+{
+    currentChar_ = word_.back();
+    if
(word_.back() != -1) + { + offset_--; + buffered_.push_back(word_.back()); + } +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/Lexer.h b/src/regex_dfa/Lexer.h new file mode 100644 index 0000000000..db75097717 --- /dev/null +++ b/src/regex_dfa/Lexer.h @@ -0,0 +1,284 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +template +struct TokenInfo +{ + Token token; + std::string literal; + size_t offset; + + operator Token() const noexcept { return token; } + + friend bool operator==(const TokenInfo& a, Token b) noexcept { return a.token == b; } + friend bool operator!=(const TokenInfo& a, Token b) noexcept { return a.token != b; } + friend bool operator==(Token a, const TokenInfo& b) noexcept { return b == a; } + friend bool operator!=(Token a, const TokenInfo& b) noexcept { return b != a; } +}; + +template +inline Token token(const TokenInfo& it) +{ + return it.token; +} + +template +inline size_t offset(const TokenInfo& it) +{ + return it.offset; +} + +template +inline const std::string& literal(const TokenInfo& it) +{ + return it.literal; +} + +template +inline const std::string& to_string(const TokenInfo& info) noexcept +{ + return info.literal; +} + +/** + * Lexer API for recognizing words. + */ +template +class Lexer +{ + public: + using value_type = Token; + using DebugLogger = std::function; + using TokenInfo = regex_dfa::TokenInfo; + + //! Constructs the Lexer with the given information table. + explicit Lexer(const LexerDef& info, DebugLogger logger = DebugLogger {}); + + //! 
Constructs the Lexer with the given information table and input stream. + Lexer(const LexerDef& info, std::unique_ptr input, DebugLogger logger = DebugLogger {}); + + //! Constructs the Lexer with the given information table and input stream. + Lexer(const LexerDef& info, std::istream& input, DebugLogger logger = DebugLogger {}); + + //! Constructs the Lexer with the given information table and input stream. + Lexer(const LexerDef& info, std::string input, DebugLogger logger = DebugLogger {}); + + /** + * Open given input stream. + */ + void reset(std::unique_ptr input); + void reset(const std::string& input); + + /** + * Recognizes one token (ignored patterns are skipped). + */ + TokenInfo recognize(); + + /** + * Recognizes one token, regardless of it is to be ignored or not. + */ + Token recognizeOne(); + + //! the underlying word of the currently recognized token + const std::string& word() const { return word_; } + + //! @returns the absolute offset of the file the lexer is currently reading from. + std::pair offset() const noexcept { return std::make_pair(oldOffset_, offset_); } + + //! @returns the last recognized token. + Token token() const noexcept { return token_; } + + //! @returns the name of the current token. + const std::string& name() const { return name(token_); } + + //! @returns the name of the token represented by Token @p t. + const std::string& name(Token t) const + { + auto i = def_.tagNames.find(static_cast(t)); + assert(i != def_.tagNames.end()); + return i->second; + } + + /** + * Retrieves the next state for given input state and input symbol. + * + * @param currentState the current State the DFA is in to. + * @param inputSymbol the input symbol that is used for transitioning from current state to the next + * state. + * @returns the next state to transition to. + */ + inline StateId delta(StateId currentState, Symbol inputSymbol) const; + + /** + * Sets the active deterministic finite automaton to use for recognizing words. 
+     *
+     * @param machine the DFA machine to use for recognizing words.
+     * @returns the machine that was active before this call.
+     */
+    Machine setMachine(Machine machine)
+    {
+        // The function is declared to return a Machine; flowing off the end without a return
+        // statement is undefined behaviour, so hand back the previously active machine.
+        const Machine previous = initialStateId_;
+        initialStateId_ = machine;
+        return previous;
+    }
+
+    /**
+     * Retrieves the default DFA machine that is used to recognize words.
+     *
+     * This is the initial state registered under the name "INITIAL" in the lexer definition.
+     */
+    Machine defaultMachine() const
+    {
+        auto i = def_.initialStates.find("INITIAL");
+        assert(i != def_.initialStates.end());
+        return static_cast(i->second);
+    }
+
+    /**
+     * Runtime exception that is getting thrown when a word could not be recognized.
+     */
+    struct LexerError: public std::runtime_error
+    {
+        LexerError(unsigned int _offset):
+            std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", _offset) },
+            offset { _offset }
+        {
+        }
+
+        //! Input offset that was passed in when the error was raised.
+        unsigned int offset;
+    };
+
+    /**
+     * Input iterator over the recognized tokens.
+     *
+     * @c end counts how often the iterator was advanced while the stream was already
+     * exhausted; the past-the-end iterator returned by Lexer::end() carries the value 2.
+     */
+    struct iterator
+    {
+        Lexer& lx;
+        int end;
+        TokenInfo info;
+
+        const TokenInfo& operator*() const { return info; }
+
+        iterator& operator++()
+        {
+            if (lx.eof())
+                ++end;
+
+            info = lx.recognize();
+
+            return *this;
+        }
+
+        // NOTE(review): behaves like pre-increment (returns the advanced iterator, no copy).
+        iterator& operator++(int) { return ++*this; }
+        bool operator==(const iterator& rhs) const noexcept { return end == rhs.end; }
+        bool operator!=(const iterator& rhs) const noexcept { return !(*this == rhs); }
+    };
+
+    //! Recognizes the first token and returns an iterator positioned on it.
+    iterator begin()
+    {
+        // Store the TokenInfo exactly as recognize() produced it, so that the first token
+        // keeps its offset instead of being rebuilt from token and word alone.
+        return iterator { *this, 0, recognize() };
+    }
+
+    iterator end() { return iterator { *this, 2, TokenInfo { 0, "" } }; }
+
+    bool eof() const { return !stream_->good(); }
+
+    size_t fileSize() const noexcept { return fileSize_; }
+
+  private:
+    template
+    inline void debugf(const char* msg, Args...
args) const
+    {
+        // Formats and emits a debug message; a no-op unless debugging is compiled in and a
+        // logger callback has been provided.
+        if constexpr (Debug)
+            if (debug_)
+                debug_(fmt::format(msg, args...));
+    }
+
+    Symbol nextChar();
+    void rollback();
+    StateId getInitialState() const noexcept;
+    bool isAcceptState(StateId state) const;
+    static std::string stateName(StateId s, const std::string_view& n = "n");
+    //! Sentinel pushed at the bottom of the state stack by recognizeOne().
+    static constexpr StateId BadState = 101010;
+    std::string toString(const std::deque& stack);
+
+    int currentChar() const noexcept { return currentChar_; }
+
+    //! @returns the token that is associated with the accept state @p s.
+    Token token(StateId s) const
+    {
+        auto i = def_.acceptStates.find(s);
+        assert(i != def_.acceptStates.end());
+        return static_cast(i->second);
+    }
+
+    size_t getFileSize();
+
+  private:
+    const LexerDef& def_;
+    const DebugLogger debug_;
+
+    Machine initialStateId_;
+    std::string word_;
+    std::unique_ptr ownedStream_; // set by reset(); stream_ then points into it
+    std::istream* stream_;
+    std::vector buffered_; // rolled-back symbols, replayed by nextChar() last-in first-out
+    unsigned oldOffset_;
+    unsigned offset_;
+    size_t fileSize_; // cache
+    bool isBeginOfLine_;
+    // Not part of the constructor's initializer list; without a default it would hold an
+    // indeterminate value until the first nextChar()/rollback() call. -1 is the value
+    // rollback() treats as the EOF symbol.
+    int currentChar_ = -1;
+    Token token_;
+};
+
+template
+inline const std::string& to_string(
+    const typename Lexer::iterator& it) noexcept
+{
+    return it.info.literal;
+}
+
+} // namespace regex_dfa
+
+namespace fmt
+{
+template
+struct formatter>
+{
+    using TokenInfo = regex_dfa::TokenInfo;
+
+    template
+    constexpr auto parse(ParseContext& ctx)
+    {
+        return ctx.begin();
+    }
+
+    template
+    constexpr auto format(const TokenInfo& v, FormatContext& ctx)
+    {
+        return format_to(ctx.out(), "{}", v.literal);
+    }
+};
+} // namespace fmt
+#include
diff --git a/src/regex_dfa/LexerDef.h b/src/regex_dfa/LexerDef.h
new file mode 100644
index 0000000000..90a1c01803
--- /dev/null
+++ b/src/regex_dfa/LexerDef.h
@@ -0,0 +1,87 @@
+// This file is part of the "klex" project, http://github.com/christianparpart/klex>
+// (c) 2018 Christian Parpart
+//
+// Licensed under the MIT License (the "License"); you may not use this
+// file except in compliance with the License.
You may obtain a copy of
+// the License at: http://opensource.org/licenses/MIT
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace regex_dfa
+{
+
+// special tags
+constexpr Tag IgnoreTag = static_cast(-1);
+constexpr Tag FirstUserTag = 1;
+
+using AcceptStateMap = std::map;
+
+//! Maps an accept state ID to the (prior) state ID the input stream has to be rolled back to.
+using BacktrackingMap = std::map;
+
+/**
+ * Table-driven description of a lexer: named initial states, the transition table, the accept
+ * states with their tags, the backtracking (lookahead) states and the tag names.
+ */
+struct LexerDef
+{
+    std::map initialStates;
+    bool containsBeginOfLineStates = false;
+    TransitionMap transitions;
+    AcceptStateMap acceptStates;
+    BacktrackingMap backtrackingStates;
+    std::map tagNames;
+
+    //! @returns a multi-line, human readable dump of all tables.
+    std::string to_string() const;
+
+    bool isValidTag(Tag t) const noexcept { return tagNames.find(t) != tagNames.end(); }
+
+    //! @returns the name of tag @p t, which must be present in tagNames.
+    std::string tagName(Tag t) const
+    {
+        auto i = tagNames.find(t);
+        assert(i != tagNames.end());
+        return i->second;
+    }
+};
+
+inline std::string LexerDef::to_string() const
+{
+    std::stringstream sstr;
+
+    // The map entries are only read, so bind them by reference instead of copying each pair.
+    sstr << fmt::format("initializerStates:\n");
+    for (const auto& q0: initialStates)
+        sstr << fmt::format(" {}: {}\n", q0.first, q0.second);
+    sstr << fmt::format("totalStates: {}\n", transitions.states().size());
+
+    sstr << "transitions:\n";
+    for (StateId inputState: transitions.states())
+    {
+        // Group the input symbols by target state, so that each target is printed once
+        // together with all symbols leading to it.
+        std::map> T;
+        for (const std::pair p: transitions.map(inputState))
+        {
+            T[p.second].push_back(p.first);
+        }
+        for (auto& t: T)
+        {
+            sstr << fmt::format(
+                "- n{} --({})--> n{}\n", inputState, groupCharacterClassRanges(std::move(t.second)), t.first);
+        }
+    }
+
+    sstr << "accepts:\n";
+    for (const auto& a: acceptStates)
+        sstr << fmt::format("- n{} to {} ({})\n", a.first, a.second, tagName(a.second));
+
+    if (!backtrackingStates.empty())
+    {
+        sstr << "backtracking:\n";
+        for (const auto& bt: backtrackingStates)
+            sstr << fmt::format("- n{} to n{}\n", bt.first, bt.second);
+    }
+
+    return sstr.str();
+}
+
+} // namespace regex_dfa
diff --git a/src/regex_dfa/Lexer_test.cpp
b/src/regex_dfa/Lexer_test.cpp new file mode 100644 index 0000000000..41b4624355 --- /dev/null +++ b/src/regex_dfa/Lexer_test.cpp @@ -0,0 +1,600 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include +#include +#include + +#include +#include + +using namespace std; +using namespace regex_dfa; +using namespace regex_dfa::util::literals; + +/* FEATURE UNITTEST CHECKLIST: + * + * - [ ] concatenation + * - [ ] alternation + * - [ ] {n} + * - [ ] {m,n} + * - [ ] {m,} + * - [ ] ? + * - [ ] character class, [a-z], [a-z0-9] + * - [ ] character class by name, such as [[:upper:]] + * - [ ] inverted character class, [^a-z], [^a-z0-9] + * - [ ] generic lookahead r/s + * - [ ] EOL lookahead r$ + * - [ ] BOL lookbehind ^r + */ + +const string RULES = R"( + Space(ignore) ::= [\s\t\n]+ + Eof ::= <> + ABBA ::= abba + AB_CD ::= ab/cd + CD ::= cd + CDEF ::= cdef + EOL_LF ::= eol$ + XAnyLine ::= x.* +)"; + +enum class LookaheadToken +{ + Eof = 1, + ABBA, + AB_CD, + CD, + CDEF, + EOL_LF, + XAnyLine +}; +namespace fmt +{ // it sucks that I've to specify that here +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const LookaheadToken& v, FormatContext& ctx) + { + switch (v) + { + case LookaheadToken::Eof: return format_to(ctx.out(), "Eof"); + case LookaheadToken::ABBA: return format_to(ctx.out(), "abba"); + case LookaheadToken::AB_CD: return format_to(ctx.out(), "ab/cd"); + case LookaheadToken::CD: return format_to(ctx.out(), "cd"); + case LookaheadToken::CDEF: return format_to(ctx.out(), "cdef"); + case LookaheadToken::EOL_LF: return format_to(ctx.out(), "eol$"); + case 
LookaheadToken::XAnyLine: return format_to(ctx.out(), ""); + default: return format_to(ctx.out(), "<{}>", static_cast(v)); + } + } +}; +} // namespace fmt + +TEST(regex_Lexer, lookahead) +{ + Compiler cc; + cc.parse(RULES); + + const LexerDef lexerDef = cc.compile(); + logf("LexerDef:\n{}", lexerDef.to_string()); + Lexable ls { lexerDef, "abba abcdef", [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + + ASSERT_EQ(LookaheadToken::ABBA, *lexer); + ASSERT_EQ(LookaheadToken::AB_CD, *++lexer); + ASSERT_EQ(LookaheadToken::CDEF, *++lexer); + ASSERT_EQ(LookaheadToken::Eof, *++lexer); + ASSERT_EQ(end(ls), ++lexer); +} + +TEST(regex_Lexable, one) +{ + Compiler cc; + cc.parse(RULES); + + const LexerDef ld = cc.compile(); + logf("LexerDef:\n{}", ld.to_string()); + auto src = Lexable { ld, + make_unique("abba abcdef"), + [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(src); + auto eof = end(src); + + ASSERT_TRUE(lexer != eof); + EXPECT_EQ(LookaheadToken::ABBA, token(lexer)); + EXPECT_EQ(0, offset(lexer)); + + ++lexer; + EXPECT_EQ(LookaheadToken::AB_CD, token(lexer)); + EXPECT_EQ(5, offset(lexer)); + + ++lexer; + EXPECT_EQ(LookaheadToken::CDEF, token(lexer)); + EXPECT_EQ(7, offset(lexer)); + + ++lexer; + EXPECT_EQ(LookaheadToken::Eof, token(lexer)); + EXPECT_EQ(11, offset(lexer)); + + ++lexer; + ASSERT_FALSE(lexer != eof); // TODO: make that work +} + +TEST(regex_Lexer, LexerError) +{ + Compiler cc; + cc.parse(RULES); + + const LexerDef ld = cc.compile(); + Lexable ls { ld, "invalid" }; + EXPECT_THROW(begin(ls), LexerError); +} + +TEST(regex_Lexer, evaluateDotToken) +{ + Compiler cc; + cc.parse(RULES); + + const LexerDef ld = cc.compile(); + Lexable ls { ld, "xanything" }; + auto lexer = begin(ls); + + ASSERT_EQ(LookaheadToken::XAnyLine, *lexer); + ASSERT_EQ(LookaheadToken::Eof, *++lexer); +} + +TEST(regex_Lexer, match_eol) +{ + Compiler cc; + cc.parse(RULES); + + LexerDef ld = cc.compile(); + Lexable ls { ld, "abba eol\nabba", 
[this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + + ASSERT_EQ(LookaheadToken::ABBA, *lexer); + EXPECT_EQ(0, offset(lexer)); + + ASSERT_EQ(LookaheadToken::EOL_LF, *++lexer); + EXPECT_EQ(5, offset(lexer)); + + ASSERT_EQ(LookaheadToken::ABBA, *++lexer); + EXPECT_EQ(9, offset(lexer)); + + ASSERT_EQ(LookaheadToken::Eof, *++lexer); +} + +TEST(regex_Lexer, bol) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test ::= test + |Unknown ::= . + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + Lexable ls { ld, "pragma", [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + ASSERT_EQ(1, *lexer); // ^pragma + ASSERT_EQ(4, *++lexer); // EOS +} + +TEST(regex_Lexer, bol_no_match) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test ::= test + |Unknown ::= . + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + logf("LexerDef:\n{}", ld.to_string()); + Lexable ls { ld, "test pragma", [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + ASSERT_EQ(2, *lexer); // test + + // pragma (char-wise) - must not be recognized as ^pragma + ASSERT_EQ(3, *++lexer); + ASSERT_EQ(3, *++lexer); + ASSERT_EQ(3, *++lexer); + ASSERT_EQ(3, *++lexer); + ASSERT_EQ(3, *++lexer); + ASSERT_EQ(3, *++lexer); + + ASSERT_EQ(4, *++lexer); // EOS +} + +TEST(regex_Lexer, bol_line2) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test ::= test + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + logf("LexerDef:\n{}", ld.to_string()); + Lexable ls { ld, "test\npragma", [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + ASSERT_EQ(2, *lexer); // test + ASSERT_EQ(1, *++lexer); // ^pragma + ASSERT_EQ(3, *++lexer); // EOS +} + +TEST(regex_Lexer, bol_and_other_conditions) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Pragma ::= ^pragma + |Test 
::= test + |Eof ::= <> + |Jump ::= jmp)"_multiline); + LexerDef ld = cc.compileMulti(); + logf("LexerDef:\n{}", ld.to_string()); + + Lexable ls { ld, "pragma test", [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + ASSERT_EQ(1, *lexer); // ^pragma + ASSERT_EQ(2, *++lexer); // test + ASSERT_EQ(3, *++lexer); // <> +} + +TEST(regex_Lexer, bol_rules_on_non_bol_lexer) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |Test ::= "test" + |Pragma ::= ^"pragma" + |Unknown ::= . + |)"_multiline); + + LexerDef ld = cc.compile(); + using SimpleLexer = Lexable; + ASSERT_THROW(SimpleLexer(ld, "pragma"), invalid_argument); +} + +TEST(regex_Lexer, non_bol_rules_on_non_bol_lexer) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |Test ::= "test" + |Unknown ::= . + |)"_multiline); + + LexerDef ld = cc.compile(); + Lexable ls { ld, " test " }; + auto lexer = begin(ls); + + ASSERT_EQ(2, *lexer); // "test" + ASSERT_EQ(1, *++lexer); // <> +} + +TEST(regex_Lexer, non_bol_rules_on_bol_lexer) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |Test ::= "test" + |Unknown ::= . 
+ |)"_multiline); + + LexerDef ld = cc.compile(); + Lexable ls { ld, " test " }; + auto lexer = begin(ls); + + ASSERT_EQ(2, *lexer); // "test" + ASSERT_EQ(1, *++lexer); // <> +} + +TEST(regex_Lexer, iterator) +{ + Compiler cc; + cc.parse(make_unique(R"( + Spacing(ignore) ::= [\s\t\n]+ + A ::= a + B ::= b + Eof ::= <> + )")); + + auto const ld = cc.compile(); + auto const ls = Lexable { ld, make_unique("a b b a") }; + auto const e = ls.end(); + auto i = ls.begin(); + + // a + ASSERT_EQ(1, *i); + ASSERT_TRUE(i != e); + + // b + i++; + ASSERT_EQ(2, *i); + ASSERT_TRUE(i != e); + + // b + i++; + ASSERT_EQ(2, *i); + ASSERT_TRUE(i != e); + + // a + i++; + ASSERT_EQ(1, *i); + ASSERT_TRUE(i != e); + + // <> + i++; + ASSERT_EQ(3, *i); + ASSERT_TRUE(i != e); + + i++; + ASSERT_EQ(3, *i); // still EOF + ASSERT_TRUE(i == e); +} + +TEST(regex_Lexer, empty_alt) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Test ::= aa(bb|) + |Eof ::= <> + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + Lexable ls { ld, "aabb aa aabb", [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + + ASSERT_EQ(1, *lexer); + ASSERT_EQ(1, *++lexer); + ASSERT_EQ(1, *++lexer); + ASSERT_EQ(2, *++lexer); // EOF +} + +TEST(regex_Lexer, ignore_many) +{ + Compiler cc; + cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ + |Comment(ignore) ::= #.* + |Eof ::= <> + |Foo ::= foo + |Bar ::= bar + |)"_multiline); + + LexerDef ld = cc.compileMulti(); + Lexable ls { ld, + R"(|# some foo + |foo + | + |# some bar + |bar + |)"_multiline, + [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + + ASSERT_EQ(2, *lexer); + ASSERT_EQ("foo", literal(lexer)); + + ASSERT_EQ(3, *++lexer); + ASSERT_EQ("bar", literal(lexer)); + + ASSERT_EQ(1, *++lexer); // EOF +} + +TEST(regex_Lexer, realworld_ipv4) +{ + Compiler cc; + cc.parse(R"(| + |Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |IPv4Octet(ref) ::= [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] + |IPv4(ref) ::= 
{IPv4Octet}(\.{IPv4Octet}){3} + |IPv4Literal ::= {IPv4} + |)"_multiline); + + auto ld = cc.compile(); + auto ls = Lexable { ld, + R"(0.0.0.0 4.2.2.1 10.10.40.199 255.255.255.255)", + [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + + ASSERT_EQ(2, *lexer); + ASSERT_EQ("0.0.0.0", literal(lexer)); + + ASSERT_EQ(2, *++lexer); + ASSERT_EQ("4.2.2.1", literal(lexer)); + + ASSERT_EQ(2, *++lexer); + ASSERT_EQ("10.10.40.199", literal(lexer)); + + ASSERT_EQ(2, *++lexer); + ASSERT_EQ("255.255.255.255", literal(lexer)); + + ASSERT_EQ(1, *++lexer); +} + +enum class RealWorld +{ + Eof = 1, + IPv4, + IPv6 +}; +namespace fmt +{ // it sucks that I've to specify that here +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const RealWorld& v, FormatContext& ctx) + { + switch (v) + { + case RealWorld::Eof: return format_to(ctx.out(), "Eof"); + case RealWorld::IPv4: return format_to(ctx.out(), "IPv4"); + case RealWorld::IPv6: return format_to(ctx.out(), "IPv6"); + default: return format_to(ctx.out(), "<{}>", static_cast(v)); + } + } +}; +} // namespace fmt + +TEST(regex_Lexer, realworld_ipv6) +{ + Compiler cc; + cc.parse(R"(| + |Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + | + |IPv4Octet(ref) ::= [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] + |IPv4(ref) ::= {IPv4Octet}(\.{IPv4Octet}){3} + |IPv4Literal ::= {IPv4} + | + |ipv6Part(ref) ::= [[:xdigit:]]{1,4} + |IPv6 ::= {ipv6Part}(:{ipv6Part}){7,7} + | | ({ipv6Part}:){1,7}: + | | :(:{ipv6Part}){1,7} + | | :: + | | ({ipv6Part}:){1}(:{ipv6Part}){0,6} + | | ({ipv6Part}:){2}(:{ipv6Part}){0,5} + | | ({ipv6Part}:){3}(:{ipv6Part}){0,4} + | | ({ipv6Part}:){4}(:{ipv6Part}){0,3} + | | ({ipv6Part}:){5}(:{ipv6Part}){0,2} + | | ({ipv6Part}:){6}(:{ipv6Part}){0,1} + | | ::[fF]{4}:{IPv4} + )"_multiline); + + static const string TEXT = R"(|0:0:0:0:0:0:0:0 + |1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd + 
|2001:0db8:85a3:0000:0000:8a2e:0370:7334 + |1234:5678:: + |0:: + |::0 + |:: + |1::3:4:5:6:7:8 + |1::4:5:6:7:8 + |1::5:6:7:8 + |1::8 + |1:2::4:5:6:7:8 + |1:2::5:6:7:8 + |1:2::8 + |::ffff:127.0.0.1 + |::ffff:c000:0280 + |)"_multiline; + + auto ld = cc.compileMulti(); + auto ls = Lexable { ld, TEXT, [this](const string& msg) { + log(msg); + } }; + auto lexer = begin(ls); + + ASSERT_EQ(RealWorld::IPv6, *lexer); + ASSERT_EQ("0:0:0:0:0:0:0:0", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("2001:0db8:85a3:0000:0000:8a2e:0370:7334", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1234:5678::", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("0::", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("::0", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("::", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1::3:4:5:6:7:8", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1::4:5:6:7:8", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1::5:6:7:8", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1::8", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1:2::4:5:6:7:8", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1:2::5:6:7:8", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("1:2::8", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("::ffff:127.0.0.1", literal(lexer)); + + ASSERT_EQ(RealWorld::IPv6, *++lexer); + ASSERT_EQ("::ffff:c000:0280", literal(lexer)); + + ASSERT_EQ(RealWorld::Eof, *++lexer); +} + +TEST(regex_Lexer, internal) +{ + ASSERT_EQ("Eof", fmt::format("{}", LookaheadToken::Eof)); + ASSERT_EQ("abba", fmt::format("{}", LookaheadToken::ABBA)); + 
ASSERT_EQ("ab/cd", fmt::format("{}", LookaheadToken::AB_CD)); + ASSERT_EQ("cd", fmt::format("{}", LookaheadToken::CD)); + ASSERT_EQ("cdef", fmt::format("{}", LookaheadToken::CDEF)); + ASSERT_EQ("eol$", fmt::format("{}", LookaheadToken::EOL_LF)); + ASSERT_EQ("", fmt::format("{}", LookaheadToken::XAnyLine)); + ASSERT_EQ("<724>", fmt::format("{}", static_cast(724))); + + ASSERT_EQ("Eof", fmt::format("{}", RealWorld::Eof)); + ASSERT_EQ("IPv4", fmt::format("{}", RealWorld::IPv4)); + ASSERT_EQ("IPv6", fmt::format("{}", RealWorld::IPv6)); + ASSERT_EQ("<724>", fmt::format("{}", static_cast(724))); +} diff --git a/src/regex_dfa/MultiDFA.cpp b/src/regex_dfa/MultiDFA.cpp new file mode 100644 index 0000000000..3cd02d8d5a --- /dev/null +++ b/src/regex_dfa/MultiDFA.cpp @@ -0,0 +1,33 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +using namespace std; + +namespace regex_dfa +{ + +MultiDFA constructMultiDFA(map many) +{ + MultiDFA multiDFA {}; + multiDFA.dfa.createStates(1 + many.size()); + multiDFA.dfa.setInitialState(0); + + StateId q0 = 1; + for (pair& p: many) + { + multiDFA.dfa.append(move(p.second), q0); + multiDFA.initialStates[p.first] = q0; + multiDFA.dfa.setTransition(0, static_cast(q0), q0); + q0++; + } + + return multiDFA; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/MultiDFA.h b/src/regex_dfa/MultiDFA.h new file mode 100644 index 0000000000..76a30c0907 --- /dev/null +++ b/src/regex_dfa/MultiDFA.h @@ -0,0 +1,29 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include +#include + +namespace regex_dfa +{ + +struct MultiDFA +{ + using InitialStateMap = std::map; + + InitialStateMap initialStates; + DFA dfa; +}; + +MultiDFA constructMultiDFA(std::map many); + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFA.cpp b/src/regex_dfa/NFA.cpp new file mode 100644 index 0000000000..29ea460cc9 --- /dev/null +++ b/src/regex_dfa/NFA.cpp @@ -0,0 +1,375 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include + +#include +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) 
\ + do \ + { \ + } while (0) +#endif + +Alphabet NFA::alphabet() const +{ + Alphabet alphabet; + + for (const TransitionMap& transitions: states_) + { + for (const pair& t: transitions) + { + switch (t.first) + { + case Symbols::Epsilon: break; + default: alphabet.insert(t.first); + } + } + } + + return alphabet; +} + +NFA NFA::clone() const +{ + return *this; +} + +StateId NFA::createState() +{ + states_.emplace_back(); + return states_.size() - 1; +} + +StateIdVec NFA::delta(const StateIdVec& S, Symbol c) const +{ + StateIdVec result; + delta(S, c, &result); + return result; +} + +StateIdVec* NFA::delta(const StateIdVec& S, Symbol c, StateIdVec* result) const +{ + for (StateId s: S) + { + const TransitionMap& transitions = stateTransitions(s); + for (const auto& transition: transitions) + { + if (transition.first == c) + { + for (StateId targetState: transition.second) + { + result->push_back(targetState); + } + } + } + } + + return result; +} + +StateIdVec NFA::epsilonTransitions(StateId s) const +{ + StateIdVec t; + + const TransitionMap& transitions = stateTransitions(s); + for (const pair& p: transitions) + if (p.first == Symbols::Epsilon) + t.insert(t.end(), p.second.begin(), p.second.end()); + + return t; +} + +StateIdVec NFA::epsilonClosure(const StateIdVec& S) const +{ + StateIdVec eclosure; + epsilonClosure(S, &eclosure); + return eclosure; +} + +void NFA::epsilonClosure(const StateIdVec& S, StateIdVec* eclosure) const +{ + *eclosure = S; + vector availabilityCheck(1 + size(), false); + stack workList; + for (StateId s: S) + { + workList.push(s); + availabilityCheck[s] = true; + } + + while (!workList.empty()) + { + const StateId s = workList.top(); + workList.pop(); + + for (StateId t: epsilonTransitions(s)) + { + if (!availabilityCheck[t]) + { + eclosure->push_back(t); + workList.push(t); + } + } + } + + sort(eclosure->begin(), eclosure->end()); +} + +void NFA::prepareStateIds(StateId baseId) +{ + // adjust transition state IDs + // traverse through 
each state's transition set + // traverse through each transition in the transition set + // traverse through each element and add BASE_ID + + // for each state's transitions + for (StateId i = 0, e = size(); i != e; ++i) + { + TransitionMap& transitions = states_[i]; + + // for each vector of target-state-id per transition-symbol + for (auto t = transitions.begin(), tE = transitions.end(); t != tE; ++t) + { + StateIdVec& transition = t->second; + + // for each target state ID + for (StateId k = 0, kE = transition.size(); k != kE; ++k) + { + transition[k] += baseId; + } + } + } + + initialState_ += baseId; + acceptState_ += baseId; + + AcceptMap remapped; + for (auto& a: acceptTags_) + remapped[baseId + a.first] = a.second; + acceptTags_ = move(remapped); + + BacktrackingMap backtracking; + for (const auto& bt: backtrackStates_) + backtracking[baseId + bt.first] = baseId + bt.second; + backtrackStates_ = move(backtracking); +} + +NFA NFA::join(const map& mappings) +{ + if (mappings.size() == 1) + return mappings.begin()->second; + + NFA multi; + + for (size_t i = 0; i <= mappings.size(); ++i) + multi.createState(); + + Symbol transitionSymbol = 0; + for (const auto& mapping: mappings) + { + transitionSymbol++; + + NFA rhs = mapping.second.clone(); + rhs.prepareStateIds(multi.size()); + + multi.states_.reserve(multi.size() + rhs.size()); + multi.states_.insert(multi.states_.end(), rhs.states_.begin(), rhs.states_.end()); + multi.acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + + multi.addTransition(multi.initialState_, transitionSymbol, rhs.initialState_); + multi.backtrackStates_[rhs.acceptState_] = multi.acceptState_; + multi.acceptState_ = rhs.acceptState_; + } + + return multi; +} + +NFA& NFA::lookahead(NFA&& rhs) +{ + if (empty()) + { + *this = move(rhs); + backtrackStates_[acceptState_] = initialState_; + } + else + { + rhs.prepareStateIds(states_.size()); + states_.reserve(size() + rhs.size()); + states_.insert(states_.end(), 
rhs.states_.begin(), rhs.states_.end()); + acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + + addTransition(acceptState_, Symbols::Epsilon, rhs.initialState_); + backtrackStates_[rhs.acceptState_] = acceptState_; + acceptState_ = rhs.acceptState_; + } + + return *this; +} + +NFA& NFA::alternate(NFA&& rhs) +{ + StateId newStart = createState(); + StateId newEnd = createState(); + + rhs.prepareStateIds(states_.size()); + states_.insert(states_.end(), rhs.states_.begin(), rhs.states_.end()); + acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + backtrackStates_.insert(rhs.backtrackStates_.begin(), rhs.backtrackStates_.end()); + + addTransition(newStart, Symbols::Epsilon, initialState_); + addTransition(newStart, Symbols::Epsilon, rhs.initialState_); + + addTransition(acceptState_, Symbols::Epsilon, newEnd); + addTransition(rhs.acceptState_, Symbols::Epsilon, newEnd); + + initialState_ = newStart; + acceptState_ = newEnd; + + return *this; +} + +NFA& NFA::concatenate(NFA&& rhs) +{ + rhs.prepareStateIds(states_.size()); + states_.reserve(size() + rhs.size()); + states_.insert(states_.end(), rhs.states_.begin(), rhs.states_.end()); + acceptTags_.insert(rhs.acceptTags_.begin(), rhs.acceptTags_.end()); + backtrackStates_.insert(rhs.backtrackStates_.begin(), rhs.backtrackStates_.end()); + + addTransition(acceptState_, Symbols::Epsilon, rhs.initialState_); + acceptState_ = rhs.acceptState_; + + return *this; +} + +NFA& NFA::optional() +{ + StateId newStart = createState(); + StateId newEnd = createState(); + + addTransition(newStart, Symbols::Epsilon, initialState_); + addTransition(newStart, Symbols::Epsilon, newEnd); + addTransition(acceptState_, Symbols::Epsilon, newEnd); + + initialState_ = newStart; + acceptState_ = newEnd; + + return *this; +} + +NFA& NFA::recurring() +{ + // {0, inf} + StateId newStart = createState(); + StateId newEnd = createState(); + + addTransition(newStart, Symbols::Epsilon, initialState_); + 
addTransition(newStart, Symbols::Epsilon, newEnd); + + addTransition(acceptState_, Symbols::Epsilon, initialState_); + addTransition(acceptState_, Symbols::Epsilon, newEnd); + + initialState_ = newStart; + acceptState_ = newEnd; + + return *this; +} + +NFA& NFA::positive() +{ + return concatenate(move(clone().recurring())); +} + +NFA& NFA::times(unsigned factor) +{ + assert(factor != 0); + + if (factor == 1) + return *this; + + NFA base = clone(); + for (unsigned n = 2; n <= factor; ++n) + concatenate(base.clone()); + + return *this; +} + +NFA& NFA::repeat(unsigned minimum, unsigned maximum) +{ + assert(minimum <= maximum); + + NFA factor = clone(); + + if (minimum != 0) + times(minimum); + + for (unsigned n = minimum + 1; n <= maximum; n++) + alternate(move(factor.clone().times(n))); + + if (minimum == 0) + optional(); + + return *this; +} + +void NFA::visit(DotVisitor& v) const +{ + v.start(initialState_); + + // initial state + v.visitNode(initialState_, true, acceptTags_.find(initialState_) != acceptTags_.end()); + + // accepting states + for (pair acceptTag: acceptTags_) + if (acceptTag.first != initialState_) + v.visitNode(acceptTag.first, false, true); + + // other states + for (StateId i = 0, e = size(); i != e; ++i) + if (i != initialState_ && acceptTags_.find(i) == acceptTags_.end()) + v.visitNode(i, false, false); + + // transitions + for (StateId sourceState = 0, sE = size(); sourceState != sE; ++sourceState) + { + map> reversed; + for (const pair& transitions: states_[sourceState]) + for (StateId targetState: transitions.second) + reversed[targetState].push_back(transitions.first /* symbol */); + + for (const pair>& tr: reversed) + { + StateId targetState = tr.first; + const vector& T = tr.second; + for_each(T.begin(), T.end(), [&](const Symbol t) { v.visitEdge(sourceState, targetState, t); }); + v.endVisitEdge(sourceState, targetState); + } + } + v.end(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFA.h b/src/regex_dfa/NFA.h new file 
mode 100644 index 0000000000..7cb776c664 --- /dev/null +++ b/src/regex_dfa/NFA.h @@ -0,0 +1,222 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class Alphabet; +class DotVisitor; +class DFA; + +/** + * NFA Builder with the Thompson's Construction properties. + * + *
    + *
  • There is exactly one initial state and exactly one accepting state. + *
  • No transition other than the initial transition enters the initial state. + *
  • The accepting state has no leaving edges. + *
  • An ε-transition always connects two states that were (earlier in the construction process) + * the initial state and the accepting state of NFAs for some component REs. + *
  • Each state has at most two entering states and at most two leaving states. + *
+ */ +class NFA +{ + private: + NFA(const NFA& other) = default; + NFA& operator=(const NFA& other) = default; + + public: + //! represent a transition table for a specific state + using TransitionMap = std::map; + + //! defines a set of states within one NFA. the index represents the state Id. + using StateVec = std::vector; + + //! defines a mapping between accept state ID and another (prior) ID to track roll back the input stream + //! to. + using BacktrackingMap = std::map; + + NFA(NFA&&) = default; + NFA& operator=(NFA&&) = default; + + //! Constructs an empty NFA. + NFA(): states_ {}, initialState_ { 0 }, acceptState_ { 0 }, backtrackStates_ {}, acceptTags_ {} {} + + /** + * Constructs an NFA for a single character transition. + * + * *No* acceptState flag is set on the accepting node! + */ + explicit NFA(Symbol value): NFA {} + { + initialState_ = createState(); + acceptState_ = createState(); + addTransition(initialState_, value, acceptState_); + } + + explicit NFA(SymbolSet value): NFA {} + { + initialState_ = createState(); + acceptState_ = createState(); + for (Symbol s: value) + addTransition(initialState_, s, acceptState_); + } + + void addTransition(StateId from, Symbol s, StateId to) { states_[from][s].push_back(to); } + + static NFA join(const std::map& mappings); + + /** + * Traverses all states and edges in this NFA and calls @p visitor for each state & edge. + * + * Use this function to e.g. get a GraphViz dot-file drawn. + */ + void visit(DotVisitor& visitor) const; + + //! Tests whether or not this is an empty NFA. + bool empty() const noexcept { return states_.empty(); } + + //! Retrieves the number of states of this NFA. + size_t size() const noexcept { return states_.size(); } + + //! Retrieves the one and only initial state. This value is nullptr iff the NFA is empty. + StateId initialStateId() const noexcept { return initialState_; } + + //! Retrieves the one and only accept state. This value is nullptr iff the NFA is empty. 
+ StateId acceptStateId() const noexcept { return acceptState_; } + + //! Retrieves the list of states this FA contains. + const StateVec& states() const { return states_; } + StateVec& states() { return states_; } + + //! Retrieves the alphabet of this finite automaton. + Alphabet alphabet() const; + + //! Clones this NFA. + NFA clone() const; + + /** + * Constructs an NFA where @p rhs is following but backtracking to @c acceptState(this) when + * when @p rhs is fully matched. + * + * This resembles the syntax r/s (or r(?=s) in Perl) where r is matched when also s is following. + */ + NFA& lookahead(NFA&& rhs); + + //! Reconstructs this FA to alternate between this FA and the @p other FA. + NFA& alternate(NFA&& other); + + //! Concatenates the right FA's initial state with this FA's accepting state. + NFA& concatenate(NFA&& rhs); + + //! Reconstructs this FA to allow optional input. X -> X? + NFA& optional(); + + //! Reconstructs this FA with the given @p quantifier factor. + NFA& times(unsigned quantifier); + + //! Reconstructs this FA to allow recurring input. X -> X* + NFA& recurring(); + + //! Reconstructs this FA to be recurring at least once. X+ = XX* + NFA& positive(); + + //! Reconstructs this FA to be repeatable between range [minimum, maximum]. + NFA& repeat(unsigned minimum, unsigned maximum); + + //! Retrieves transitions for state with the ID @p id. + const TransitionMap& stateTransitions(StateId id) const { return states_[id]; } + + //! Retrieves all states that can be reached from @p S with one single input Symbol @p c. + StateIdVec delta(const StateIdVec& S, Symbol c) const; + StateIdVec* delta(const StateIdVec& S, Symbol c, StateIdVec* result) const; + + //! Retrieves all states that can be directly or indirectly accessed via epsilon-transitions exclusively. 
+ StateIdVec epsilonClosure(const StateIdVec& S) const; + void epsilonClosure(const StateIdVec& S, StateIdVec* result) const; + + TransitionMap& stateTransitions(StateId s) { return states_[s]; } + + //! Flags given state as accepting-state with given Tag @p acceptTag. + void setAccept(Tag acceptTag) { acceptTags_[acceptState_] = acceptTag; } + + void setAccept(StateId state, Tag tag) { acceptTags_[state] = tag; } + + std::optional acceptTag(StateId s) const + { + if (auto i = acceptTags_.find(s); i != acceptTags_.end()) + return i->second; + + return std::nullopt; + } + + bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } + + /** + * Returns whether or not the StateSet @p Q contains at least one State that is also "accepting". + */ + bool isAnyAccepting(const StateIdVec& Q) const + { + for (StateId q: Q) + if (isAccepting(q)) + return true; + + return false; + } + + const AcceptMap& acceptMap() const noexcept { return acceptTags_; } + AcceptMap& acceptMap() noexcept { return acceptTags_; } + + std::optional backtrack(StateId s) const + { + if (auto i = backtrackStates_.find(s); i != backtrackStates_.end()) + return i->second; + + return std::nullopt; + } + + /** + * Checks if @p Q contains a state that is flagged as backtracking state in the NFA and returns + * the target state within the NFA or @c std::nullopt if not a backtracking state. + */ + std::optional containsBacktrackState(const StateIdVec& Q) const + { + for (StateId q: Q) + if (std::optional t = backtrack(q); t.has_value()) + return *t; + + return std::nullopt; + } + + private: + StateId createState(); + void visit(DotVisitor& v, StateId s, std::unordered_map& registry) const; + void prepareStateIds(StateId baseId); + + //! Retrieves all epsilon-transitions directly connected to State @p s. 
+ StateIdVec epsilonTransitions(StateId s) const; + + private: + StateVec states_; + StateId initialState_; + StateId acceptState_; + BacktrackingMap backtrackStates_; + AcceptMap acceptTags_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFABuilder.cpp b/src/regex_dfa/NFABuilder.cpp new file mode 100644 index 0000000000..912d875470 --- /dev/null +++ b/src/regex_dfa/NFABuilder.cpp @@ -0,0 +1,124 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +NFA NFABuilder::construct(const RegExpr& re, Tag tag) +{ + visit(*this, re); + + // fa_.setAccept(acceptState_.value_or(fa_.acceptStateId()), tag); + if (acceptState_) + fa_.setAccept(acceptState_.value(), tag); + else + fa_.setAccept(tag); + + return move(fa_); +} + +NFA NFABuilder::construct(const RegExpr& re) +{ + visit(*this, re); + return move(fa_); +} + +void NFABuilder::operator()(const LookAheadExpr& lookaheadExpr) +{ + // fa_ = move(construct(lookaheadExpr.leftExpr()).lookahead(construct(lookaheadExpr.rightExpr()))); + NFA lhs = construct(*lookaheadExpr.left); + NFA rhs = construct(*lookaheadExpr.right); + lhs.lookahead(move(rhs)); + fa_ = move(lhs); +} + +void NFABuilder::operator()(const AlternationExpr& alternationExpr) +{ + NFA lhs = construct(*alternationExpr.left); + NFA rhs = construct(*alternationExpr.right); + lhs.alternate(move(rhs)); + fa_ = move(lhs); +} + +void NFABuilder::operator()(const ConcatenationExpr& concatenationExpr) +{ + NFA lhs = construct(*concatenationExpr.left); + NFA rhs = construct(*concatenationExpr.right); + lhs.concatenate(move(rhs)); + fa_ = move(lhs); +} + +void NFABuilder::operator()(const CharacterExpr& characterExpr) +{ + 
fa_ = NFA { characterExpr.value }; +} + +void NFABuilder::operator()(const CharacterClassExpr& characterClassExpr) +{ + fa_ = NFA { characterClassExpr.symbols }; +} + +void NFABuilder::operator()(const ClosureExpr& closureExpr) +{ + const unsigned xmin = closureExpr.minimumOccurrences; + const unsigned xmax = closureExpr.maximumOccurrences; + constexpr unsigned Infinity = numeric_limits::max(); + + if (xmin == 0 && xmax == 1) + fa_ = move(construct(*closureExpr.subExpr).optional()); + else if (xmin == 0 && xmax == Infinity) + fa_ = move(construct(*closureExpr.subExpr).recurring()); + else if (xmin == 1 && xmax == Infinity) + fa_ = move(construct(*closureExpr.subExpr).positive()); + else if (xmin < xmax) + fa_ = move(construct(*closureExpr.subExpr).repeat(xmin, xmax)); + else if (xmin == xmax) + fa_ = move(construct(*closureExpr.subExpr).times(xmin)); + else + throw invalid_argument { "closureExpr" }; +} + +void NFABuilder::operator()(const BeginOfLineExpr&) +{ + fa_ = NFA { Symbols::Epsilon }; +} + +void NFABuilder::operator()(const EndOfLineExpr& eolExpr) +{ + // NFA lhs; + // NFA rhs{'\n'}; + // lhs.lookahead(move(rhs)); + // fa_ = move(lhs); + fa_ = move(NFA {}.lookahead(NFA { '\n' })); +} + +void NFABuilder::operator()(const EndOfFileExpr& eofExpr) +{ + fa_ = NFA { Symbols::EndOfFile }; +} + +void NFABuilder::operator()(const DotExpr& dotExpr) +{ + // any character except LF + fa_ = NFA { '\t' }; + for (int ch = 32; ch < 127; ++ch) + { + fa_.addTransition(fa_.initialStateId(), ch, fa_.acceptStateId()); + } +} + +void NFABuilder::operator()(const EmptyExpr& emptyExpr) +{ + fa_ = NFA { Symbols::Epsilon }; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFABuilder.h b/src/regex_dfa/NFABuilder.h new file mode 100644 index 0000000000..646ca9ad4d --- /dev/null +++ b/src/regex_dfa/NFABuilder.h @@ -0,0 +1,55 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT 
License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class DFA; + +/*! + * Generates a finite automaton from the given input (a regular expression). + */ +class NFABuilder +{ + public: + explicit NFABuilder(): fa_ {} {} + + NFA construct(const RegExpr& re, Tag tag); + NFA construct(const RegExpr& re); + void operator()(const LookAheadExpr& lookaheadExpr); + void operator()(const ConcatenationExpr& concatenationExpr); + void operator()(const AlternationExpr& alternationExpr); + void operator()(const CharacterExpr& characterExpr); + void operator()(const CharacterClassExpr& characterClassExpr); + void operator()(const ClosureExpr& closureExpr); + void operator()(const BeginOfLineExpr& bolExpr); + void operator()(const EndOfLineExpr& eolExpr); + void operator()(const EndOfFileExpr& eofExpr); + void operator()(const DotExpr& dotExpr); + void operator()(const EmptyExpr& emptyExpr); + + private: + NFA fa_; + std::optional acceptState_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/NFA_test.cpp b/src/regex_dfa/NFA_test.cpp new file mode 100644 index 0000000000..1ef09727c4 --- /dev/null +++ b/src/regex_dfa/NFA_test.cpp @@ -0,0 +1,84 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include + +using namespace std; +using namespace regex_dfa; + +TEST(regex_NFA, emptyCtor) +{ + const NFA nfa; + ASSERT_EQ(0, nfa.size()); + ASSERT_TRUE(nfa.empty()); +} + +TEST(regex_NFA, characterCtor) +{ + const NFA nfa { 'a' }; + ASSERT_EQ(2, nfa.size()); + ASSERT_EQ(0, nfa.initialStateId()); + ASSERT_EQ(1, nfa.acceptStateId()); + ASSERT_EQ(StateIdVec { 1 }, nfa.delta(StateIdVec { 0 }, 'a')); +} + +TEST(regex_NFA, concatenate) +{ + const NFA ab = move(NFA { 'a' }.concatenate(NFA { 'b' })); + ASSERT_EQ(4, ab.size()); + ASSERT_EQ(0, ab.initialStateId()); + ASSERT_EQ(3, ab.acceptStateId()); + + // TODO: check ab.initial == A.initial + // TODO: check A.accept == B.initial + // TODO: check ab.accept == B.accept +} + +TEST(regex_NFA, alternate) +{ + const NFA ab = move(NFA { 'a' }.alternate(NFA { 'b' })); + ASSERT_EQ(6, ab.size()); + ASSERT_EQ(2, ab.initialStateId()); + ASSERT_EQ(3, ab.acceptStateId()); + + // TODO: check acceptState transitions to A and B + // TODO: check A and B's outgoing edges to final acceptState +} + +TEST(regex_NFA, epsilonClosure) +{ + const NFA nfa { 'a' }; + ASSERT_EQ(0, nfa.initialStateId()); + ASSERT_EQ(1, nfa.acceptStateId()); + ASSERT_EQ(StateIdVec { 0 }, nfa.epsilonClosure(StateIdVec { 0 })); + + const NFA abc = move(NFA { 'a' }.concatenate(move(NFA { 'b' }.alternate(NFA { 'c' }).recurring()))); + ASSERT_EQ(StateIdVec { 0 }, abc.epsilonClosure(StateIdVec { 0 })); + + const StateIdVec e1 { 1, 2, 4, 6, 8, 9 }; + ASSERT_EQ(e1, abc.epsilonClosure(StateIdVec { 1 })); +} + +TEST(regex_NFA, delta) +{ + const NFA nfa { 'a' }; + ASSERT_EQ(0, nfa.initialStateId()); + ASSERT_EQ(1, nfa.acceptStateId()); + ASSERT_EQ(StateIdVec { 1 }, nfa.delta(StateIdVec { 0 }, 'a')); +} + +TEST(regex_NFA, alphabet) +{ + ASSERT_EQ("{}", NFA {}.alphabet().to_string()); + ASSERT_EQ("{a}", NFA { 'a' }.alphabet().to_string()); + ASSERT_EQ("{ab}", NFA { 'a' 
}.concatenate(NFA { 'b' }).alphabet().to_string()); + ASSERT_EQ("{abc}", NFA { 'a' }.concatenate(NFA { 'b' }).alternate(NFA { 'c' }).alphabet().to_string()); +} diff --git a/src/regex_dfa/RegExpr.cpp b/src/regex_dfa/RegExpr.cpp new file mode 100644 index 0000000000..d087705e6d --- /dev/null +++ b/src/regex_dfa/RegExpr.cpp @@ -0,0 +1,117 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include + +#include +#include +#include + +using namespace std; + +/* + REGULAR EXPRESSION SYNTAX: + -------------------------- + + expr := alternation + alternation := concatenation ('|' concatenation)* + concatenation := closure (closure)* + closure := atom ['*' | '?' | '{' NUM [',' NUM] '}'] + atom := character | characterClass | '(' expr ')' + characterClass := '[' ['^'] characterClassFragment+ ']' + characterClassFragment := character | character '-' character +*/ + +namespace regex_dfa +{ + +auto embrace(const RegExpr& outer, const RegExpr& inner) +{ + if (precedence(outer) > precedence(inner)) + return "(" + to_string(inner) + ")"; + else + return to_string(inner); +} + +std::string to_string(const RegExpr& re) +{ + return visit( + overloaded { + [&](const ClosureExpr& e) { + stringstream sstr; + sstr << embrace(re, *e.subExpr); + if (e.minimumOccurrences == 0 && e.maximumOccurrences == 1) + sstr << '?'; + else if (e.minimumOccurrences == 0 && e.maximumOccurrences == numeric_limits::max()) + sstr << '*'; + else if (e.minimumOccurrences == 1 && e.maximumOccurrences == numeric_limits::max()) + sstr << '+'; + else + sstr << '{' << e.minimumOccurrences << ',' << e.maximumOccurrences << '}'; + return sstr.str(); + }, + [&](const AlternationExpr& e) { return embrace(re, *e.left) + "|" 
+ embrace(re, *e.right); }, + [&](const ConcatenationExpr& e) { return embrace(re, *e.left) + embrace(re, *e.right); }, + [&](const LookAheadExpr& e) { return embrace(re, *e.left) + "/" + embrace(re, *e.right); }, + [](const CharacterExpr& e) { return string(1, e.value); }, + [](const EndOfFileExpr& e) { return string { "<>" }; }, + [](const BeginOfLineExpr& e) { return string { "^" }; }, + [](const EndOfLineExpr& e) { return string { "$" }; }, + [](const CharacterClassExpr& e) { return e.symbols.to_string(); }, + [](const DotExpr& e) { return string { "." }; }, + [](const EmptyExpr& e) { return string {}; }, + }, + re); +} + +int precedence(const RegExpr& regex) +{ + return visit(overloaded { + [](const AlternationExpr& e) { return 1; }, + [](const BeginOfLineExpr& e) { return 4; }, + [](const CharacterClassExpr& e) { return 4; }, + [](const CharacterExpr& e) { return 4; }, + [](const ClosureExpr& e) { return 3; }, + [](const ConcatenationExpr& e) { return 2; }, + [](const DotExpr& e) { return 4; }, + [](const EmptyExpr& e) { return 4; }, + [](const EndOfFileExpr& e) { return 4; }, + [](const EndOfLineExpr& e) { return 4; }, + [](const LookAheadExpr& e) { return 0; }, + }, + regex); +} + +bool containsBeginOfLine(const RegExpr& regex) +{ + return visit(overloaded { + [](const AlternationExpr& e) { + return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); + }, + [](const BeginOfLineExpr& e) { return true; }, + [](const CharacterClassExpr& e) { return false; }, + [](const CharacterExpr& e) { return false; }, + [](const ClosureExpr& e) { return containsBeginOfLine(*e.subExpr); }, + [](const ConcatenationExpr& e) { + return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); + }, + [](const DotExpr& e) { return false; }, + [](const EmptyExpr& e) { return false; }, + [](const EndOfFileExpr& e) { return false; }, + [](const EndOfLineExpr& e) { return false; }, + [](const LookAheadExpr& e) { + return containsBeginOfLine(*e.left) || 
containsBeginOfLine(*e.right); + }, + }, + regex); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExpr.h b/src/regex_dfa/RegExpr.h new file mode 100644 index 0000000000..c8ca1fdce7 --- /dev/null +++ b/src/regex_dfa/RegExpr.h @@ -0,0 +1,102 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +struct AlternationExpr; +struct BeginOfLineExpr; +struct CharacterClassExpr; +struct CharacterExpr; +struct ClosureExpr; +struct ConcatenationExpr; +struct DotExpr; +struct EmptyExpr; +struct EndOfFileExpr; +struct EndOfLineExpr; +struct LookAheadExpr; + +using RegExpr = std::variant; + +struct LookAheadExpr +{ + std::unique_ptr left; + std::unique_ptr right; +}; + +struct AlternationExpr +{ + std::unique_ptr left; + std::unique_ptr right; +}; + +struct ConcatenationExpr +{ + std::unique_ptr left; + std::unique_ptr right; +}; + +struct ClosureExpr +{ + std::unique_ptr subExpr; + unsigned minimumOccurrences { 0 }; + unsigned maximumOccurrences { std::numeric_limits::max() }; +}; + +struct CharacterExpr +{ + Symbol value; +}; + +struct CharacterClassExpr +{ + SymbolSet symbols; +}; + +struct DotExpr +{ +}; +struct BeginOfLineExpr +{ +}; +struct EndOfLineExpr +{ +}; +struct EndOfFileExpr +{ +}; +struct EmptyExpr +{ +}; + +std::string to_string(const RegExpr& regex); +int precedence(const RegExpr& regex); +bool containsBeginOfLine(const RegExpr& regex); + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExprParser.cpp b/src/regex_dfa/RegExprParser.cpp new file mode 100644 index 0000000000..b338be1a19 --- /dev/null +++ b/src/regex_dfa/RegExprParser.cpp @@ -0,0 +1,481 
@@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include + +#include +#include +#include +#include + +using namespace std; + +#if 0 + #define DEBUG(msg, ...) \ + do \ + { \ + cerr << fmt::format(msg, __VA_ARGS__) << "\n"; \ + } while (0) +#else + #define DEBUG(msg, ...) \ + do \ + { \ + } while (0) +#endif + +/* + REGULAR EXPRESSION SYNTAX: + -------------------------- + + expr := alternation + alternation := concatenation ('|' concatenation)* + concatenation := closure (closure)* + closure := atom ['*' | '?' | '{' NUM [',' NUM] '}'] + atom := character + | '^' + | '$' + | '<>' + | '"' LITERAL '"' + | characterClass + | '(' expr ')' + | EPSILON + characterClass := '[' ['^'] characterClassFragment+ ']' + characterClassFragment := character | character '-' character +*/ + +namespace regex_dfa +{ + +RegExprParser::RegExprParser(): input_ {}, currentChar_ { input_.end() }, line_ { 1 }, column_ { 0 } +{ +} + +int RegExprParser::currentChar() const +{ + if (currentChar_ != input_.end()) + return *currentChar_; + else + return -1; +} + +bool RegExprParser::consumeIf(int ch) +{ + if (currentChar() != ch) + return false; + + consume(); + return true; +} + +int RegExprParser::consume() +{ + if (currentChar_ == input_.end()) + return -1; + + int ch = *currentChar_; + if (ch == '\n') + { + line_++; + column_ = 1; + } + else + { + column_++; + } + ++currentChar_; + DEBUG("consume: '{}'", (char) ch); + return ch; +} + +void RegExprParser::consume(int expected) +{ + int actual = currentChar(); + consume(); + if (actual != expected) + { + throw UnexpectedToken { line_, column_, actual, expected }; + } +} + +RegExpr RegExprParser::parse(string_view expr, int line, int column) 
+{ + input_ = move(expr); + currentChar_ = input_.begin(); + line_ = line; + column_ = column; + + return parseExpr(); +} + +RegExpr RegExprParser::parseExpr() +{ + return parseLookAheadExpr(); +} + +RegExpr RegExprParser::parseLookAheadExpr() +{ + RegExpr lhs = parseAlternation(); + + if (currentChar() == '/') + { + consume(); + RegExpr rhs = parseAlternation(); + lhs = LookAheadExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + } + + return lhs; +} + +RegExpr RegExprParser::parseAlternation() +{ + RegExpr lhs = parseConcatenation(); + + while (currentChar() == '|') + { + consume(); + RegExpr rhs = parseConcatenation(); + lhs = AlternationExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + } + + return lhs; +} + +RegExpr RegExprParser::parseConcatenation() +{ + // FOLLOW-set, the set of terminal tokens that can occur right after a concatenation + static const string_view follow = "/|)"; + RegExpr lhs = parseClosure(); + + while (!eof() && follow.find(currentChar()) == follow.npos) + { + RegExpr rhs = parseClosure(); + lhs = ConcatenationExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + } + + return lhs; +} + +RegExpr RegExprParser::parseClosure() +{ + RegExpr subExpr = parseAtom(); + + switch (currentChar()) + { + case '?': consume(); return ClosureExpr { make_unique(move(subExpr)), 0, 1 }; + case '*': consume(); return ClosureExpr { make_unique(move(subExpr)), 0 }; + case '+': consume(); return ClosureExpr { make_unique(move(subExpr)), 1 }; + case '{': { + consume(); + unsigned int m = parseInt(); + if (currentChar() == ',') + { + consume(); + unsigned int n = parseInt(); + consume('}'); + return ClosureExpr { make_unique(move(subExpr)), m, n }; + } + else + { + consume('}'); + return ClosureExpr { make_unique(move(subExpr)), m, m }; + } + } + default: return subExpr; + } +} + +unsigned RegExprParser::parseInt() +{ + unsigned n = 0; + while (isdigit(currentChar())) + { + n *= 10; + n += currentChar() - '0'; + consume(); + } + return n; +} 
+ +RegExpr RegExprParser::parseAtom() +{ + // skip any whitespace (except newlines) + while (!eof() && isspace(currentChar()) && currentChar() != '\n') + consume(); + + switch (currentChar()) + { + case -1: // EOF + case ')': return EmptyExpr {}; + case '<': + consume(); + consume('<'); + consume('E'); + consume('O'); + consume('F'); + consume('>'); + consume('>'); + return EndOfFileExpr {}; + case '(': { + consume(); + RegExpr subExpr = parseExpr(); + consume(')'); + return subExpr; + } + case '"': { + consume(); + RegExpr lhs = CharacterExpr { consume() }; + while (!eof() && currentChar() != '"') + { + RegExpr rhs = CharacterExpr { consume() }; + lhs = ConcatenationExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + } + consume('"'); + return lhs; + } + case '[': return parseCharacterClass(); + case '.': consume(); return DotExpr {}; + case '^': consume(); return BeginOfLineExpr {}; + case '$': consume(); return EndOfLineExpr {}; + default: return CharacterExpr { parseSingleCharacter() }; + } +} + +RegExpr RegExprParser::parseCharacterClass() +{ + consume(); // '[' + const bool complement = consumeIf('^'); // TODO + + SymbolSet ss; + parseCharacterClassFragment(ss); + while (!eof() && currentChar() != ']') + parseCharacterClassFragment(ss); + + if (complement) + ss.complement(); + + consume(']'); + return CharacterClassExpr { move(ss) }; +} + +void RegExprParser::parseNamedCharacterClass(SymbolSet& ss) +{ + consume('['); + consume(':'); + string token; + while (isalpha(currentChar())) + { + token += static_cast(consume()); + } + consume(':'); + consume(']'); + + static const unordered_map> names = { + { "alnum", + [](SymbolSet& ss) { + for (Symbol c = 'a'; c <= 'z'; c++) + ss.insert(c); + for (Symbol c = 'A'; c <= 'Z'; c++) + ss.insert(c); + for (Symbol c = '0'; c <= '9'; c++) + ss.insert(c); + } }, + { "alpha", + [](SymbolSet& ss) { + for (Symbol c = 'a'; c <= 'z'; c++) + ss.insert(c); + for (Symbol c = 'A'; c <= 'Z'; c++) + ss.insert(c); + } }, + { 
"blank", + [](SymbolSet& ss) { + ss.insert(' '); + ss.insert('\t'); + } }, + { "cntrl", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (iscntrl(c)) + ss.insert(c); + } }, + { "digit", + [](SymbolSet& ss) { + for (Symbol c = '0'; c <= '9'; c++) + ss.insert(c); + } }, + { "graph", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (isgraph(c)) + ss.insert(c); + } }, + { "lower", + [](SymbolSet& ss) { + for (Symbol c = 'a'; c <= 'z'; c++) + ss.insert(c); + } }, + { "print", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (isprint(c) || c == ' ') + ss.insert(c); + } }, + { "punct", + [](SymbolSet& ss) { + for (Symbol c = 0; c <= 255; c++) + if (ispunct(c)) + ss.insert(c); + } }, + { "space", + [](SymbolSet& ss) { + for (Symbol c: "\f\n\r\t\v") + ss.insert(c); + } }, + { "upper", + [](SymbolSet& ss) { + for (Symbol c = 'A'; c <= 'Z'; c++) + ss.insert(c); + } }, + { "xdigit", + [](SymbolSet& ss) { + for (Symbol c = '0'; c <= '9'; c++) + ss.insert(c); + for (Symbol c = 'a'; c <= 'f'; c++) + ss.insert(c); + for (Symbol c = 'A'; c <= 'F'; c++) + ss.insert(c); + } }, + }; + + if (auto i = names.find(token); i != names.end()) + i->second(ss); + else + throw UnexpectedToken { line_, column_, token, "" }; +} + +Symbol RegExprParser::parseSingleCharacter() +{ + if (currentChar() != '\\') + return consume(); + + consume(); // consumes escape character + switch (currentChar()) + { + case 'a': consume(); return '\a'; + case 'b': consume(); return '\b'; + case 'f': consume(); return '\f'; + case 'n': consume(); return '\n'; + case 'r': consume(); return '\r'; + case 's': consume(); return ' '; + case 't': consume(); return '\t'; + case 'v': consume(); return '\v'; + case 'x': { + consume(); + + char buf[3]; + buf[0] = consume(); + if (!isxdigit(buf[0])) + throw UnexpectedToken { line_, column_, string(1, buf[0]), "[0-9a-fA-F]" }; + buf[1] = consume(); + if (!isxdigit(buf[1])) + throw UnexpectedToken { line_, column_, string(1, buf[1]), 
"[0-9a-fA-F]" }; + buf[2] = 0; + + return static_cast(strtoul(buf, nullptr, 16)); + } + case '0': { + const Symbol x0 = consume(); + if (!isdigit(currentChar())) + return '\0'; + + // octal value (\DDD) + char buf[4]; + buf[0] = x0; + buf[1] = consume(); + if (!(buf[1] >= '0' && buf[1] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[1]), "[0-7]" }; + buf[2] = consume(); + if (!(buf[2] >= '0' && buf[2] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[2]), "[0-7]" }; + buf[3] = '\0'; + + return static_cast(strtoul(buf, nullptr, 8)); + } + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { + // octal value (\DDD) + char buf[4]; + buf[0] = consume(); + buf[1] = consume(); + if (!(buf[1] >= '0' && buf[1] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[1]), "[0-7]" }; + buf[2] = consume(); + if (!(buf[2] >= '0' && buf[2] <= '7')) + throw UnexpectedToken { line_, column_, string(1, buf[2]), "[0-7]" }; + buf[3] = '\0'; + + return static_cast(strtoul(buf, nullptr, 8)); + } + case '"': + case '$': + case '(': + case ')': + case '*': + case '+': + case ':': + case '?': + case '[': + case '\'': + case '\\': + case ']': + case '^': + case '{': + case '}': + case '.': + case '/': return consume(); + default: { + throw UnexpectedToken { line_, + column_, + fmt::format("'{}'", static_cast(currentChar())), + "" }; + } + } +} + +void RegExprParser::parseCharacterClassFragment(SymbolSet& ss) +{ + // parse [:named:] + if (currentChar() == '[') + { + parseNamedCharacterClass(ss); + return; + } + + // parse single char (A) or range (A-Z) + const Symbol c1 = parseSingleCharacter(); + if (currentChar() != '-') + { + ss.insert(c1); + return; + } + + consume(); // consume '-' + const Symbol c2 = parseSingleCharacter(); + + for (Symbol c_i = c1; c_i <= c2; c_i++) + ss.insert(c_i); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExprParser.h b/src/regex_dfa/RegExprParser.h new file mode 100644 
index 0000000000..d7dccb2302 --- /dev/null +++ b/src/regex_dfa/RegExprParser.h @@ -0,0 +1,91 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include + +#include + +#include +#include + +namespace regex_dfa +{ + +class SymbolSet; + +class RegExprParser +{ + public: + RegExprParser(); + + RegExpr parse(std::string_view expr, int line, int column); + + RegExpr parse(std::string_view expr) { return parse(std::move(expr), 1, 1); } + + class UnexpectedToken: public std::runtime_error + { + public: + UnexpectedToken(unsigned int line, unsigned int column, std::string actual, std::string expected): + std::runtime_error { fmt::format( + "[{}:{}] Unexpected token {}. Expected {} instead.", line, column, actual, expected) }, + line_ { line }, + column_ { column }, + actual_ { std::move(actual) }, + expected_ { std::move(expected) } + { + } + + UnexpectedToken(unsigned int line, unsigned int column, int actual, int expected): + UnexpectedToken { line, + column, + actual == -1 ? 
"EOF" : fmt::format("{}", static_cast(actual)), + std::string(1, static_cast(expected)) } + { + } + + unsigned int line() const noexcept { return line_; } + unsigned int column() const noexcept { return column_; } + const std::string& actual() const noexcept { return actual_; } + const std::string& expected() const noexcept { return expected_; } + + private: + unsigned int line_; + unsigned int column_; + std::string actual_; + std::string expected_; + }; + + private: + int currentChar() const; + bool eof() const noexcept { return currentChar() == -1; } + bool consumeIf(int ch); + void consume(int ch); + int consume(); + unsigned parseInt(); + + RegExpr parse(); // expr + RegExpr parseExpr(); // lookahead + RegExpr parseLookAheadExpr(); // alternation ('/' alternation)? + RegExpr parseAlternation(); // concatenation ('|' concatenation)* + RegExpr parseConcatenation(); // closure (closure)* + RegExpr parseClosure(); // atom ['*' | '?' | '{' NUM [',' NUM] '}'] + RegExpr parseAtom(); // character | characterClass | '(' expr ')' + RegExpr parseCharacterClass(); // '[' characterClassFragment+ ']' + void parseCharacterClassFragment(SymbolSet& ss); // namedClass | character | character '-' character + void parseNamedCharacterClass(SymbolSet& ss); // '[' ':' NAME ':' ']' + Symbol parseSingleCharacter(); + + private: + std::string_view input_; + std::string_view::iterator currentChar_; + unsigned int line_; + unsigned int column_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/RegExprParser_test.cpp b/src/regex_dfa/RegExprParser_test.cpp new file mode 100644 index 0000000000..d2281e5ed1 --- /dev/null +++ b/src/regex_dfa/RegExprParser_test.cpp @@ -0,0 +1,299 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include + +#include + +using namespace std; +using namespace regex_dfa; + +TEST(regex_RegExprParser, namedCharacterClass_graph) +{ + RegExpr re = RegExprParser {}.parse("[[:graph:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("!-~", to_string(re)); +} + +TEST(regex_RegExprParser, whitespaces_concatination) +{ + RegExpr re = RegExprParser {}.parse("a b"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("ab", to_string(re)); +} + +TEST(regex_RegExprParser, whitespaces_alternation) +{ + RegExpr re = RegExprParser {}.parse("a | b"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("a|b", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_digit) +{ + RegExpr re = RegExprParser {}.parse("[[:digit:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("0-9", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_alnum) +{ + RegExpr re = RegExprParser {}.parse("[[:alnum:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("0-9A-Za-z", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_alpha) +{ + RegExpr re = RegExprParser {}.parse("[[:alpha:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("A-Za-z", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_blank) +{ + RegExpr re = RegExprParser {}.parse("[[:blank:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("\\t\\s", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_cntrl) +{ + RegExpr re = RegExprParser {}.parse("[[:cntrl:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("\\0-\\x1f\\x7f", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_print) +{ + RegExpr re = RegExprParser {}.parse("[[:print:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("\\s-~", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_punct) +{ + RegExpr re = RegExprParser {}.parse("[[:punct:]]"); 
+ ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("!-/:-@[-`{-~", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_space) +{ + RegExpr re = RegExprParser {}.parse("[[:space:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("\\0\\t-\\r", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_unknown) +{ + EXPECT_THROW(RegExprParser {}.parse("[[:unknown:]]"), RegExprParser::UnexpectedToken); +} + +TEST(regex_RegExprParser, namedCharacterClass_upper) +{ + RegExpr re = RegExprParser {}.parse("[[:upper:]]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("A-Z", to_string(re)); +} + +TEST(regex_RegExprParser, namedCharacterClass_mixed) +{ + RegExpr re = RegExprParser {}.parse("[[:lower:]0-9]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("0-9a-z", to_string(re)); +} + +TEST(regex_RegExprParser, characterClass_complement) +{ + RegExpr re = RegExprParser {}.parse("[^\\n]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_TRUE(get(re).symbols.isDot()); + EXPECT_EQ(".", get(re).symbols.to_string()); +} + +TEST(regex_RegExprParser, escapeSequences_invalid) +{ + EXPECT_THROW(RegExprParser {}.parse("[\\z]"), RegExprParser::UnexpectedToken); +} + +TEST(regex_RegExprParser, escapeSequences_abfnrstv) +{ + EXPECT_EQ("\\a", to_string(RegExprParser {}.parse("[\\a]"))); + EXPECT_EQ("\\b", to_string(RegExprParser {}.parse("[\\b]"))); + EXPECT_EQ("\\f", to_string(RegExprParser {}.parse("[\\f]"))); + EXPECT_EQ("\\n", to_string(RegExprParser {}.parse("[\\n]"))); + EXPECT_EQ("\\r", to_string(RegExprParser {}.parse("[\\r]"))); + EXPECT_EQ("\\s", to_string(RegExprParser {}.parse("[\\s]"))); + EXPECT_EQ("\\t", to_string(RegExprParser {}.parse("[\\t]"))); + EXPECT_EQ("\\v", to_string(RegExprParser {}.parse("[\\v]"))); +} + +TEST(regex_RegExprParser, newline) +{ + RegExpr re = RegExprParser {}.parse("\n"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ('\n', get(re).value); +} + +TEST(regex_RegExprParser, escapeSequences_hex) +{ + RegExpr re = 
RegExprParser {}.parse("[\\x20]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("\\s", get(re).symbols.to_string()); + + EXPECT_THROW(RegExprParser {}.parse("[\\xZZ]"), RegExprParser::UnexpectedToken); + EXPECT_THROW(RegExprParser {}.parse("[\\xAZ]"), RegExprParser::UnexpectedToken); + EXPECT_THROW(RegExprParser {}.parse("[\\xZA]"), RegExprParser::UnexpectedToken); +} + +TEST(regex_RegExprParser, escapeSequences_nul) +{ + RegExpr re = RegExprParser {}.parse("[\\0]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("\\0", get(re).symbols.to_string()); +} + +TEST(regex_RegExprParser, escapeSequences_octal) +{ + // with leading zero + RegExpr re = RegExprParser {}.parse("[\\040]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("\\s", get(re).symbols.to_string()); + + // with leading non-zero + re = RegExprParser {}.parse("[\\172]"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("z", get(re).symbols.to_string()); + + // invalids + EXPECT_THROW(RegExprParser {}.parse("[\\822]"), RegExprParser::UnexpectedToken); + EXPECT_THROW(RegExprParser {}.parse("[\\282]"), RegExprParser::UnexpectedToken); + EXPECT_THROW(RegExprParser {}.parse("[\\228]"), RegExprParser::UnexpectedToken); + EXPECT_THROW(RegExprParser {}.parse("[\\082]"), RegExprParser::UnexpectedToken); + EXPECT_THROW(RegExprParser {}.parse("[\\028]"), RegExprParser::UnexpectedToken); +} + +TEST(regex_RegExprParser, doubleQuote) +{ + // as concatenation character + RegExpr re = RegExprParser {}.parse(R"(\")"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ('"', get(re).value); + + // as character class + re = RegExprParser {}.parse(R"([\"])"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ(R"(")", get(re).symbols.to_string()); +} + +TEST(regex_RegExprParser, dot) +{ + RegExpr re = RegExprParser {}.parse("."); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ(".", to_string(re)); +} + +TEST(regex_RegExprParser, optional) +{ + RegExpr re = RegExprParser {}.parse("a?"); + 
ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("a?", to_string(re)); +} + +TEST(regex_RegExprParser, bol) +{ + RegExpr re = RegExprParser {}.parse("^a"); + ASSERT_TRUE(holds_alternative(re)); + const ConcatenationExpr& cat = get(re); + + ASSERT_TRUE(holds_alternative(*cat.left)); + EXPECT_EQ("^", to_string(*cat.left)); + EXPECT_EQ("a", to_string(*cat.right)); +} + +TEST(regex_RegExprParser, eol) +{ + RegExpr re = RegExprParser {}.parse("a$"); + ASSERT_TRUE(holds_alternative(re)); + const ConcatenationExpr& cat = get(re); + + ASSERT_TRUE(holds_alternative(*cat.right)); + EXPECT_EQ("a$", to_string(re)); +} + +TEST(regex_RegExprParser, eof) +{ + RegExpr re = RegExprParser {}.parse("<>"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("<>", to_string(re)); +} + +TEST(regex_RegExprParser, alternation) +{ + EXPECT_EQ("a|b", to_string(RegExprParser {}.parse("a|b"))); + EXPECT_EQ("(a|b)c", to_string(RegExprParser {}.parse("(a|b)c"))); + EXPECT_EQ("a(b|c)", to_string(RegExprParser {}.parse("a(b|c)"))); +} + +TEST(regex_RegExprParser, lookahead) +{ + RegExpr re = RegExprParser {}.parse("ab/cd"); + ASSERT_TRUE(holds_alternative(re)); + EXPECT_EQ("ab/cd", to_string(re)); + EXPECT_EQ("(a/b)|b", to_string(RegExprParser {}.parse("(a/b)|b"))); + EXPECT_EQ("a|(b/c)", to_string(RegExprParser {}.parse("a|(b/c)"))); +} + +TEST(regex_RegExprParser, closure) +{ + RegExpr re = RegExprParser {}.parse("(abc)*"); + ASSERT_TRUE(holds_alternative(re)); + const ClosureExpr& e = get(re); + EXPECT_EQ(0, e.minimumOccurrences); + EXPECT_EQ(numeric_limits::max(), e.maximumOccurrences); + EXPECT_EQ("(abc)*", to_string(re)); +} + +TEST(regex_RegExprParser, positive) +{ + auto re = RegExprParser {}.parse("(abc)+"); + ASSERT_TRUE(holds_alternative(re)); + const ClosureExpr& e = get(re); + EXPECT_EQ(1, e.minimumOccurrences); + EXPECT_EQ(numeric_limits::max(), e.maximumOccurrences); + EXPECT_EQ("(abc)+", to_string(re)); +} + +TEST(regex_RegExprParser, closure_range) +{ + auto re = RegExprParser 
{}.parse("a{2,4}"); + ASSERT_TRUE(holds_alternative(re)); + const ClosureExpr& e = get(re); + EXPECT_EQ(2, e.minimumOccurrences); + EXPECT_EQ(4, e.maximumOccurrences); + EXPECT_EQ("a{2,4}", to_string(re)); +} + +TEST(regex_RegExprParser, empty) +{ + auto re = RegExprParser {}.parse("(a|)"); + EXPECT_EQ("a|", to_string(re)); // grouping '(' & ')' is not preserved as node in the parse tree. +} + +TEST(regex_RegExprParser, UnexpectedToken_grouping) +{ + EXPECT_THROW(RegExprParser {}.parse("(a"), RegExprParser::UnexpectedToken); +} + +TEST(regex_RegExprParser, UnexpectedToken_literal) +{ + EXPECT_THROW(RegExprParser {}.parse("\"a"), RegExprParser::UnexpectedToken); +} diff --git a/src/regex_dfa/Report.cpp b/src/regex_dfa/Report.cpp new file mode 100644 index 0000000000..d7a1b5d281 --- /dev/null +++ b/src/regex_dfa/Report.cpp @@ -0,0 +1,109 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include + +using namespace std; +using namespace regex_dfa; + +// {{{ Message +string Report::Message::to_string() const +{ + switch (type) + { + case Type::Warning: return fmt::format("[{}] {}", sourceLocation, text); + case Type::LinkError: return fmt::format("{}: {}", type, text); + default: return fmt::format("[{}] {}: {}", sourceLocation, type, text); + } +} + +bool Report::Message::operator==(const Message& other) const noexcept +{ + // XXX ignore SourceLocation's filename & end + return type == other.type && sourceLocation.offset == other.sourceLocation.offset && text == other.text; +} +// }}} +// {{{ ConsoleReport +void ConsoleReport::onMessage(Message&& message) +{ + switch (message.type) + { + case Type::Warning: cerr << fmt::format("Warning: {}\n", message); break; + default: cerr << fmt::format("Error: {}\n", message); break; + } +} +// }}} +// {{{ BufferedReport +void BufferedReport::onMessage(Message&& msg) +{ + messages_.emplace_back(move(msg)); +} + +void BufferedReport::clear() +{ + messages_.clear(); +} + +string BufferedReport::to_string() const +{ + stringstream sstr; + for (const Message& message: messages_) + { + switch (message.type) + { + case Type::Warning: sstr << "Warning: " << message.to_string() << "\n"; break; + default: sstr << "Error: " << message.to_string() << "\n"; break; + } + } + return sstr.str(); +} + +bool BufferedReport::operator==(const BufferedReport& other) const noexcept +{ + if (size() != other.size()) + return false; + + for (size_t i = 0, e = size(); i != e; ++i) + if (messages_[i] != other.messages_[i]) + return false; + + return true; +} + +bool BufferedReport::contains(const Message& message) const noexcept +{ + for (const Message& m: messages_) + if (m == message) + return true; + + return false; +} + +DifferenceReport difference(const BufferedReport& first, const BufferedReport& second) +{ + DifferenceReport 
diff; + + for (const Report::Message& m: first) + if (!second.contains(m)) + diff.first.push_back(m); + + for (const Report::Message& m: second) + if (!first.contains(m)) + diff.second.push_back(m); + + return diff; +} + +ostream& operator<<(ostream& os, const BufferedReport& report) +{ + os << report.to_string(); + return os; +} +// }}} diff --git a/src/regex_dfa/Report.h b/src/regex_dfa/Report.h new file mode 100644 index 0000000000..86efcdf8e2 --- /dev/null +++ b/src/regex_dfa/Report.h @@ -0,0 +1,223 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include + +namespace regex_dfa +{ + +class Report +{ + public: + enum class Type + { + TokenError, + SyntaxError, + TypeError, + Warning, + LinkError + }; + + struct Message + { + Type type; + SourceLocation sourceLocation; + std::string text; + + Message(Type type, SourceLocation sloc, std::string text): + type { type }, sourceLocation { std::move(sloc) }, text { std::move(text) } + { + } + + [[nodiscard]] std::string to_string() const; + + bool operator==(const Message& other) const noexcept; + bool operator!=(const Message& other) const noexcept { return !(*this == other); } + }; + + using MessageList = std::vector; + using Reporter = std::function; + + explicit Report(Reporter reporter): onReport_ { std::move(reporter) } {} + + template + void tokenError(const SourceLocation& sloc, const std::string& f, Args&&... args) + { + report(Type::TokenError, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void syntaxError(const SourceLocation& sloc, const std::string& f, Args&&... 
args) + { + report(Type::SyntaxError, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void typeError(const SourceLocation& sloc, const std::string& f, Args&&... args) + { + report(Type::TypeError, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void warning(const SourceLocation& sloc, const std::string& f, Args&&... args) + { + report(Type::Warning, sloc, fmt::format(f, std::forward(args)...)); + } + + template + void linkError(const std::string& f, Args&&... args) + { + report(Type::LinkError, SourceLocation {}, fmt::format(f, std::forward(args)...)); + } + + void report(Type type, SourceLocation sloc, std::string text) + { + if (type != Type::Warning) + errorCount_++; + + if (onReport_) + { + onReport_(Message(type, std::move(sloc), std::move(text))); + } + } + + [[nodiscard]] bool containsFailures() const noexcept { return errorCount_ != 0; } + + private: + size_t errorCount_ = 0; + Reporter onReport_; +}; + +class ConsoleReport: public Report +{ + public: + ConsoleReport(): Report(std::bind(&ConsoleReport::onMessage, this, std::placeholders::_1)) {} + + private: + void onMessage(Message&& msg); +}; + +class BufferedReport: public Report +{ + public: + BufferedReport(): Report(std::bind(&BufferedReport::onMessage, this, std::placeholders::_1)), messages_ {} + { + } + + [[nodiscard]] std::string to_string() const; + + [[nodiscard]] const MessageList& messages() const noexcept { return messages_; } + + void clear(); + [[nodiscard]] size_t size() const noexcept { return messages_.size(); } + [[nodiscard]] const Message& operator[](size_t i) const { return messages_[i]; } + + using iterator = MessageList::iterator; + using const_iterator = MessageList::const_iterator; + + [[nodiscard]] iterator begin() noexcept { return messages_.begin(); } + [[nodiscard]] iterator end() noexcept { return messages_.end(); } + [[nodiscard]] const_iterator begin() const noexcept { return messages_.begin(); } + [[nodiscard]] const_iterator end() const 
noexcept { return messages_.end(); } + + [[nodiscard]] bool contains(const Message& m) const noexcept; + + [[nodiscard]] bool operator==(const BufferedReport& other) const noexcept; + [[nodiscard]] bool operator!=(const BufferedReport& other) const noexcept { return !(*this == other); } + + private: + void onMessage(Message&& msg); + + private: + MessageList messages_; +}; + +std::ostream& operator<<(std::ostream& os, const BufferedReport& report); + +using DifferenceReport = std::pair; + +DifferenceReport difference(const BufferedReport& first, const BufferedReport& second); + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter: formatter +{ + using Type = regex_dfa::Report::Type; + + static std::string_view to_stringview(Type t) + { + switch (t) + { + case Type::TokenError: return "TokenError"; + case Type::SyntaxError: return "SyntaxError"; + case Type::TypeError: return "TypeError"; + case Type::Warning: return "Warning"; + case Type::LinkError: return "LinkError"; + default: return "???"; + } + } + + template + constexpr auto format(Type v, FormatContext& ctx) + { + return formatter::format(to_stringview(v), ctx); + } +}; +} // namespace fmt + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::SourceLocation& sloc, FormatContext& ctx) + { + return format_to(ctx.out(), "{} ({}-{})", sloc.filename, sloc.offset, sloc.offset + sloc.count); + } +}; +} // namespace fmt + +namespace fmt +{ +template <> +struct formatter +{ + using Message = regex_dfa::Report::Message; + + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const Message& v, FormatContext& ctx) + { + return format_to(ctx.out(), "{}", v.to_string()); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/Rule.h b/src/regex_dfa/Rule.h new file mode 100644 index 
0000000000..abbc0d244a --- /dev/null +++ b/src/regex_dfa/Rule.h @@ -0,0 +1,137 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include // IgnoreTag +#include +#include +#include // Tag + +#include +#include +#include +#include + +namespace regex_dfa +{ + +struct Rule +{ + unsigned int line; + unsigned int column; + Tag tag; + std::vector conditions; + std::string name; + std::string pattern; + std::unique_ptr regexpr = nullptr; + + bool isIgnored() const noexcept { return tag == IgnoreTag; } + + Rule clone() const + { + return regexpr ? Rule { line, + column, + tag, + conditions, + name, + pattern, + std::make_unique(RegExprParser {}.parse(pattern, line, column)) } + : Rule { line, column, tag, conditions, name, pattern, nullptr }; + } + + Rule() = default; + + Rule(unsigned _line, + unsigned _column, + Tag _tag, + std::vector _conditions, + std::string _name, + std::string _pattern, + std::unique_ptr _regexpr = nullptr): + line { _line }, + column { _column }, + tag { _tag }, + conditions { _conditions }, + name { _name }, + pattern { _pattern }, + regexpr { std::move(_regexpr) } + { + } + + Rule(const Rule& v): + line { v.line }, + column { v.column }, + tag { v.tag }, + conditions { v.conditions }, + name { v.name }, + pattern { v.pattern }, + regexpr { v.regexpr ? std::make_unique(RegExprParser {}.parse(pattern, line, column)) + : nullptr } + { + } + + Rule& operator=(const Rule& v) + { + line = v.line; + column = v.column; + tag = v.tag; + conditions = v.conditions; + name = v.name; + pattern = v.pattern; + regexpr = + v.regexpr ? 
std::make_unique(RegExprParser {}.parse(pattern, line, column)) : nullptr; + return *this; + } + + bool operator<(const Rule& rhs) const noexcept { return tag < rhs.tag; } + bool operator<=(const Rule& rhs) const noexcept { return tag <= rhs.tag; } + bool operator==(const Rule& rhs) const noexcept { return tag == rhs.tag; } + bool operator!=(const Rule& rhs) const noexcept { return tag != rhs.tag; } + bool operator>=(const Rule& rhs) const noexcept { return tag >= rhs.tag; } + bool operator>(const Rule& rhs) const noexcept { return tag > rhs.tag; } +}; + +using RuleList = std::vector; + +inline bool ruleContainsBeginOfLine(const Rule& r) +{ + return containsBeginOfLine(*r.regexpr); +} + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::Rule& v, FormatContext& ctx) + { + if (!v.conditions.empty()) + { + format_to(ctx.out(), "<"); + for (size_t i = 0; i < v.conditions.size(); ++i) + if (i != 0) + format_to(ctx.out(), ", {}", v.conditions[i]); + else + format_to(ctx.out(), "{}", v.conditions[i]); + format_to(ctx.out(), ">"); + } + if (v.tag == regex_dfa::IgnoreTag) + return format_to(ctx.out(), "{}({}) ::= {}", v.name, "ignore", v.pattern); + else + return format_to(ctx.out(), "{}({}) ::= {}", v.name, v.tag, v.pattern); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/RuleParser.cpp b/src/regex_dfa/RuleParser.cpp new file mode 100644 index 0000000000..1c95f67175 --- /dev/null +++ b/src/regex_dfa/RuleParser.cpp @@ -0,0 +1,379 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include // special tags +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +namespace regex_dfa +{ + +RuleParser::RuleParser(unique_ptr input, int firstTag): + stream_ { move(input) }, + refRules_ {}, + lastParsedRule_ { nullptr }, + lastParsedRuleIsRef_ { false }, + currentChar_ { 0 }, + line_ { 1 }, + column_ { 0 }, + offset_ { 0 }, + nextTag_ { firstTag } +{ + consumeChar(); +} + +RuleParser::RuleParser(string input, int firstTag): + RuleParser { make_unique(move(input)), firstTag } +{ +} + +RuleList RuleParser::parseRules() +{ + RuleList rules; + + for (;;) + { + consumeSpace(); + if (eof()) + { + break; + } + else if (currentChar() == '\n') + { + consumeChar(); + } + else + { + parseRule(rules); + } + } + + // collect all condition labels, find all <*>-conditions, then replace their <*> with {collected + // conditions} + set conditions; + list starRules; + for (Rule& rule: rules) + { + for (const string& condition: rule.conditions) + { + if (condition != "*") + { + conditions.emplace(condition); + } + else + { + rule.conditions.clear(); + starRules.emplace_back(&rule); + } + } + } + for (Rule* rule: starRules) + for (const string& condition: conditions) + rule->conditions.emplace_back(condition); + + return rules; +} + +void RuleParser::parseRule(RuleList& rules) +{ + // Rule ::= RuleConditionList? BasicRule + // | RuleConditionList '{' BasicRule* '}' (LF | EOF)? + // BasicRule ::= TOKEN RuleOptions? SP '::=' SP RegEx SP? 
(LF | EOF) + // RuleOptions ::= '(' RuleOption (',' RuleOption)* + // RuleOption ::= ignore + + consumeSP(); + if (currentChar_ == '|' && lastParsedRule_ != nullptr) + { + consumeChar(); + consumeSP(); + const string pattern = parseExpression(); + lastParsedRule_->pattern += '|' + pattern; + return; + } + + // finalize ref-rule by surrounding it with round braces + if (lastParsedRuleIsRef_) + lastParsedRule_->pattern = fmt::format("({})", lastParsedRule_->pattern); + + vector conditions = parseRuleConditions(); + consumeSP(); + if (!conditions.empty() && currentChar() == '{') + { + consumeChar(); + consumeAnySP(); // allow whitespace, including LFs + while (!eof() && currentChar() != '}') + { + parseBasicRule(rules, vector(conditions)); + consumeSP(); // part of the next line, allow indentation + } + consumeChar('}'); + consumeSP(); + if (currentChar() == '\n') + consumeChar(); + else if (!eof()) + throw UnexpectedChar { line_, column_, currentChar_, '\n' }; + } + else + { + parseBasicRule(rules, move(conditions)); + } +} + +struct TestRuleForName +{ + string name; + bool operator()(const Rule& r) const { return r.name == name; } +}; + +void RuleParser::parseBasicRule(RuleList& rules, vector&& conditions) +{ + const unsigned int beginLine = line_; + const unsigned int beginColumn = column_; + + string token = consumeToken(); + bool ignore = false; + bool ref = false; + if (currentChar_ == '(') + { + consumeChar(); + unsigned optionOffset = offset_; + string option = consumeToken(); + consumeChar(')'); + + if (option == "ignore") + ignore = true; + else if (option == "ref") + ref = true; + else + throw InvalidRuleOption { optionOffset, option }; + } + consumeSP(); + consumeAssoc(); + consumeSP(); + const unsigned int line = line_; + const unsigned int column = column_; + const string pattern = parseExpression(); + if (currentChar() == '\n') + consumeChar(); + else if (!eof()) + throw UnexpectedChar { line_, column_, currentChar_, '\n' }; + + const Tag tag = [&] { + 
if (ignore || ref) + return IgnoreTag; + else if (auto i = find_if(rules.begin(), rules.end(), TestRuleForName { token }); i != rules.end()) + return i->tag; + else + return nextTag_++; + }(); + + if (ref && !conditions.empty()) + throw InvalidRefRuleWithConditions { beginLine, + beginColumn, + Rule { line, column, tag, move(conditions), token, pattern } }; + + if (conditions.empty()) + conditions.emplace_back("INITIAL"); + + sort(conditions.begin(), conditions.end()); + + if (!ref) + { + if (auto i = find_if(rules.begin(), rules.end(), TestRuleForName { token }); i != rules.end()) + { + throw DuplicateRule { Rule { line, column, tag, move(conditions), token, pattern }, *i }; + } + else + { + rules.emplace_back(Rule { line, column, tag, conditions, token, pattern }); + lastParsedRule_ = &rules.back(); + lastParsedRuleIsRef_ = false; + } + } + else if (auto i = refRules_.find(token); i != refRules_.end()) + { + throw DuplicateRule { Rule { line, column, tag, move(conditions), token, pattern }, i->second }; + } + else + { + // TODO: throw if !conditions.empty(); + refRules_[token] = { line, column, tag, {}, token, pattern }; + lastParsedRule_ = &refRules_[token]; + lastParsedRuleIsRef_ = true; + } +} + +vector RuleParser::parseRuleConditions() +{ + // RuleConditionList ::= '<' ('*' | TOKEN (',' SP* TOKEN)) '>' + if (currentChar() != '<') + return {}; + + consumeChar(); + + if (currentChar() == '*') + { + consumeChar(); + consumeChar('>'); + return { "*" }; + } + + vector conditions { consumeToken() }; + + while (currentChar() == ',') + { + consumeChar(); + consumeSP(); + conditions.emplace_back(consumeToken()); + } + + consumeChar('>'); + + return conditions; +} + +string RuleParser::parseExpression() +{ + // expression ::= " .... " + // | .... 
+ + stringstream sstr; + + size_t i = 0; + size_t lastGraph = 0; + while (!eof() && currentChar_ != '\n') + { + if (isgraph(currentChar_)) + lastGraph = i + 1; + i++; + sstr << consumeChar(); + } + string pattern = sstr.str().substr(0, lastGraph); // skips trailing spaces + + // replace all occurrences of {ref} + for (const pair& ref: refRules_) + { + const Rule& rule = ref.second; + const string name = fmt::format("{{{}}}", rule.name); + // for (size_t i = 0; (i = pattern.find(name, i)) != string::npos; i += rule.pattern.size()) { + // pattern.replace(i, name.size(), rule.pattern); + // } + size_t i = 0; + while ((i = pattern.find(name, i)) != string::npos) + { + pattern.replace(i, name.size(), rule.pattern); + i += rule.pattern.size(); + } + } + + return pattern; +} + +// skips space until LF or EOF +void RuleParser::consumeSpace() +{ + for (;;) + { + switch (currentChar_) + { + case ' ': + case '\t': + case '\r': consumeChar(); break; + case '#': + while (!eof() && currentChar_ != '\n') + { + consumeChar(); + } + break; + default: return; + } + } +} + +char RuleParser::currentChar() const noexcept +{ + return currentChar_; +} + +char RuleParser::consumeChar(char ch) +{ + if (currentChar_ != ch) + throw UnexpectedChar { line_, column_, currentChar_, ch }; + + return consumeChar(); +} + +char RuleParser::consumeChar() +{ + char t = currentChar_; + + currentChar_ = stream_->get(); + if (!stream_->eof()) + { + offset_++; + if (t == '\n') + { + line_++; + column_ = 1; + } + else + { + column_++; + } + } + + return t; +} + +bool RuleParser::eof() const noexcept +{ + return currentChar_ < 0 || stream_->eof(); +} + +string RuleParser::consumeToken() +{ + stringstream sstr; + + if (!isalpha(currentChar_) || currentChar_ == '_') + throw UnexpectedToken { offset_, currentChar_, "Token" }; + + do + sstr << consumeChar(); + while (isalnum(currentChar_) || currentChar_ == '_'); + + return sstr.str(); +} + +void RuleParser::consumeAnySP() +{ + while (currentChar_ == ' ' || 
currentChar_ == '\t' || currentChar_ == '\n') + consumeChar(); +} + +void RuleParser::consumeSP() +{ + while (currentChar_ == ' ' || currentChar_ == '\t') + consumeChar(); +} + +void RuleParser::consumeAssoc() +{ + consumeChar(':'); + consumeChar(':'); + consumeChar('='); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/RuleParser.h b/src/regex_dfa/RuleParser.h new file mode 100644 index 0000000000..817e945fa9 --- /dev/null +++ b/src/regex_dfa/RuleParser.h @@ -0,0 +1,187 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +class RuleParser +{ + public: + explicit RuleParser(std::unique_ptr input, int firstTerminalId = FirstUserTag); + explicit RuleParser(std::string input, int firstTerminalId = FirstUserTag); + + RuleList parseRules(); + + class UnexpectedChar; + class UnexpectedToken; + class InvalidRuleOption; + class InvalidRefRuleWithConditions; + class DuplicateRule; + + private: + void parseRule(RuleList& rules); + std::vector parseRuleConditions(); + void parseBasicRule(RuleList& rules, std::vector&& conditions); + std::string parseExpression(); + + private: + std::string consumeToken(); + void consumeAnySP(); + void consumeSP(); + void consumeAssoc(); + void consumeSpace(); + char currentChar() const noexcept; + char consumeChar(char ch); + char consumeChar(); + bool eof() const noexcept; + std::string replaceRefs(const std::string& pattern); + + private: + std::unique_ptr stream_; + std::map refRules_; + Rule* lastParsedRule_; + bool lastParsedRuleIsRef_; + char currentChar_; + unsigned int line_; + unsigned int column_; + unsigned int offset_; + int 
nextTag_; +}; + +class RuleParser::InvalidRefRuleWithConditions: public std::runtime_error +{ + public: + InvalidRefRuleWithConditions(unsigned line, unsigned column, Rule&& rule): + std::runtime_error { fmt::format( + "{}:{}: Invalid rule \"{}\". Reference rules must not be labelled with conditions.", + line, + column, + rule.name) }, + rule_ { std::move(rule) } + { + } + + const Rule& rule() const noexcept { return rule_; } + + private: + const Rule rule_; +}; + +class RuleParser::DuplicateRule: public std::runtime_error +{ + public: + DuplicateRule(Rule&& duplicate, const Rule& other): + std::runtime_error { fmt::format( + "{}:{}: Duplicated rule definition with name \"{}\", previously defined in {}:{}.", + duplicate.line, + duplicate.column, + duplicate.name, + other.line, + other.column) }, + duplicate_ { std::move(duplicate) }, + other_ { other } + { + } + + const Rule& duplicate() const noexcept { return duplicate_; } + const Rule& other() const noexcept { return other_; } + + private: + const Rule duplicate_; + const Rule& other_; +}; + +class RuleParser::UnexpectedToken: public std::runtime_error +{ + public: + UnexpectedToken(unsigned offset, char actual, std::string expected): + std::runtime_error { fmt::format( + "{}: Unexpected token {}, expected <{}> instead.", offset, actual, expected) }, + offset_ { offset }, + actual_ { std::move(actual) }, + expected_ { std::move(expected) } + { + } + + unsigned offset() const noexcept { return offset_; } + char actual() const noexcept { return actual_; } + const std::string& expected() const noexcept { return expected_; } + + private: + unsigned offset_; + char actual_; + std::string expected_; +}; + +class RuleParser::UnexpectedChar: public std::runtime_error +{ + public: + UnexpectedChar(unsigned int line, unsigned int column, char actual, char expected): + std::runtime_error { fmt::format("[{}:{}] Unexpected char {}, expected {} instead.", + line, + column, + quoted(actual), + quoted(expected)) }, + line_ { 
line }, + column_ { column }, + actual_ { actual }, + expected_ { expected } + { + } + + unsigned int line() const noexcept { return line_; } + unsigned int column() const noexcept { return column_; } + char actual() const noexcept { return actual_; } + char expected() const noexcept { return expected_; } + + private: + static std::string quoted(char ch) + { + if (ch < 0) + return "<>"; + else + return fmt::format("'{}'", ch); + } + + private: + unsigned int line_; + unsigned int column_; + char actual_; + char expected_; +}; + +class RuleParser::InvalidRuleOption: public std::runtime_error +{ + public: + InvalidRuleOption(unsigned offset, std::string option): + std::runtime_error { fmt::format("{}: Invalid rule option \"{}\".", offset, option) }, + offset_ { offset }, + option_ { option } + { + } + + unsigned offset() const noexcept { return offset_; } + const std::string& option() const noexcept { return option_; } + + private: + unsigned offset_; + std::string option_; +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/RuleParser_test.cpp b/src/regex_dfa/RuleParser_test.cpp new file mode 100644 index 0000000000..b41669ee27 --- /dev/null +++ b/src/regex_dfa/RuleParser_test.cpp @@ -0,0 +1,247 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include +#include + +#include + +using namespace regex_dfa; + +TEST(regex_RuleParser, simple) +{ + RuleParser rp { "main ::= blah\n" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ("blah", rules[0].pattern); +} + +TEST(regex_RuleParser, whitespaces) +{ + RuleParser rp { "main ::= a\n\t| b | c\n" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ("a|b | c", rules[0].pattern); +} + +TEST(regex_RuleParser, rule_at_eof) +{ + RuleParser rp { "main ::= blah" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ("blah", rules[0].pattern); +} + +TEST(regex_RuleParser, simple_trailing_spaces) +{ + RuleParser rp { "main ::= blah\n " }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ("blah", rules[0].pattern); +} + +TEST(regex_RuleParser, quotedPattern) +{ + RuleParser rp { "main ::= \"blah\"" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ("\"blah\"", rules[0].pattern); +} + +TEST(regex_RuleParser, multiQuotedPattern) +{ + RuleParser rp { R"(rule ::= "b"la"h")" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ(R"("b"la"h")", rules[0].pattern); +} + +TEST(regex_RuleParser, doubleQuote) +{ + RuleParser rp { R"(rule ::= \")" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ(R"(\")", rules[0].pattern); +} + +TEST(regex_RuleParser, spaceRule) +{ + RuleParser rp { R"(rule ::= [ \n\t]+)" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ(R"([ \n\t]+)", rules[0].pattern); +} + +TEST(regex_RuleParser, stringRule) +{ + RuleParser rp { R"(rule ::= \"[^\"]*\")" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ(R"(\"[^\"]*\")", rules[0].pattern); +} + +TEST(regex_RuleParser, ref) +{ + RuleParser rp { R"( + Foo(ref) ::= foo 
+ Bar(ref) ::= bar + FooBar ::= {Foo}_{Bar} + )" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(1, rules.size()); + EXPECT_EQ("(foo)_(bar)", rules[0].pattern); +} + +TEST(regex_RuleParser, ref_duplicated) +{ + RuleParser rp { R"( + Foo(ref) ::= foo + Foo(ref) ::= bar + FooBar ::= {Foo} + )" }; + EXPECT_THROW(rp.parseRules(), RuleParser::DuplicateRule); +} + +TEST(regex_RuleParser, multiline_alt) +{ + RuleParser rp { R"( + Rule1 ::= foo + | bar + Rule2(ref) ::= fnord + | hard + Rule3 ::= {Rule2} + | {Rule2} + )" }; + RuleList rules = rp.parseRules(); + ASSERT_EQ(2, rules.size()); + EXPECT_EQ("foo|bar", rules[0].pattern); + EXPECT_EQ("(fnord|hard)|(fnord|hard)", rules[1].pattern); +} + +TEST(regex_RuleParser, condition1) +{ + RuleParser rp { R"( + Rule1 ::= foo + Rule2 ::= bar + )" }; + RuleList rules = rp.parseRules(); + + ASSERT_EQ(2, rules.size()); + EXPECT_EQ("foo", rules[0].pattern); + EXPECT_EQ("bar", rules[1].pattern); + + ASSERT_EQ(1, rules[0].conditions.size()); + EXPECT_EQ("foo", rules[0].conditions[0]); + + ASSERT_EQ(1, rules[1].conditions.size()); + EXPECT_EQ("bar", rules[1].conditions[0]); +} + +TEST(regex_RuleParser, condition2) +{ + RuleParser rp { R"( + Rule1 ::= foo + Rule2 ::= bar + )" }; + RuleList rules = rp.parseRules(); + + ASSERT_EQ(2, rules.size()); + EXPECT_EQ("foo", rules[0].pattern); + EXPECT_EQ("bar", rules[1].pattern); + + ASSERT_EQ(1, rules[0].conditions.size()); + EXPECT_EQ("foo", rules[0].conditions[0]); + + ASSERT_EQ(2, rules[1].conditions.size()); + // in sorted order + EXPECT_EQ("bar", rules[1].conditions[0]); + EXPECT_EQ("foo", rules[1].conditions[1]); +} + +TEST(regex_RuleParser, conditional_star) +{ + RuleParser rp { R"( + Zero ::= zero + One ::= one + Two ::= two + <*>Tri ::= tri + )" }; + RuleList rules = rp.parseRules(); + + ASSERT_EQ(4, rules.size()); + + EXPECT_EQ("zero", rules[0].pattern); + ASSERT_EQ(1, rules[0].conditions.size()); + EXPECT_EQ("INITIAL", rules[0].conditions[0]); + + EXPECT_EQ("one", rules[1].pattern); 
+ ASSERT_EQ(1, rules[1].conditions.size()); + EXPECT_EQ("one", rules[1].conditions[0]); + + EXPECT_EQ("two", rules[2].pattern); + ASSERT_EQ(1, rules[2].conditions.size()); + EXPECT_EQ("two", rules[2].conditions[0]); + + EXPECT_EQ("tri", rules[3].pattern); + ASSERT_EQ(3, rules[3].conditions.size()); + EXPECT_EQ("INITIAL", rules[3].conditions[0]); + EXPECT_EQ("one", rules[3].conditions[1]); + EXPECT_EQ("two", rules[3].conditions[2]); +} + +TEST(regex_RuleParser, grouped_conditions) +{ + RuleParser rp { R"( + Rule1 ::= foo + { + Rule2 ::= bar + } + )" }; + RuleList rules = rp.parseRules(); + + ASSERT_EQ(2, rules.size()); + EXPECT_EQ("foo", rules[0].pattern); + EXPECT_EQ("bar", rules[1].pattern); + + ASSERT_EQ(1, rules[1].conditions.size()); + EXPECT_EQ("blah", rules[1].conditions[0]); +} + +TEST(regex_RuleParser, InvalidRefRuleWithConditions) +{ + ASSERT_THROW(RuleParser { "main(ref) ::= blah\n" }.parseRules(), + RuleParser::InvalidRefRuleWithConditions); +} + +TEST(regex_RuleParser, InvalidRuleOption) +{ + ASSERT_THROW(RuleParser { "A(invalid) ::= a\n" }.parseRules(), RuleParser::InvalidRuleOption); +} + +TEST(regex_RuleParser, DuplicateRule) +{ + RuleParser rp { R"( + foo ::= abc + foo ::= def + )" }; + ASSERT_THROW(rp.parseRules(), RuleParser::DuplicateRule); +} + +TEST(regex_RuleParser, UnexpectedChar) +{ + ASSERT_THROW(RuleParser { "A :=" }.parseRules(), RuleParser::UnexpectedChar); + ASSERT_THROW(RuleParser { " A ::= a" }.parseRules(), RuleParser::UnexpectedToken); + ASSERT_THROW(RuleParser { "<> A ::= a" }.parseRules(), RuleParser::UnexpectedToken); + ASSERT_THROW(RuleParser { " ::= a" }.parseRules(), RuleParser::UnexpectedToken); +} diff --git a/src/regex_dfa/SourceLocation.cpp b/src/regex_dfa/SourceLocation.cpp new file mode 100644 index 0000000000..67b6c986da --- /dev/null +++ b/src/regex_dfa/SourceLocation.cpp @@ -0,0 +1,27 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// 
Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; + +namespace regex_dfa +{ + +string SourceLocation::source() const // TODO +{ + string code; + ifstream ifs(filename); + ifs.seekg(offset, ifs.beg); + code.resize(count); + ifs.read(&code[0], count); + return code; +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/SourceLocation.h b/src/regex_dfa/SourceLocation.h new file mode 100644 index 0000000000..c69d7f7487 --- /dev/null +++ b/src/regex_dfa/SourceLocation.h @@ -0,0 +1,40 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +namespace regex_dfa +{ + +struct SourceLocation +{ + std::string filename; + size_t offset; + size_t count; + + [[nodiscard]] long long int compare(const SourceLocation& other) const noexcept + { + if (filename == other.filename) + return (long) offset - (long) other.offset; + else if (filename < other.filename) + return -1; + else + return 1; + } + + [[nodiscard]] std::string source() const; + + bool operator==(const SourceLocation& other) const noexcept { return compare(other) == 0; } + bool operator<=(const SourceLocation& other) const noexcept { return compare(other) <= 0; } + bool operator>=(const SourceLocation& other) const noexcept { return compare(other) >= 0; } + bool operator<(const SourceLocation& other) const noexcept { return compare(other) < 0; } + bool operator>(const SourceLocation& other) const noexcept { return compare(other) > 0; } +}; + +} // namespace regex_dfa diff --git a/src/regex_dfa/State.cpp b/src/regex_dfa/State.cpp 
new file mode 100644 index 0000000000..76eaa27f26 --- /dev/null +++ b/src/regex_dfa/State.cpp @@ -0,0 +1,37 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; + +namespace regex_dfa +{ + +string to_string(const StateIdVec& S, string_view stateLabelPrefix) +{ + StateIdVec names = S; + sort(names.begin(), names.end()); + + stringstream sstr; + sstr << "{"; + int i = 0; + for (StateId name: names) + { + if (i) + sstr << ", "; + sstr << stateLabelPrefix << name; + i++; + } + sstr << "}"; + + return sstr.str(); +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/State.h b/src/regex_dfa/State.h new file mode 100644 index 0000000000..82e0162336 --- /dev/null +++ b/src/regex_dfa/State.h @@ -0,0 +1,53 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +using Tag = int; +using StateId = size_t; +using StateIdVec = std::vector; + +using AcceptMap = std::map; + +/** + * Returns a human readable string of @p S, such as "{n0, n1, n2}". 
+ */ +std::string to_string(const StateIdVec& S, std::string_view stateLabelPrefix = "n"); + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::StateIdVec& v, FormatContext& ctx) + { + return format_to(ctx.out(), "{}", regex_dfa::to_string(v)); + } +}; +} // namespace fmt diff --git a/src/regex_dfa/State_test.cpp b/src/regex_dfa/State_test.cpp new file mode 100644 index 0000000000..c2e34dad67 --- /dev/null +++ b/src/regex_dfa/State_test.cpp @@ -0,0 +1,18 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +#include + +TEST(regex_State, to_string) +{ + regex_dfa::StateIdVec v { 1, 2, 3 }; + EXPECT_EQ("{n1, n2, n3}", fmt::format("{}", v)); +} diff --git a/src/regex_dfa/Symbols.cpp b/src/regex_dfa/Symbols.cpp new file mode 100644 index 0000000000..630670740b --- /dev/null +++ b/src/regex_dfa/Symbols.cpp @@ -0,0 +1,184 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; + +namespace regex_dfa +{ + +string prettySymbol(Symbol input) +{ + switch (input) + { + case Symbols::Error: return "<>"; + case Symbols::BeginOfLine: return "<>"; + case Symbols::EndOfLine: return "<>"; + case Symbols::EndOfFile: return "<>"; + case Symbols::Epsilon: return "ε"; + case '\a': return "\\a"; + case '\b': return "\\b"; + case '\f': return "\\f"; + case '\n': return "\\n"; + case '\r': return "\\r"; + case ' ': return "\\s"; + case '\t': return "\\t"; + case '\v': return "\\v"; + case '\0': return "\\0"; + case '.': return "\\."; // so we can distinguish from dot-operator + default: + if (isprint(input)) + { + return fmt::format("{}", (char) input); + } + else + { + return fmt::format("\\x{:02x}", input); + } + } +} + +string prettyCharRange(Symbol ymin, Symbol ymax) +{ + assert(ymin <= ymax); + + stringstream sstr; + switch (ymax - ymin) + { + case 0: sstr << prettySymbol(ymin); break; + case 1: sstr << prettySymbol(ymin) << prettySymbol(ymin + 1); break; + case 2: sstr << prettySymbol(ymin) << prettySymbol(ymin + 1) << prettySymbol(ymax); break; + default: sstr << prettySymbol(ymin) << '-' << prettySymbol(ymax); break; + } + return sstr.str(); +} + +string groupCharacterClassRanges(const vector& syms) +{ + // {1,3,5,a,b,c,d,e,f,z] + // -> + // {{1}, {3}, {5}, {a-f}, {z}} + + stringstream sstr; + Symbol ymin = '\0'; + Symbol ymax = ymin; + int k = 0; + + for (size_t i = 0, e = syms.size(); i != e; ++i) + { + if (!syms[i]) + continue; + + const Symbol c = (Symbol) i; + if (c == ymax + 1) + { // range growing + ymax = c; + } + else + { // gap found + if (k) + { + sstr << prettyCharRange(ymin, ymax); + } + ymin = ymax = c; + } + k++; + } + sstr << prettyCharRange(ymin, ymax); + + return sstr.str(); +} + +string groupCharacterClassRanges(vector chars) +{ + // we took a copy in tgroup here, so I can sort() later + 
sort(chars.begin(), chars.end()); + + if (chars.size() == 1) + return prettySymbol(chars.front()); + + // {1,3,5,a,b,c,d,e,f,z] + // -> + // "123a-fz" + + stringstream sstr; + Symbol ymin = 0; + Symbol ymax = ymin; + int i = 0; + + for (Symbol c: chars) + { + if (c == ymax + 1) + { // range growing + ymax = c; + } + else + { // gap found + if (i) + { + sstr << prettyCharRange(ymin, ymax); + } + ymin = ymax = c; + } + i++; + } + sstr << prettyCharRange(ymin, ymax); + + return sstr.str(); +} + +SymbolSet::SymbolSet(DotMode): set_(256, true), size_ { 255 }, hash_ { 2166136261 } +{ + set_[(size_t) '\n'] = false; + for (Symbol s: *this) + { + hash_ = (hash_ * 16777619) ^ s; + } +} + +bool SymbolSet::isDot() const noexcept +{ + static SymbolSet dot(SymbolSet::Dot); + return *this == dot; +} + +string SymbolSet::to_string() const +{ + if (isDot()) + return "."; + + return groupCharacterClassRanges(set_); +} + +void SymbolSet::complement() +{ + // flip bits + for (size_t i = 0, e = set_.size(); i != e; ++i) + { + set_[i] = !set_[i]; + } + + // flip size + size_ = set_.size() - size_; + + recalculateHash(); +} + +void SymbolSet::recalculateHash() +{ + // recalculate hash + hash_ = 2166136261; + for (Symbol s: *this) + { + hash_ = (hash_ * 16777619) ^ s; + } +} + +} // namespace regex_dfa diff --git a/src/regex_dfa/Symbols.h b/src/regex_dfa/Symbols.h new file mode 100644 index 0000000000..d76a2307ff --- /dev/null +++ b/src/regex_dfa/Symbols.h @@ -0,0 +1,206 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa +{ + +//! 
input symbol as used for transitions +using Symbol = int; + +std::string prettySymbol(Symbol input); +std::string prettyCharRange(Symbol ymin, Symbol ymax); +std::string groupCharacterClassRanges(const std::vector& syms); +std::string groupCharacterClassRanges(std::vector syms); + +// new way of wrapping up Symbols +struct Symbols +{ + constexpr static Symbol Epsilon = -1; + constexpr static Symbol Error = -2; + constexpr static Symbol BeginOfLine = -3; + constexpr static Symbol EndOfLine = -4; + constexpr static Symbol EndOfFile = -5; + constexpr static Symbol Character(char ch) { return Symbol(ch); } + + constexpr static bool isSpecial(Symbol s) + { + switch (s) + { + case Symbols::EndOfFile: + case Symbols::EndOfLine: + case Symbols::BeginOfLine: + case Symbols::Epsilon: + case Symbols::Error: return true; + default: return false; + } + } +}; + +/** + * Represents a set of symbols. + */ +class SymbolSet +{ + public: + enum DotMode + { + Dot + }; + + explicit SymbolSet(DotMode); + SymbolSet(): set_(256, false), size_ { 0 }, hash_ { 2166136261 } {} + + explicit SymbolSet(std::initializer_list list): SymbolSet() + { + std::for_each(list.begin(), list.end(), [this](Symbol s) { insert(s); }); + } + + bool empty() const noexcept { return size_ == 0; } + size_t size() const noexcept { return size_; } + + //! Transforms into the complement set. + void complement(); + + //! Inserts given Symbol @p s into this set. + void insert(Symbol s) + { + if (!contains(s)) + { + set_[s] = true; + hash_ = (hash_ * 16777619) ^ s; + size_++; + } + } + + //! Inserts a range of Simples between [a, b]. + void insert(const std::pair& range) + { + for (Symbol s = range.first; s <= range.second; ++s) + { + insert(s); + } + } + + //! @returns whether or not given Symbol @p s is in this set. + bool contains(Symbol s) const + { + assert(s >= 0 && s <= 255 && "Only ASCII allowed."); + return set_[(size_t) s]; + } + + //! Tests whether or not this SymbolSet can be represented as dot (.), i.e. 
all but \n. + bool isDot() const noexcept; + + //! @returns a human readable representation of this set + std::string to_string() const; + + bool operator==(const SymbolSet& rhs) const noexcept { return hash_ == rhs.hash_ && set_ == rhs.set_; } + bool operator!=(const SymbolSet& rhs) const noexcept { return !(*this == rhs); } + + class const_iterator + { // {{{ + public: + const_iterator(std::vector::const_iterator beg, + std::vector::const_iterator end, + size_t n): + beg_ { std::move(beg) }, end_ { std::move(end) }, offset_ { n } + { + while (beg_ != end_ && !*beg_) + { + ++beg_; + ++offset_; + } + } + + Symbol operator*() const { return static_cast(offset_); } + + const_iterator& operator++(int) + { + do + { + ++beg_; + ++offset_; + } while (beg_ != end_ && !*beg_); + return *this; + } + + const_iterator& operator++() + { + do + { + beg_++; + offset_++; + } while (beg_ != end_ && !*beg_); + return *this; + } + + bool operator==(const const_iterator& rhs) const noexcept { return beg_ == rhs.beg_; } + bool operator!=(const const_iterator& rhs) const noexcept { return beg_ != rhs.beg_; } + + private: + std::vector::const_iterator beg_; + std::vector::const_iterator end_; + size_t offset_; + }; // }}} + + const_iterator begin() const { return const_iterator(set_.begin(), set_.end(), 0); } + const_iterator end() const { return const_iterator(set_.end(), set_.end(), set_.size()); } + + size_t hash() const noexcept { return hash_; } + + private: + void recalculateHash(); + + private: + // XXX we chose vector as it is an optimized bit vector + std::vector set_; + size_t size_; + size_t hash_; +}; + +} // namespace regex_dfa + +namespace fmt +{ +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + constexpr auto format(const regex_dfa::SymbolSet& v, FormatContext& ctx) + { + return format_to(ctx.out(), "{}", v.to_string()); + } +}; +} // namespace fmt + +namespace std +{ +template <> +struct 
hash +{ + size_t operator()(const regex_dfa::SymbolSet& set) const { return set.hash(); } +}; +} // namespace std diff --git a/src/regex_dfa/Symbols_test.cpp b/src/regex_dfa/Symbols_test.cpp new file mode 100644 index 0000000000..4cfb69e112 --- /dev/null +++ b/src/regex_dfa/Symbols_test.cpp @@ -0,0 +1,112 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +#include + +using namespace std; +using regex_dfa::SymbolSet; + +TEST(regex_SymbolSet, s0) +{ + SymbolSet s0; + ASSERT_EQ(0, s0.size()); + ASSERT_TRUE(s0.empty()); +} + +TEST(regex_SymbolSet, s1) +{ + SymbolSet s1; + + // first add + s1.insert('a'); + ASSERT_EQ(1, s1.size()); + ASSERT_FALSE(s1.empty()); + + // overwrite + s1.insert('a'); + ASSERT_EQ(1, s1.size()); + ASSERT_FALSE(s1.empty()); +} + +TEST(regex_SymbolSet, initializer_list) +{ + SymbolSet a { 'a' }; + EXPECT_EQ(1, a.size()); + EXPECT_TRUE(a.contains('a')); + + SymbolSet s2 { 'a', 'b', 'b', 'c' }; + EXPECT_EQ(3, s2.size()); + EXPECT_EQ("abc", s2.to_string()); +} + +TEST(regex_SymbolSet, dot) +{ + SymbolSet dot(SymbolSet::Dot); + EXPECT_FALSE(dot.contains('\n')); + EXPECT_TRUE(dot.contains('\0')); + EXPECT_TRUE(dot.contains(' ')); + EXPECT_TRUE(dot.isDot()); + EXPECT_EQ(".", dot.to_string()); +} + +TEST(regex_SymbolSet, complement) +{ + SymbolSet s; + s.insert('\n'); + EXPECT_EQ("\\n", s.to_string()); + s.complement(); + EXPECT_EQ(".", s.to_string()); +} + +TEST(regex_SymbolSet, range) +{ + SymbolSet r; + r.insert(make_pair('a', 'f')); + + EXPECT_EQ(6, r.size()); + EXPECT_EQ("a-f", r.to_string()); + + r.insert(make_pair('0', '9')); + EXPECT_EQ(16, r.size()); + EXPECT_EQ("0-9a-f", r.to_string()); +} + +TEST(regex_SymbolSet, fmt_format) +{ + SymbolSet s; + 
s.insert(make_pair('0', '9')); + s.insert(make_pair('a', 'f')); + + EXPECT_EQ("0-9a-f", fmt::format("{}", s)); +} + +TEST(regex_SymbolSet, hash_map) +{ + SymbolSet s0; + SymbolSet s1 { 'a' }; + SymbolSet s2 { 'a', 'b' }; + + unordered_map map; + map[s0] = 0; + map[s1] = 1; + map[s2] = 2; + + EXPECT_EQ(0, map[s0]); + EXPECT_EQ(1, map[s1]); + EXPECT_EQ(2, map[s2]); +} + +TEST(regex_SymbolSet, compare) +{ + SymbolSet s1 { 'a', 'b' }; + SymbolSet s2 { 'a', 'b' }; + SymbolSet s3 { 'a', 'c' }; + ASSERT_TRUE(s1 == s2); + ASSERT_TRUE(s1 != s3); +} diff --git a/src/regex_dfa/TransitionMap-inl.h b/src/regex_dfa/TransitionMap-inl.h new file mode 100644 index 0000000000..36ba30d013 --- /dev/null +++ b/src/regex_dfa/TransitionMap-inl.h @@ -0,0 +1,49 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include + +namespace regex_dfa +{ + +inline void TransitionMap::define(StateId currentState, Symbol charCat, StateId nextState) +{ + mapping_[currentState][charCat] = nextState; +} + +inline StateId TransitionMap::apply(StateId currentState, Symbol charCat) const +{ + if (auto i = mapping_.find(currentState); i != mapping_.end()) + if (auto k = i->second.find(charCat); k != i->second.end()) + return k->second; + + return ErrorState; +} + +inline std::vector TransitionMap::states() const +{ + std::vector v; + v.reserve(mapping_.size()); + for (const auto& i: mapping_) + v.push_back(i.first); + std::sort(v.begin(), v.end()); + return v; +} + +inline std::map TransitionMap::map(StateId s) const +{ + std::map m; + if (auto mapping = mapping_.find(s); mapping != mapping_.end()) + for (const auto& i: mapping->second) + m[i.first] = i.second; + return m; +} + +} // namespace regex_dfa diff --git 
a/src/regex_dfa/TransitionMap.h b/src/regex_dfa/TransitionMap.h new file mode 100644 index 0000000000..d05229f64b --- /dev/null +++ b/src/regex_dfa/TransitionMap.h @@ -0,0 +1,66 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +#include +#include + +namespace regex_dfa +{ + +using CharCatId = int; + +constexpr CharCatId ErrorCharCat = static_cast(-1); + +/** + * Represents an error-state, such as invalid input character or unexpected EOF. + */ +constexpr StateId ErrorState { 808080 }; // static_cast(-1); + +/** + * Transition mapping API to map the input (currentState, charCat) to (newState). + */ +class TransitionMap +{ + public: + using Container = std::map>; + + TransitionMap(): mapping_ {} {} + + TransitionMap(Container mapping): mapping_ { std::move(mapping) } {} + + /** + * Defines a new mapping for (currentState, charCat) to (nextState). + */ + void define(StateId currentState, Symbol charCat, StateId nextState); + + /** + * Retrieves the next state for the input (currentState, charCat). + * + * @returns the transition from (currentState, charCat) to (nextState) or ErrorState if not defined. + */ + StateId apply(StateId currentState, Symbol charCat) const; + + /** + * Retrieves a list of all available states. + */ + std::vector states() const; + + /** + * Retrieves a map of all transitions from given state @p inputState. 
+ */ + std::map map(StateId inputState) const; + + private: + Container mapping_; +}; + +} // namespace regex_dfa + +#include diff --git a/src/regex_dfa/klex_test.cpp b/src/regex_dfa/klex_test.cpp new file mode 100644 index 0000000000..17f2164e24 --- /dev/null +++ b/src/regex_dfa/klex_test.cpp @@ -0,0 +1,13 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include + +int main(int argc, const char* argv[]) +{ + return regex_dfa::util::testing::main(argc, argv); +} diff --git a/src/regex_dfa/util/AnsiColor.h b/src/regex_dfa/util/AnsiColor.h new file mode 100644 index 0000000000..d7b27c99bc --- /dev/null +++ b/src/regex_dfa/util/AnsiColor.h @@ -0,0 +1,153 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2019 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include + +namespace AnsiColor { + +enum Code : unsigned { + Clear = 0, + Reset = Clear, + Bold = 0x0001, // 1 + Dark = 0x0002, // 2 + Undef1 = 0x0004, + Underline = 0x0008, // 4 + Blink = 0x0010, // 5 + Undef2 = 0x0020, + Reverse = 0x0040, // 7 + Concealed = 0x0080, // 8 + AllFlags = 0x00FF, + Black = 0x0100, + Red = 0x0200, + Green = 0x0300, + Yellow = 0x0400, + Blue = 0x0500, + Magenta = 0x0600, + Cyan = 0x0700, + White = 0x0800, + AnyFg = 0x0F00, + OnBlack = 0x1000, + OnRed = 0x2000, + OnGreen = 0x3000, + OnYellow = 0x4000, + OnBlue = 0x5000, + OnMagenta = 0x6000, + OnCyan = 0x7000, + OnWhite = 0x8000, + AnyBg = 0xF000 +}; + +/// Combines two ANSI escape sequences into one Code. 
+constexpr inline Code operator|(Code a, Code b) +{ + return Code{unsigned(a) | unsigned(b)}; +} + +/** + * Counts the number of ANSI escape sequences in @p codes. + */ +constexpr unsigned count(Code codes) +{ + if (codes == Clear) + return 1; + + unsigned i = 0; + + if (codes & AllFlags) + for (int k = 0; k < 8; ++k) + if (codes & (1 << k)) + ++i; + + if (codes & AnyFg) + ++i; + + if (codes & AnyBg) + ++i; + + return i; +} + +/** + * Retrieves the number of bytes required to store the ANSI escape sequences of @p codes + * without prefix/suffix notation. + */ +constexpr unsigned capacity(Code codes) +{ + if (codes == Clear) + return 1; + + unsigned i = 0; + + if (codes & AllFlags) + for (int k = 0; k < 8; ++k) + if (codes & (1 << k)) + ++i; + + if (codes & AnyFg) + i += 2; + + if (codes & AnyBg) + i += 2; + + return i + (count(codes) - 1); +} + +/// Constructs a sequence of ANSI codes for the colors in this @p codes. +template +constexpr auto codes() +{ + std::array result{}; + + size_t n = 0; // n'th escape sequence being iterate through + size_t i = 0; // i'th byte in output array + + result[i++] = '\x1B'; + result[i++] = '['; + + if constexpr (value != 0) + { + if (value & AllFlags) + { + for (int k = 0; k < 8; ++k) + { + if (value & (1 << k)) + { + if (n++) + result[i++] = ';'; + result[i++] = k + '1'; + } + } + } + + if (value & AnyFg) + { + if (n++) + result[i++] = ';'; + unsigned const val = ((value >> 8) & 0x0F) + 29; // 36 -> {'3', '6'} + result[i++] = (val / 10) + '0'; + result[i++] = (val % 10) + '0'; + } + + if (value & AnyBg) + { + if (n++) + result[i++] = ';'; + unsigned const val = ((value >> 12) & 0x0F) + 39; + result[i++] = (val / 10) + '0'; + result[i++] = (val % 10) + '0'; + } + } + else + result[i++] = '0'; // reset/clear + + result[i++] = 'm'; + + return result; +} + +} // namespace AnsiColor diff --git a/src/regex_dfa/util/Flags.cpp b/src/regex_dfa/util/Flags.cpp new file mode 100644 index 0000000000..5082fce7df --- /dev/null +++ 
b/src/regex_dfa/util/Flags.cpp @@ -0,0 +1,578 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include "Flags.h" + +#include +#include +#include + +#include "AnsiColor.h" + +using namespace std; + +namespace regex_dfa::util +{ + +auto static constexpr clearColor = AnsiColor::codes(); +auto static constexpr optionColor = AnsiColor::codes(); +auto static constexpr valueColor = AnsiColor::codes(); +auto static constexpr headerColor = AnsiColor::codes(); + +// {{{ Flags::Error +Flags::Error::Error(ErrorCode code, string arg): + runtime_error { FlagsErrorCategory::get().message(static_cast(code)) + ": " + arg }, + code_ { code }, + arg_ { move(arg) } +{ +} +// }}} + +// {{{ Flag +Flags::Flag::Flag(const string& opt, const string& val, FlagStyle fs, FlagType ft): + type_(ft), style_(fs), name_(opt), value_(val) +{ +} +// }}} + +Flags::Flags(): + flagDefs_ {}, + parametersEnabled_ { false }, + parametersPlaceholder_ {}, + parametersHelpText_ {}, + set_ {}, + raw_ {} +{ +} + +void Flags::set(const Flag& flag) +{ + set_[flag.name()] = make_pair(flag.type(), flag.value()); +} + +void Flags::set(const string& opt, const string& val, FlagStyle fs, FlagType ft) +{ + set(Flag { opt, val, fs, ft }); +} + +bool Flags::isSet(const string& flag) const +{ + return set_.find(flag) != set_.end(); +} + +string Flags::asString(const string& flag) const +{ + auto i = set_.find(flag); + if (i == set_.end()) + throw Error { ErrorCode::NotFound, flag }; + + return i->second.second; +} + +string Flags::getString(const string& flag) const +{ + auto i = set_.find(flag); + if (i == set_.end()) + throw Error { ErrorCode::NotFound, flag }; + + if (i->second.first != FlagType::String) + throw Error { 
ErrorCode::TypeMismatch, flag }; + + return i->second.second; +} + +long int Flags::getNumber(const string& flag) const +{ + auto i = set_.find(flag); + if (i == set_.end()) + throw Error { ErrorCode::NotFound, flag }; + + if (i->second.first != FlagType::Number) + throw Error { ErrorCode::TypeMismatch, flag }; + + return stoi(i->second.second); +} + +float Flags::getFloat(const string& flag) const +{ + auto i = set_.find(flag); + if (i == set_.end()) + throw Error { ErrorCode::NotFound, flag }; + + if (i->second.first != FlagType::Float) + throw Error { ErrorCode::TypeMismatch, flag }; + + return stof(i->second.second); +} + +bool Flags::getBool(const string& flag) const +{ + auto i = set_.find(flag); + if (i == set_.end()) + return false; + + return i->second.second == "true"; +} + +const vector& Flags::parameters() const +{ + return raw_; +} + +void Flags::setParameters(const vector& v) +{ + raw_ = v; +} + +string Flags::to_s() const +{ + stringstream sstr; + + int i = 0; + for (const pair& flag: set_) + { + if (i) + sstr << ' '; + + i++; + + switch (flag.second.first) + { + case FlagType::Bool: + if (flag.second.second == "true") + sstr << "--" << flag.first; + else + sstr << "--" << flag.first << "=false"; + break; + case FlagType::String: sstr << "--" << flag.first << "=\"" << flag.second.second << "\""; break; + default: sstr << "--" << flag.first << "=" << flag.second.second; break; + } + } + + return sstr.str(); +} + +Flags& Flags::define(const string& longOpt, + char shortOpt, + bool required, + FlagType type, + const string& valuePlaceholder, + const string& helpText, + const optional& defaultValue, + function callback) +{ + FlagDef fd; + fd.type = type; + fd.longOption = longOpt; + fd.shortOption = shortOpt; + fd.required = required; + fd.valuePlaceholder = valuePlaceholder; + fd.helpText = helpText; + fd.defaultValue = defaultValue; + fd.callback = callback; + + flagDefs_.emplace_back(fd); + + return *this; +} + +Flags& Flags::defineString(const 
string& longOpt, + char shortOpt, + const string& valuePlaceholder, + const string& helpText, + optional defaultValue, + function callback) +{ + return define( + longOpt, shortOpt, false, FlagType::String, valuePlaceholder, helpText, defaultValue, callback); +} + +Flags& Flags::defineNumber(const string& longOpt, + char shortOpt, + const string& valuePlaceholder, + const string& helpText, + optional defaultValue, + function callback) +{ + return define(longOpt, + shortOpt, + false, + FlagType::Number, + valuePlaceholder, + helpText, + defaultValue.has_value() ? make_optional(to_string(*defaultValue)) : nullopt, + [=](const string& value) { + if (callback) + { + callback(stoi(value)); + } + }); +} + +Flags& Flags::defineFloat(const string& longOpt, + char shortOpt, + const string& valuePlaceholder, + const string& helpText, + optional defaultValue, + function callback) +{ + return define(longOpt, + shortOpt, + false, + FlagType::Float, + valuePlaceholder, + helpText, + defaultValue.has_value() ? 
make_optional(to_string(*defaultValue)) : nullopt, + [=](const string& value) { + if (callback) + { + callback(stof(value)); + } + }); +} + +Flags& Flags::defineBool(const string& longOpt, + char shortOpt, + const string& helpText, + function callback) +{ + return define( + longOpt, shortOpt, false, FlagType::Bool, "", helpText, nullopt, [=](const string& value) { + if (callback) + { + callback(value == "true"); + } + }); +} + +Flags& Flags::enableParameters(const string& valuePlaceholder, const string& helpText) +{ + parametersEnabled_ = true; + parametersPlaceholder_ = valuePlaceholder; + parametersHelpText_ = helpText; + + return *this; +} + +const Flags::FlagDef* Flags::findDef(const string& longOption) const +{ + for (const auto& flag: flagDefs_) + if (flag.longOption == longOption) + return &flag; + + return nullptr; +} + +const Flags::FlagDef* Flags::findDef(char shortOption) const +{ + for (const auto& flag: flagDefs_) + if (flag.shortOption == shortOption) + return &flag; + + return nullptr; +} + +// ----------------------------------------------------------------------------- +void Flags::parse(int argc, const char* argv[]) +{ + vector args; + for (int i = 1; i < argc; ++i) + args.push_back(argv[i]); + + parse(args); +} + +error_code Flags::tryParse(const vector& args) +{ + try + { + parse(args); + } + catch (const Error& parseError) + { + return parseError.code(); + } + return error_code(); +} + +void Flags::parse(const vector& args) +{ + auto invokeCallback = [&](const FlagDef* fd, FlagStyle style, const string& value) { + if (fd) + { + set(fd->longOption, value, style, fd->type); + if (fd->callback) + { + fd->callback(value); + } + } + }; + + enum class ParsingState + { + Options, + Parameters, + }; + + vector params; + ParsingState pstate = ParsingState::Options; + size_t i = 0; + + while (i < args.size()) + { + string arg = args[i]; + i++; + if (pstate == ParsingState::Parameters) + params.push_back(arg); + else if (arg == "--") + { + if 
(parametersEnabled_) + pstate = ParsingState::Parameters; + else + throw Error { ErrorCode::UnknownOption, arg }; + } + else if (arg.size() > 2 && arg[0] == '-' && arg[1] == '-') + { + // longopt + string name = arg.substr(2); + size_t eq = name.find('='); + if (eq != name.npos) + { // --name=value + string value = name.substr(eq + 1); + name = name.substr(0, eq); + const FlagDef* fd = findDef(name); + if (fd == nullptr) + throw Error { ErrorCode::UnknownOption, arg }; + else + invokeCallback(fd, FlagStyle::LongWithValue, value); + } + else + { // --name [VALUE] + const FlagDef* fd = findDef(name); + if (fd == nullptr) + throw Error { ErrorCode::UnknownOption, arg }; + else if (fd->type == FlagType::Bool) + // --name + invokeCallback(fd, FlagStyle::LongSwitch, "true"); + else + { + // --name VALUE + if (i >= args.size()) + throw Error { ErrorCode::MissingOption, arg }; + + string value = args[i]; + i++; + + invokeCallback(fd, FlagStyle::LongWithValue, value); + } + } + } + else if (arg.size() > 1 && arg[0] == '-') + { + // shortopt + arg = arg.substr(1); + while (!arg.empty()) + { + const FlagDef* fd = findDef(arg[0]); + if (fd == nullptr) // option not found + throw Error { ErrorCode::UnknownOption, "-" + arg.substr(0, 1) }; + else if (fd->type == FlagType::Bool) + { + invokeCallback(fd, FlagStyle::ShortSwitch, "true"); + arg = arg.substr(1); + } + else if (arg.size() > 1) // -fVALUE + { + string value = arg.substr(1); + invokeCallback(fd, FlagStyle::ShortSwitch, value); + arg.clear(); + } + else + { + // -f VALUE + string name = fd->longOption; + + if (i >= args.size()) + { + char option[3] = { '-', fd->shortOption, '\0' }; + throw Error { ErrorCode::MissingOptionValue, option }; + } + + arg.clear(); + string value = args[i]; + i++; + + if (!value.empty() && value[0] == '-') + { + char option[3] = { '-', fd->shortOption, '\0' }; + throw Error { ErrorCode::MissingOptionValue, option }; + } + + invokeCallback(fd, FlagStyle::ShortSwitch, value); + } + } + } + else 
if (parametersEnabled_) + params.push_back(arg); + else + throw Error { ErrorCode::UnknownOption, arg }; + } + + setParameters(params); + + // fill any missing default flags + for (const FlagDef& fd: flagDefs_) + { + if (fd.defaultValue.has_value()) + { + if (!isSet(fd.longOption)) + invokeCallback(&fd, FlagStyle::LongWithValue, fd.defaultValue.value()); + } + else if (fd.type == FlagType::Bool) + { + if (!isSet(fd.longOption)) + invokeCallback(&fd, FlagStyle::LongWithValue, "false"); + } + } +} + +// ----------------------------------------------------------------------------- + +string Flags::helpText(string_view const& header, size_t width, size_t helpTextOffset) const +{ + stringstream sstr; + + if (!header.empty()) + sstr << headerColor.data() << header << clearColor.data(); + + if (parametersEnabled_ || !flagDefs_.empty()) + sstr << headerColor.data() << "Options:\n" << clearColor.data(); + + for (const FlagDef& fd: flagDefs_) + sstr << fd.makeHelpText(width, helpTextOffset); + + if (parametersEnabled_) + { + sstr << endl; + + const streampos p = sstr.tellp(); + const size_t column = static_cast(sstr.tellp() - p); + + sstr << " [--] " << valueColor.data() << parametersPlaceholder_ << clearColor.data(); + if (column < helpTextOffset) + sstr << setw(helpTextOffset - column) << ' '; + else + sstr << endl << setw(helpTextOffset) << ' '; + + sstr << parametersHelpText_ << endl; + } + + return sstr.str(); +} + +static string wordWrap(const string& text, size_t currentWidth, size_t width, size_t indent) +{ + stringstream sstr; + + size_t i = 0; + while (i < text.size()) + { + if (currentWidth >= width) + { + sstr << endl << setw(indent) << ' '; + currentWidth = 0; + } + + sstr << text[i]; + currentWidth++; + i++; + } + + return sstr.str(); +} + +error_code make_error_code(Flags::ErrorCode errc) +{ + return error_code(static_cast(errc), FlagsErrorCategory::get()); +} + +// {{{ Flags::FlagDef +string Flags::FlagDef::makeHelpText(size_t width, size_t helpTextOffset) 
const +{ + stringstream sstr; + + sstr << " "; + + // short option + if (shortOption) + sstr << optionColor.data() << "-" << shortOption << clearColor.data() << ", "; + else + sstr << " "; + + // long option + sstr << optionColor.data() << "--" << longOption; + + // value placeholder + if (type != FlagType::Bool) + { + sstr << "=" << valueColor.data(); + if (!valuePlaceholder.empty()) + sstr << valuePlaceholder; + else + sstr << "VALUE"; + } + sstr << clearColor.data(); + + // spacer + size_t column = static_cast(sstr.tellp()); + if (column < helpTextOffset) + sstr << setw(helpTextOffset - sstr.tellp()) << ' '; + else + { + sstr << endl << setw(helpTextOffset) << ' '; + column = helpTextOffset; + } + + // help output with default value hint. + if (type != FlagType::Bool && defaultValue.has_value()) + sstr << wordWrap(helpText + " [" + *defaultValue + "]", column, width, helpTextOffset); + else + sstr << wordWrap(helpText, column, width, helpTextOffset); + + sstr << endl; + + return sstr.str(); +} +// }}} + +// {{{ FlagsErrorCategory +FlagsErrorCategory& FlagsErrorCategory::get() +{ + static FlagsErrorCategory cat; + return cat; +} + +const char* FlagsErrorCategory::name() const noexcept +{ + return "Flags"; +} + +string FlagsErrorCategory::message(int ec) const +{ + switch (static_cast(ec)) + { + case Flags::ErrorCode::TypeMismatch: return "Type Mismatch"; + case Flags::ErrorCode::UnknownOption: return "Unknown Option"; + case Flags::ErrorCode::MissingOption: return "Missing Option"; + case Flags::ErrorCode::MissingOptionValue: return "Missing Option Value"; + case Flags::ErrorCode::NotFound: return "Flag Not Found"; + default: return ""; + } +} +// }}} + +} // namespace regex_dfa::util diff --git a/src/regex_dfa/util/Flags.h b/src/regex_dfa/util/Flags.h new file mode 100644 index 0000000000..611eafbf84 --- /dev/null +++ b/src/regex_dfa/util/Flags.h @@ -0,0 +1,171 @@ +// This file is part of the "x0" project, // http://github.com/christianparpart/x0> +// (c) 
2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa::util { + +class Flags { + public: + enum class FlagType { + String, + Number, + Float, + Bool, + }; + + // FlagPassingStyle + enum FlagStyle { ShortSwitch, LongSwitch, ShortWithValue, LongWithValue, UnnamedParameter }; + + enum class ErrorCode { + TypeMismatch, + UnknownOption, + MissingOption, + MissingOptionValue, + NotFound, + }; + + class Error : public std::runtime_error { + public: + Error(ErrorCode code, std::string arg); + + ErrorCode code() const noexcept { return code_; } + const std::string& arg() const noexcept { return arg_; } + + private: + ErrorCode code_; + std::string arg_; + }; + + struct FlagDef; + class Flag; + + Flags(); + + std::string getString(const std::string& flag) const; + std::string asString(const std::string& flag) const; + long int getNumber(const std::string& flag) const; + float getFloat(const std::string& flag) const; + bool getBool(const std::string& flag) const; + + const std::vector& parameters() const; + void setParameters(const std::vector& v); + + size_t size() const { return set_.size(); } + + std::string to_s() const; + + void set(const Flag& flag); + void set(const std::string& opt, const std::string& val, FlagStyle fs, FlagType ft); + bool isSet(const std::string& flag) const; + + Flags& defineString(const std::string& longOpt, char shortOpt, const std::string& valuePlaceholder, + const std::string& helpText, std::optional defaultValue = std::nullopt, + std::function callback = nullptr); + + Flags& defineNumber(const std::string& longOpt, char shortOpt, const std::string& valuePlaceholder, + const std::string& helpText, std::optional defaultValue = std::nullopt, + 
std::function callback = nullptr); + + Flags& defineFloat(const std::string& longOpt, char shortOpt, const std::string& valuePlaceholder, + const std::string& helpText, std::optional defaultValue = std::nullopt, + std::function callback = nullptr); + + Flags& defineBool(const std::string& longOpt, char shortOpt, const std::string& helpText, + std::function callback = nullptr); + + Flags& enableParameters(const std::string& valuePlaceholder, const std::string& helpText); + + std::string helpText(std::string_view const& header = "") const { return helpText(header, 78, 30); } + std::string helpText(std::string_view const& header, size_t width, size_t helpTextOffset) const; + + const FlagDef* findDef(const std::string& longOption) const; + const FlagDef* findDef(char shortOption) const; + + void parse(int argc, const char* argv[]); + void parse(const std::vector& args); + + // Attempts to parse given arguments and returns an error code in case of parsing errors instead + // of throwing. + std::error_code tryParse(const std::vector& args); + + private: + Flags& define(const std::string& longOpt, char shortOpt, bool required, FlagType type, + const std::string& helpText, const std::string& valuePlaceholder, + const std::optional& defaultValue, + std::function callback); + + private: + std::list flagDefs_; + bool parametersEnabled_; // non-option parameters enabled? 
+ std::string parametersPlaceholder_; + std::string parametersHelpText_; + + typedef std::pair FlagValue; + std::unordered_map set_; + std::vector raw_; +}; + +struct Flags::FlagDef { + FlagType type; + std::string longOption; + char shortOption; + bool required; + std::string valuePlaceholder; + std::string helpText; + std::optional defaultValue; + std::function callback; + + std::string makeHelpText(size_t width, size_t helpTextOffset) const; +}; + +class Flags::Flag { + public: + Flag(const std::string& opt, const std::string& val, FlagStyle fs, FlagType ft); + + explicit Flag(char shortOpt); + Flag(char shortOpt, const std::string& val); + Flag(const std::string& longOpt); + Flag(const std::string& longOpt, const std::string& val); + + FlagType type() const { return type_; } + const std::string& name() const { return name_; } + const std::string& value() const { return value_; } + + private: + FlagType type_; + FlagStyle style_; + std::string name_; + std::string value_; +}; + +class FlagsErrorCategory : public std::error_category { + public: + static FlagsErrorCategory& get(); + + const char* name() const noexcept override; + std::string message(int ec) const override; +}; + +std::error_code make_error_code(Flags::ErrorCode errc); + +} // namespace regex_dfa::util + +namespace std { +template <> +struct is_error_code_enum : public std::true_type { +}; +} // namespace std diff --git a/src/regex_dfa/util/IntVector.h b/src/regex_dfa/util/IntVector.h new file mode 100644 index 0000000000..8c15c407e3 --- /dev/null +++ b/src/regex_dfa/util/IntVector.h @@ -0,0 +1,40 @@ +#pragma once + +/** + * Encapsulates std::vector with speed improvements. 
/**
 * Encapsulates std::vector<T> and maintains a running hash of its contents
 * so that unequal vectors can usually be told apart without comparing
 * elements.
 *
 * The hash uses the 32-bit FNV offset basis and prime, folding in one whole
 * element (not one byte) per push_back(). T is expected to be an integral
 * type, since each element is XOR-ed into the hash.
 *
 * NOTE(review): this header uses std::vector but does not include <vector>
 * itself; the including translation unit has to provide it.
 */
template <typename T>
class IntVector {
 public:
  using value_type = T;
  using Vector = std::vector<T>;
  using vector = Vector;  // original alias spelling, kept for compatibility
  using iterator = typename Vector::iterator;
  using const_iterator = typename Vector::const_iterator;

  IntVector() : vector_{}, hash_{2166136261u} {}

  //! Removes all elements and resets the hash to its initial value.
  void clear() {
    vector_.clear();
    hash_ = 2166136261u;
  }

  //! Appends @p v and folds it into the running hash.
  void push_back(T v) {
    vector_.push_back(v);

    hash_ ^= static_cast<unsigned>(v);
    hash_ *= 16777619u;
  }

  //! Compares the hashes first (cheap), then the elements.
  bool operator==(const IntVector<T>& rhs) const noexcept {
    return hash_ == rhs.hash_ && vector_ == rhs.vector_;
  }

  bool operator!=(const IntVector<T>& rhs) const noexcept {
    return !(*this == rhs);
  }

 private:
  Vector vector_;
  unsigned hash_;
};
/**
 * Non-owning view over a container of smart pointers ("boxes") whose
 * iterator hands out the raw pointee pointers instead of the boxes.
 *
 * The view stores only a pair of iterators into the container, so the
 * container has to outlive it.
 *
 * @tparam T container type whose value_type provides element_type and get(),
 *           e.g. std::vector<std::unique_ptr<X>>.
 */
template <typename T>
class UnboxedRange {
  public:
    using BoxedContainer = T;
    using BoxedIterator = typename BoxedContainer::iterator;
    using element_type = typename BoxedContainer::value_type::element_type;

    /**
     * Iterator adaptor: operator* yields a raw pointer to the boxed element,
     * operator-> gives member access on it.
     *
     * NOTE(review): iterator_category is taken over from the boxed iterator
     * although only increment and (in)equality are implemented here, and
     * operator* yields element_type* rather than `reference` -- generic
     * algorithms that trust these typedefs may not compile with this type.
     */
    class iterator { // {{{
      public:
        typedef typename std::iterator_traits<BoxedIterator>::difference_type difference_type;
        typedef element_type value_type;
        typedef element_type* pointer;
        typedef element_type& reference;
        typedef typename std::iterator_traits<BoxedIterator>::iterator_category iterator_category;

        explicit iterator(BoxedIterator boxed) : it_(boxed) {}

        // operator-> has to return a pointer (or a type that itself provides
        // operator->); returning a reference made `it->member` ill-formed.
        const element_type* operator->() const { return (*it_).get(); }
        element_type* operator->() { return (*it_).get(); }

        const element_type* operator*() const { return (*it_).get(); }
        element_type* operator*() { return (*it_).get(); }

        iterator& operator++()
        {
            ++it_;
            return *this;
        }

        //! Post-increment: advances, and returns the position held before.
        iterator operator++(int)
        {
            iterator previous = *this;
            ++it_;
            return previous;
        }

        bool operator==(const iterator& other) const { return it_ == other.it_; }
        bool operator!=(const iterator& other) const { return it_ != other.it_; }

      private:
        BoxedIterator it_;
    }; // }}}

    UnboxedRange(BoxedIterator begin, BoxedIterator end) : begin_(begin), end_(end) {}
    explicit UnboxedRange(BoxedContainer& c) : begin_(c.begin()), end_(c.end()) {}
    // A const container is viewed through mutable iterators; the const is
    // cast away here exactly as the original constructor did.
    explicit UnboxedRange(const BoxedContainer& c) : UnboxedRange{const_cast<BoxedContainer&>(c)} {}

    iterator begin() const { return iterator{begin_}; }
    iterator end() const { return iterator{end_}; }
    iterator cbegin() const { return iterator{begin_}; }
    iterator cend() const { return iterator{end_}; }

    // Measured on the boxed iterators: the wrapper has no operator-, so
    // std::distance on it does not compile for random-access containers.
    size_t size() const { return static_cast<size_t>(std::distance(begin_, end_)); }

  private:
    BoxedIterator begin_;
    BoxedIterator end_;
};
+ * + * Good examples are: + * + * \code + * std::vector> numbers; + * // ... + * for (int number: unbox(numbers)) { + * // ... juse use number here, instead of number.get() or *number. + * }; + * \endcode + */ +template +UnboxedRange unbox(BoxedContainer& boxedContainer) +{ + return UnboxedRange(boxedContainer); +} + +template +UnboxedRange unbox(const BoxedContainer& boxedContainer) +{ + return UnboxedRange(boxedContainer); +} + +} // namespace regex_dfa::util diff --git a/src/regex_dfa/util/iterator-detail.h b/src/regex_dfa/util/iterator-detail.h new file mode 100644 index 0000000000..6f96cc5bd2 --- /dev/null +++ b/src/regex_dfa/util/iterator-detail.h @@ -0,0 +1,169 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +namespace regex_dfa::util::detail { + +template +struct reversed { + const Container container; + + auto begin() { return container.crbegin(); } + auto end() { return container.crend(); } +}; + +template +struct indexed { + Container& container; + + struct iterator { + typename Container::iterator iter; + std::size_t index = 0; + + iterator& operator++() + { + ++iter; + ++index; + return *this; + } + + iterator& operator++(int) + { + ++*this; + return *this; + } + + auto operator*() const { return std::make_pair(index, *iter); } + + bool operator==(const iterator& rhs) const noexcept { return iter == rhs.iter; } + bool operator!=(const iterator& rhs) const noexcept { return iter != rhs.iter; } + }; + + struct const_iterator { + typename Container::const_iterator iter; + std::size_t index = 0; + + const_iterator& operator++() + { + ++iter; + ++index; + return *this; + } + + const_iterator& operator++(int) + { + ++*this; + return *this; + } + + auto 
operator*() const { return std::make_pair(index, *iter); } + + bool operator==(const const_iterator& rhs) const noexcept { return iter == rhs.iter; } + bool operator!=(const const_iterator& rhs) const noexcept { return iter != rhs.iter; } + }; + + auto begin() const + { + if constexpr (std::is_const::value) + return const_iterator{container.cbegin()}; + else + return iterator{container.begin()}; + } + + auto end() const + { + if constexpr (std::is_const::value) + return const_iterator{container.cend()}; + else + return iterator{container.end()}; + } +}; + +template +struct filter { + Container& container; + Lambda proc; + + struct iterator { + using iterator_category = std::forward_iterator_tag; + using value_type = typename Container::value_type; + using difference_type = long; + using pointer = value_type*; + using reference = value_type&; + + typename Container::iterator i; + typename Container::iterator e; + Lambda filter; + + auto operator*() const { return *i; } + + iterator& operator++() + { + ++i; + while (i != e && !filter(*i)) + ++i; + return *this; + } + + iterator& operator++(int) { return ++*this; } + + bool operator==(const iterator& rhs) const noexcept { return i == rhs.i; } + bool operator!=(const iterator& rhs) const noexcept { return !(*this == rhs); } + }; + + struct const_iterator { + typename Container::const_iterator i; + typename Container::const_iterator e; + Lambda filter; + + auto operator*() const { return *i; } + + const_iterator& operator++() + { + ++i; + while (i != e && !filter(*i)) + ++i; + return *this; + } + + const_iterator& operator++(int) { return ++*this; } + + bool operator==(const const_iterator& rhs) const noexcept { return i == rhs.i; } + bool operator!=(const const_iterator& rhs) const noexcept { return !(*this == rhs); } + }; + + auto begin() const + { + if constexpr (std::is_const::value) + { + auto i = const_iterator{std::cbegin(container), std::cend(container), proc}; + while (i != end() && !proc(*i)) + ++i; + return 
i; + } + else + { + auto i = iterator{std::begin(container), std::end(container), proc}; + while (i != end() && !proc(*i)) + ++i; + return i; + } + } + + auto end() const + { + if constexpr (std::is_const::value) + return const_iterator{std::cend(container), std::cend(container), proc}; + else + return iterator{std::end(container), std::end(container), proc}; + } +}; + +} // namespace regex_dfa::util::detail diff --git a/src/regex_dfa/util/iterator.h b/src/regex_dfa/util/iterator.h new file mode 100644 index 0000000000..60240ba12a --- /dev/null +++ b/src/regex_dfa/util/iterator.h @@ -0,0 +1,106 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace regex_dfa::util { + +template +inline auto reversed(Container&& c) +{ + if constexpr (std::is_reference::value) + return detail::reversed{std::forward(c)}; + else + return detail::reversed{std::forward(c)}; +} + +template +inline auto indexed(const Container& c) +{ + return typename std::add_const>::type{c}; +} + +template +inline auto indexed(Container& c) +{ + return detail::indexed{c}; +} + +template +inline auto translate(const Container& container, Lambda mapfn) { + using namespace std; + using T = decltype(mapfn(*begin(container))); + + vector out; + out.reserve(distance(begin(container), end(container))); + transform(begin(container), end(container), back_inserter(out), move(mapfn)); + + return out; +} + +template +inline std::string join(const Container& container, const std::string& separator = ", ") +{ + std::stringstream out; + + for (const auto&& [i, v] : indexed(container)) + if (i) + out << separator << v; + else + out << v; + + return 
out.str(); +} + +template +inline auto filter(std::initializer_list&& c, Lambda proc) +{ + return typename std::add_const, Lambda>>::type{c, proc}; +} + +template +inline auto filter(const Container& c, Lambda proc) +{ + return typename std::add_const>::type{c, proc}; +} + +template +inline auto filter(Container& c, Lambda proc) +{ + return detail::filter{c, proc}; +} + +/** + * Finds the last occurence of a given element satisfying @p test. + * + * @returns the iterator representing the last item satisfying @p test or @p end if none found. + */ +template +auto find_last(const Container& container, Test test) -> decltype(std::cbegin(container)) +{ + auto begin = std::cbegin(container); + auto end = std::cend(container); + + for (auto i = std::prev(end); i != begin; --i) + if (test(*i)) + return i; + + if (test(*begin)) + return begin; + else + return end; +} + +} // namespace regex_dfa::util diff --git a/src/regex_dfa/util/iterator_test.cpp b/src/regex_dfa/util/iterator_test.cpp new file mode 100644 index 0000000000..c64f67e6f0 --- /dev/null +++ b/src/regex_dfa/util/iterator_test.cpp @@ -0,0 +1,179 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include + +#include +#include +#include +#include + +using namespace std; +using namespace regex_dfa::util; + +TEST(util_iterator_reversed, empty) +{ + const vector v; + auto x = reversed(v); + auto i = begin(x); + ASSERT_TRUE(i == end(x)); +} + +TEST(util_iterator_reversed, one) +{ + const vector v { 1 }; + auto x = reversed(v); + auto i = begin(x); + ASSERT_EQ(1, *i); + i++; + ASSERT_TRUE(i == end(x)); +} + +TEST(util_iterator_reversed, many) +{ + const vector v { 1, 2, 3 }; + auto x = reversed(v); + auto i = begin(x); + ASSERT_EQ(3, *i); + i++; + ASSERT_EQ(2, *i); + i++; + ASSERT_EQ(1, *i); + i++; + ASSERT_TRUE(i == end(x)); +} + +TEST(util_iterator_indexed, many_const) +{ + const vector v { 10, 20, 30 }; + const auto x = indexed(v); + static_assert(is_const::value); + auto i = begin(x); + + ASSERT_EQ(0, (*i).first); + ASSERT_EQ(10, (*i).second); + i++; + + ASSERT_EQ(1, (*i).first); + ASSERT_EQ(20, (*i).second); + i++; + + ASSERT_EQ(2, (*i).first); + ASSERT_EQ(30, (*i).second); + i++; + + ASSERT_TRUE(i == end(x)); +} + +TEST(util_iterator_indexed, many) +{ + vector v { "zero", "one", "two" }; + auto x = indexed(v); + auto i = begin(x); + + ASSERT_EQ(0, (*i).first); + ASSERT_EQ("zero", (*i).second); + i++; + + ASSERT_EQ(1, (*i).first); + ASSERT_EQ("one", (*i).second); + i++; + + ASSERT_EQ(2, (*i).first); + ASSERT_EQ("two", (*i).second); + i++; + + ASSERT_TRUE(i == end(x)); +} + +TEST(util_iterator_indexed, range_based_for_loop) +{ + log("const:"); + const vector v1 { 10, 20, 30 }; + for (const auto&& [index, value]: indexed(v1)) + logf("index {}, value {}", index, value); + + log("non-const:"); + vector v2 { 10, 20, 30 }; + for (const auto&& [index, value]: indexed(v2)) + logf("index {}, value {}", index, value); +} + +TEST(util_iterator_filter, for_range) +{ + const vector nums = { 1, 2, 3, 4 }; + vector odds; + for (const int i: filter(nums, [](int x) { return x % 2 != 
0; })) + odds.push_back(i); + + ASSERT_EQ(2, odds.size()); + EXPECT_EQ(1, odds[0]); + EXPECT_EQ(3, odds[1]); +} + +TEST(util_iterator_filter, count_proc_invocations) +{ + static const array numbers = { 1, 2, 3, 4 }; + int count = 0; + auto counter = [&](int) { + ++count; + return true; + }; + const auto f = filter(numbers, counter); + for_each(begin(f), end(f), [](int) {}); + ASSERT_EQ(4, count); +} + +TEST(util_iterator_filter, for_range_initializer_list) +{ + static const array numbers = { 1, 2, 3, 4 }; + vector odds; + auto f_odd = [&](int x) { + logf("f_odd: x={0}", x); + return x % 2 != 0; + }; + for (const int i: filter(numbers, f_odd)) + odds.push_back(i); + + ASSERT_EQ(2, odds.size()); + EXPECT_EQ(1, odds[0]); + EXPECT_EQ(3, odds[1]); +} + +TEST(util_iterator_translate, vector) +{ + const vector in { 1, 2, 3, 4 }; + const vector out = translate(in, [](auto i) -> int { return int(i * 2); }); + + for (const auto&& [i, v]: indexed(out)) + logf("out[{}] = {}", i, v); + + ASSERT_EQ(4, out.size()); + + EXPECT_EQ(2, out[0]); + EXPECT_EQ(4, out[1]); + EXPECT_EQ(6, out[2]); + EXPECT_EQ(8, out[3]); +} + +TEST(util_iterator_translate, chain_translate_join) +{ + const vector in { 1, 2, 3, 4 }; + const string out { join(translate(in, [](int i) -> string { return to_string(i); }), ", ") }; + + ASSERT_EQ("1, 2, 3, 4", out); +} + +TEST(util_iterator, find_last) +{ + const vector v { 1, 2, 3, 4 }; + const auto i = find_last(v, [](int i) { return i % 2 != 0; }); // find last odd value -> 3 + + ASSERT_TRUE(i != end(v)); + ASSERT_EQ(3, *i); +} diff --git a/src/regex_dfa/util/literals.h b/src/regex_dfa/util/literals.h new file mode 100644 index 0000000000..427822539e --- /dev/null +++ b/src/regex_dfa/util/literals.h @@ -0,0 +1,73 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. 
You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include +#include +#include + +namespace regex_dfa::util::literals { + +/** + * Strips a multiline string's indentation prefix. + * + * Example: + * \code + * string s = R"(|line one + * |line two + * |line three + * )"_multiline; + * fmt::print(s); + * \endcode + * + * This prints three lines: @c "line one\nline two\nline three\n" + */ +inline std::string operator""_multiline(const char* text, size_t size) +{ + if (!*text) + return {}; + + enum class State { + LineData, + SkipUntilPrefix, + }; + + constexpr char LF = '\n'; + State state = State::LineData; + std::stringstream sstr; + char sep = *text++; + + while (*text) + { + switch (state) + { + case State::LineData: + if (*text == LF) + { + state = State::SkipUntilPrefix; + sstr << *text++; + } + else + sstr << *text++; + break; + case State::SkipUntilPrefix: + if (*text == sep) + { + state = State::LineData; + text++; + } + else + text++; + break; + } + } + + return sstr.str(); +} + +} // namespace regex_dfa::util::literals diff --git a/src/regex_dfa/util/overloaded.h b/src/regex_dfa/util/overloaded.h new file mode 100644 index 0000000000..733d201c05 --- /dev/null +++ b/src/regex_dfa/util/overloaded.h @@ -0,0 +1,21 @@ +// This file is part of the "klex" project, http://github.com/christianparpart/klex> +// (c) 2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +// This is a nice helper for conviniently using std::visit() with an arbitrary list of lambdas as +// overload for pattern matching the variant's input type + +template +struct overloaded : Ts... +{ + using Ts::operator()...; +}; + +template +overloaded(Ts...) 
-> overloaded; + diff --git a/src/regex_dfa/util/testing.cpp b/src/regex_dfa/util/testing.cpp new file mode 100644 index 0000000000..b8ecaecdaa --- /dev/null +++ b/src/regex_dfa/util/testing.cpp @@ -0,0 +1,610 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) + #include +#else + #include +#endif + +using namespace std; + +namespace regex_dfa::util::testing +{ + +auto static constexpr colorsReset = AnsiColor::codes(); +auto static constexpr colorsTestCaseHeader = AnsiColor::codes(); +auto static constexpr colorsError = AnsiColor::codes(); +auto static constexpr colorsOk = AnsiColor::codes(); +auto static constexpr colorsLog = AnsiColor::codes(); + +int main(int argc, const char* argv[]) +{ + return UnitTest::instance()->main(argc, argv); +} + +bool beginsWith(const string& str, const string_view& prefix) +{ + if (str.length() < prefix.length()) + { + return false; + } + + return string_view(&str[0], prefix.length()) == prefix; +} + +// ############################################################################ + +class BailOutException +{ + public: + BailOutException() {} +}; + +// ############################################################################ + +Environment::~Environment() +{ +} + +void Environment::SetUp() +{ +} + +void Environment::TearDown() +{ +} + +// ############################################################################ + +Test::~Test() +{ +} + +void Test::SetUp() +{ +} + +void Test::TearDown() +{ +} + +void Test::log(const string& message) +{ + UnitTest::instance()->log(message); +} + +void Test::reportUnhandledException(const 
exception& e) +{ + UnitTest::instance()->reportUnhandledException(e); +} + +// ############################################################################ + +TestInfo::TestInfo(const string& testCaseName, + const string& testName, + bool enabled, + unique_ptr&& testFactory): + testCaseName_(testCaseName), testName_(testName), enabled_(enabled), testFactory_(move(testFactory)) +{ +} + +// ############################################################################ + +UnitTest::UnitTest(): + environments_(), + testCases_(), + activeTests_(), + repeats_(1), + printProgress_(false), + printSummaryDetails_(true), + currentTestCase_(nullptr), + currentCount_(0), + successCount_(0), + failCount_(0) +{ +} + +UnitTest::~UnitTest() +{ +} + +UnitTest* UnitTest::instance() +{ + static UnitTest unitTest; + return &unitTest; +} + +void UnitTest::randomizeTestOrder() +{ + unsigned int seed = static_cast(chrono::system_clock::now().time_since_epoch().count()); + + shuffle(activeTests_.begin(), activeTests_.end(), default_random_engine(seed)); +} + +void UnitTest::sortTestsAlphabetically() +{ + sort(activeTests_.begin(), activeTests_.end(), [this](size_t a, size_t b) -> bool { + TestInfo* left = testCases_[a].get(); + TestInfo* right = testCases_[b].get(); + + if (left->testCaseName() < right->testCaseName()) + return true; + + if (left->testCaseName() == right->testCaseName()) + return left->testName() < right->testName(); + + return false; + }); +} + +bool initializeTTY() +{ +#if defined(_WIN32) && defined(ENABLE_VIRTUAL_TERMINAL_PROCESSING) + HANDLE output = GetStdHandle(STD_OUTPUT_HANDLE); + if (output == INVALID_HANDLE_VALUE) + return false; + + DWORD mode = 0; + if (!GetConsoleMode(output, &mode)) + return false; + + mode |= ENABLE_VIRTUAL_TERMINAL_PROCESSING; + if (!SetConsoleMode(output, mode)) + return false; +#endif + + return true; +} + +int UnitTest::main(int argc, const char* argv[]) +{ + initializeTTY(); + // TODO: add CLI parameters (preferably gtest compatible) + 
// + // --no-color | --color explicitely enable/disable color output + // --filter=REGEX filter tests by regular expression + // --exclude=REGEX excludes tests by regular expressions + // --randomize randomize test order + // --repeats=NUMBER repeats tests given number of times + // --list[-tests] Just list the tests and exit. + + Flags flags; + flags.defineBool("help", 'h', "Prints this help and terminates.") + .defineBool("verbose", 'v', "Prints to console in debug log level.") + .defineString("filter", 'f', "GLOB", "Filters tests by given glob.", "*") + .defineString("exclude", 'e', "GLOB", "Excludes tests by given glob.", "") + .defineBool("list", 'l', "Prints all tests and exits.") + .defineBool("randomize", 'R', "Randomizes test order.") + .defineBool("sort", 's', "Sorts tests alphabetically ascending.") + .defineBool("no-progress", 0, "Avoids printing progress.") + .defineNumber("repeat", 'r', "COUNT", "Repeat tests given number of times.", 1); + + try + { + flags.parse(argc, argv); + } + catch (const exception& ex) + { + fprintf(stderr, "Failed to parse flags. %s\n", ex.what()); + return EXIT_FAILURE; + } + + if (flags.getBool("help")) + { + printf("%s\n", flags.helpText().c_str()); + return EXIT_SUCCESS; + } + + verbose_ = flags.getBool("verbose"); + + string filter = flags.getString("filter"); + string exclude = flags.getString("exclude"); + repeats_ = flags.getNumber("repeat"); + printProgress_ = !flags.getBool("no-progress"); + + if (flags.getBool("randomize")) + randomizeTestOrder(); + else if (flags.getBool("sort")) + sortTestsAlphabetically(); + + filterTests(filter, exclude); + + if (flags.getBool("list")) + { + printTestList(); + return EXIT_SUCCESS; + } + + run(); + + return failCount_ == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} + +void UnitTest::filterTests(const string& filter, const string& exclude) +{ + // if (filter != "*") { ... 
} + + vector filtered; + for (size_t i = 0, e = activeTests_.size(); i != e; ++i) + { + TestInfo* testInfo = testCases_[activeTests_[i]].get(); + string matchName = fmt::format("{}.{}", testInfo->testCaseName(), testInfo->testName()); + +#if defined(_WIN32) || defined(_WIN64) + if (!exclude.empty() && PathMatchSpec(matchName.c_str(), exclude.c_str()) == TRUE) + continue; // exclude this one + + if (PathMatchSpec(matchName.c_str(), filter.c_str()) == TRUE) + filtered.push_back(activeTests_[i]); +#else + const int flags = 0; + + if (!exclude.empty() && fnmatch(exclude.c_str(), matchName.c_str(), flags) == 0) + continue; // exclude this one + + if (fnmatch(filter.c_str(), matchName.c_str(), flags) == 0) + { + filtered.push_back(activeTests_[i]); + } +#endif + } + activeTests_ = move(filtered); +} + +void UnitTest::run() +{ + for (auto& env: environments_) + { + env->SetUp(); + } + + for (auto& init: initializers_) + { + init->invoke(); + } + + for (int i = 0; i < repeats_; i++) + { + runAllTestsOnce(); + } + + for (auto& env: environments_) + { + env->TearDown(); + } + + printSummary(); +} + +void UnitTest::printTestList() +{ + for (size_t i = 0, e = activeTests_.size(); i != e; ++i) + { + TestInfo* testCase = testCases_[activeTests_[i]].get(); + printf("%4zu. %s.%s\n", i + 1, testCase->testCaseName().c_str(), testCase->testName().c_str()); + } +} + +void UnitTest::printSummary() +{ + // print summary + fmt::print("{}Finished running {} tests ({} repeats). {} success, {} failed, {} disabled.{}\n", + failCount_ ? 
colorsError.data() : colorsOk.data(), + repeats_ * activeTests_.size(), + repeats_, + successCount_, + failCount_, + disabledCount(), + colorsReset.data()); + + if (printSummaryDetails_ && !failures_.empty()) + { + printf("================================\n"); + printf(" Summary:\n"); + printf("================================\n"); + + for (size_t i = 0, e = failures_.size(); i != e; ++i) + { + const auto& failure = failures_[i]; + fmt::print("{}{}{}\n", colorsError.data(), failure, colorsReset.data()); + } + } +} + +size_t UnitTest::enabledCount() const +{ + size_t count = 0; + + for (size_t i = 0, e = activeTests_.size(); i != e; ++i) + { + if (testCases_[activeTests_[i]]->isEnabled()) + { + count++; + } + } + + return count; +} + +size_t UnitTest::disabledCount() const +{ + size_t count = 0; + + for (size_t i = 0, e = activeTests_.size(); i != e; ++i) + { + if (!testCases_[activeTests_[i]]->isEnabled()) + { + count++; + } + } + + return count; +} + +void UnitTest::runAllTestsOnce() +{ + const size_t totalCount = repeats_ * enabledCount(); + + for (size_t i = 0, e = activeTests_.size(); i != e; ++i) + { + TestInfo* testCase = testCases_[activeTests_[i]].get(); + unique_ptr test = testCase->createTest(); + + if (!testCase->isEnabled()) + continue; + + currentTestCase_ = testCase; + currentCount_++; + size_t percentage = currentCount_ * 100 / totalCount; + + if (printProgress_) + { + fmt::print("{}{:>3} Running test: {}.{}{}\n", + colorsTestCaseHeader.data(), + percentage, + testCase->testCaseName(), + testCase->testName(), + colorsReset.data()); + } + + int failed = 0; + + try + { + test->SetUp(); + } + catch (const BailOutException&) + { + // SHOULD NOT HAPPEND: complain about it + failed++; + } + catch (...) 
+ { + // TODO: report failure upon set-up phase, hence skipping actual test + failed++; + } + + if (!failed) + { + try + { + test->TestBody(); + } + catch (const BailOutException&) + { + // no-op + failed++; + } + catch (const exception& ex) + { + reportUnhandledException(ex); + failed++; + } + catch (...) + { + reportMessage("Unhandled exception caught in test.", false); + failed++; + } + + try + { + test->TearDown(); + } + catch (const BailOutException&) + { + // SHOULD NOT HAPPEND: complain about it + failed++; + } + catch (...) + { + // TODO: report failure in tear-down + failed++; + } + + if (!failed) + { + successCount_++; + } + } + } +} + +void UnitTest::reportError( + const char* fileName, int lineNo, bool fatal, const char* actual, const error_code& ec) +{ + string message = fmt::format("{}:{}: Failure\n" + " Value of: {}\n" + " Expected: success\n" + " Actual: ({}) {}\n", + fileName, + lineNo, + actual, + ec.category().name(), + ec.message()); + + reportMessage(message, fatal); +} + +void UnitTest::reportError(const char* fileName, + int lineNo, + bool fatal, + const char* expected, + const error_code& expectedEvaluated, + const char* actual, + const error_code& actualEvaluated) +{ + string message = fmt::format("{}:{}: Failure\n" + " Value of: {}\n" + " Expected: ({}) {}\n" + " Actual: ({}) {}\n", + fileName, + lineNo, + actual, + expectedEvaluated.category().name(), + expectedEvaluated.message(), + actualEvaluated.category().name(), + actualEvaluated.message()); + + reportMessage(message, fatal); +} + +void UnitTest::reportBinary(const char* fileName, + int lineNo, + bool fatal, + const char* expected, + const char* actual, + const string& actualEvaluated, + const char* op) +{ + string message = fmt::format("{}:{}: Failure\n" + " Value of: {}\n" + " Expected: {} {}\n" + " Actual: {}\n", + fileName, + lineNo, + actual, + expected, + op, + actualEvaluated); + + reportMessage(message, fatal); +} + +void UnitTest::reportUnhandledException(const exception& 
e) +{ + string message = fmt::format("Unhandled Exception\n" + " Type: {}\n" + " What: {}\n", + typeid(e).name(), + e.what()); + reportMessage(message, false); +} + +void UnitTest::reportEH(const char* fileName, + int lineNo, + bool fatal, + const char* program, + const char* expected, + const char* actual) +{ + string message = fmt::format("{}:{}: {}\n" + " Value of: {}\n" + " Expected: {}\n" + " Actual: {}\n", + fileName, + lineNo, + actual ? "Unexpected exception caught" : "No exception caught", + program, + expected, + actual); + + reportMessage(message, fatal); +} + +void UnitTest::reportMessage(const char* fileName, int lineNo, bool fatal, const string& msg) +{ + string message = fmt::format("{}:{}: {}\n", fileName, lineNo, msg); + reportMessage(message, fatal); +} + +void UnitTest::reportMessage(const string& message, bool fatal) +{ + fmt::print("{}{}{}\n", colorsError.data(), message, colorsReset.data()); + + failCount_++; + failures_.emplace_back(message); + + if (fatal) + { + throw BailOutException(); + } +} + +void UnitTest::addEnvironment(unique_ptr&& env) +{ + environments_.emplace_back(move(env)); +} + +Callback* UnitTest::addInitializer(unique_ptr&& cb) +{ + initializers_.emplace_back(move(cb)); + return initializers_.back().get(); +} + +TestInfo* UnitTest::addTest(const char* testCaseName, + const char* testName, + unique_ptr&& testFactory) +{ + testCases_.emplace_back( + make_unique(testCaseName, + testName, + !beginsWith(testCaseName, "DISABLED_") && !beginsWith(testName, "DISABLED_"), + move(testFactory))); + + activeTests_.emplace_back(activeTests_.size()); + + return testCases_.back().get(); +} + +void UnitTest::log(const string& message) +{ + if (verbose_) + { + size_t bol = 0; + size_t eol = 0; + do + { + eol = message.find('\n', bol); + string line = message.substr(bol, eol - bol); + if (eol + 1 < message.size() || (!line.empty() && line != "\n")) + { + fmt::print("{}{}.{}:{} {}\n", + colorsLog.data(), + currentTestCase_->testCaseName(), + 
currentTestCase_->testName(), + colorsReset.data(), + line); + } + bol = eol + 1; + } while (eol != string::npos); + } +} + +} // namespace regex_dfa::util::testing diff --git a/src/regex_dfa/util/testing.h b/src/regex_dfa/util/testing.h new file mode 100644 index 0000000000..5e726a5820 --- /dev/null +++ b/src/regex_dfa/util/testing.h @@ -0,0 +1,425 @@ +// This file is part of the "x0" project, http://github.com/christianparpart/x0> +// (c) 2009-2018 Christian Parpart +// +// Licensed under the MIT License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of +// the License at: http://opensource.org/licenses/MIT + +#pragma once + +#include + +#include +#include +#include + +namespace regex_dfa::util::testing { + +#define TEST_ENV_SETUP(Name) \ + class _CALLBACK_NAME(Name) : public ::regex_dfa::util::testing::Callback { \ + public: \ + void invoke() override; \ + private: \ + static ::regex_dfa::util::testing::Callback* const ref_ [[maybe_unused]]; \ + }; \ + \ + ::regex_dfa::util::testing::Callback* const \ + _CALLBACK_NAME(Name)::ref_ = \ + ::regex_dfa::util::testing::UnitTest::instance()->addInitializer( \ + std::make_unique<_CALLBACK_NAME>(Name)); \ + \ + void _CALLBACK_NAME(Name)::invoke() + +#define _CALLBACK_NAME(Name) Callback_##Name + +#define TEST_ENV_TEARDOWN(Name) // TODO + +#define TEST_ENV_F(EnvName) \ + ::regex_dfa::util::testing::UnitTest::instance()->addEnvironment( \ + std::unique_ptr<::regex_dfa::util::testing::Environment>(EnvName)); + +// ############################################################################ + +#define TEST(testCase, testName) _CREATE_TEST(testCase, testName, ::regex_dfa::util::testing::Test) +#define TEST_F(testFixture, testName) _CREATE_TEST(testFixture, testName, testFixture) + +#define EXPECT_EQ(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, ==) + +#define EXPECT_NE(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, 
false, expected, actual, !=) + +#define EXPECT_GE(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, >=) + +#define EXPECT_LE(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, <=) + +#define EXPECT_GT(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, >) + +#define EXPECT_LT(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, <) + +#define EXPECT_TRUE(actual) \ + _EXPECT_BOOLEAN(__FILE__, __LINE__, false, true, actual) + +#define EXPECT_FALSE(actual) \ + _EXPECT_BOOLEAN(__FILE__, __LINE__, false, false, actual) + +#define EXPECT_NEAR(expected, actual, diff) // TODO + +#define REPORT_ERROR(message) \ + do { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportMessage( \ + __FILE__, __LINE__, false, (message)); \ + } while (0) + +#define EXPECT_ERROR_CODE_SUCCESS(errorCode) \ + if (errorCode) { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ + __FILE__, __LINE__, false, #errorCode, errorCode); \ + } + +#define EXPECT_ERROR_CODE(expected, actual) \ + do { \ + std::error_code actual_ {(actual)}; \ + if (actual_ != (expected)) { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ + __FILE__, __LINE__, false, \ + #expected, (expected), \ + #actual, actual_); \ + } \ + } while (0) + +#define EXPECT_THROW(program, ExceptionType) \ + do { \ + try { \ + program; \ + ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ + __FILE__, __LINE__, false, #program, #ExceptionType, \ + ""); \ + } catch (const ExceptionType&) { \ + break; \ + } catch (...) { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ + __FILE__, __LINE__, false, #program, #ExceptionType, ""); \ + } \ + } while (0) + +#define EXPECT_ANY_THROW(program) \ + do { \ + try { \ + program; \ + ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ + __FILE__, __LINE__, false, #program, "", \ + ""); \ + } catch (...) 
{ \ + } \ + } while (0) + +// ############################################################################ + +#define ASSERT_EQ(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, ==) + +#define ASSERT_NE(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, !=) + +#define ASSERT_GE(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, >=) + +#define ASSERT_LE(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, <=) + +#define ASSERT_GT(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, >) + +#define ASSERT_LT(expected, actual) \ + _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, <) + +#define ASSERT_TRUE(actual) \ + _EXPECT_BOOLEAN(__FILE__, __LINE__, true, true, actual) + +#define ASSERT_FALSE(actual) \ + _EXPECT_BOOLEAN(__FILE__, __LINE__, true, false, actual) + +#define ASSERT_NEAR(expected, actual, diff) // TODO + +#define ASSERT_ERROR_CODE_SUCCESS(errorCode) \ + if (errorCode) { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ + __FILE__, __LINE__, true, #errorCode, errorCode); \ + } + +#define ASSERT_ERROR_CODE(expected, actual) \ + do { \ + std::error_code actual_ {(actual)}; \ + if (actual_ != (expected)) { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ + __FILE__, __LINE__, true, \ + #expected, (expected), \ + #actual, actual_); \ + } \ + } while (0) + +#define ASSERT_THROW(program, ExceptionType) \ + do { \ + try { \ + program; \ + ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ + __FILE__, __LINE__, true, #program, #ExceptionType, \ + ""); \ + } catch (const ExceptionType&) { \ + break; \ + } catch (...) 
{ \ + ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ + __FILE__, __LINE__, true, #program, #ExceptionType, ""); \ + } \ + } while (0) + +#define ASSERT_ANY_THROW(program) \ + do { \ + try { \ + program; \ + ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ + __FILE__, __LINE__, true, #program, "", \ + ""); \ + } catch (...) { \ + } \ + } while (0) + +// ############################################################################ + +#define _EXPECT_BOOLEAN(fileName, lineNo, fatal, expected, actual) \ + do { \ + bool actualEvaluated = !! (actual); \ + bool failed = (expected && !actualEvaluated) \ + || (!expected && actualEvaluated); \ + if (failed) { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportBinary( \ + __FILE__, __LINE__, fatal, #expected, #actual, \ + ::fmt::format("{}", (actualEvaluated)), ""); \ + } \ + } while (0) + +#define _EXPECT_BINARY(fileName, lineNo, fatal, expected, actual, op) \ + do { \ + auto actual_ = (actual); \ + if (!((expected) op (actual_))) { \ + ::regex_dfa::util::testing::UnitTest::instance()->reportBinary( \ + __FILE__, __LINE__, fatal, #expected, #actual, \ + ::fmt::format("{}", actual_), #op); \ + } \ + } while (0) + +#define _TEST_CLASS_NAME(testCaseName, testName) \ + Test_##testCaseName##testName + +#define _CREATE_TEST(testCaseName, testName, ParentClass) \ +class _TEST_CLASS_NAME(testCaseName, testName) : public ParentClass { \ + public: \ + _TEST_CLASS_NAME(testCaseName, testName)() {} \ + \ + private: \ + virtual void TestBody(); \ + \ + static ::regex_dfa::util::testing::TestInfo* const test_info_; \ +}; \ + \ +::regex_dfa::util::testing::TestInfo* const \ +_TEST_CLASS_NAME(testCaseName, testName)::test_info_ = \ + ::regex_dfa::util::testing::UnitTest::instance()->addTest( \ + #testCaseName, #testName, \ + std::make_unique< \ + ::regex_dfa::util::testing::TestFactoryTemplate< \ + _TEST_CLASS_NAME(testCaseName, testName)>>()); \ + \ +void _TEST_CLASS_NAME(testCaseName, 
testName)::TestBody() + +// ############################################################################ + +int main(int argc, const char* argv[]); + +// ############################################################################ + +class Callback { + public: + virtual ~Callback() {} + + virtual void invoke() = 0; +}; + +/** + * Environment hooks. + */ +class Environment { + public: + virtual ~Environment(); + + virtual void SetUp(); + virtual void TearDown(); +}; + +/** + * interface to a single test. + */ +class Test { + public: + virtual ~Test(); + + virtual void SetUp(); + virtual void TestBody() = 0; + virtual void TearDown(); + + void log(const std::string& message); + + template + void logf(const char* fmt, Args... args); + + void reportUnhandledException(const std::exception& e); +}; + +/** + * API to create one kind of a test. + */ +class TestFactory { + TestFactory(const TestFactory&) = delete; + TestFactory& operator=(const TestFactory&) = delete; + + public: + TestFactory() {} + virtual ~TestFactory() {} + virtual std::unique_ptr createTest() = 0; +}; + +template +class TestFactoryTemplate : public TestFactory { + public: + std::unique_ptr createTest() override { + return std::make_unique(); + } +}; + +/** + * TestInfo describes a single test. 
+ */ +class TestInfo { + TestInfo(const TestInfo&) = delete; + TestInfo& operator=(const TestInfo&) = delete; + + public: + TestInfo(const std::string& testCaseName, + const std::string& testName, + bool enabled, + std::unique_ptr&& testFactory); + + const std::string& testCaseName() const { return testCaseName_; } + const std::string& testName() const { return testName_; } + bool isEnabled() const { return enabled_; } + + std::unique_ptr createTest() { return testFactory_->createTest(); } + + private: + std::string testCaseName_; + std::string testName_; + bool enabled_; + std::unique_ptr testFactory_; +}; + +class UnitTest { + public: + UnitTest(); + ~UnitTest(); + + static UnitTest* instance(); + + int main(int argc, const char* argv[]); + + void randomizeTestOrder(); + void sortTestsAlphabetically(); + void printTestList(); + void filterTests(const std::string& filter, const std::string& exclude); + void run(); + + void addEnvironment(std::unique_ptr&& env); + + Callback* addInitializer(std::unique_ptr&& cb); + + TestInfo* addTest(const char* testCaseName, + const char* testName, + std::unique_ptr&& testFactory); + + void reportError(const char* fileName, + int lineNo, + bool fatal, + const char* actual, + const std::error_code& ec); + + void reportError(const char* fileName, + int lineNo, + bool fatal, + const char* expected, + const std::error_code& expectedEvaluated, + const char* actual, + const std::error_code& actualEvaluated); + + void reportBinary(const char* fileName, + int lineNo, + bool fatal, + const char* expected, + const char* actual, + const std::string& actualEvaluated, + const char* op); + + void reportUnhandledException(const std::exception& e); + + void reportEH(const char* fileName, + int lineNo, + bool fatal, + const char* program, + const char* expected, + const char* actual); + + void reportMessage(const std::string& message, bool fatal); + void reportMessage(const char* fileName, int lineNo, bool fatal, const std::string& message); + + 
void log(const std::string& message); + + template + void logf(const char* format, Args... args) { + log(fmt::format(format, args...)); + } + + private: + void runAllTestsOnce(); + void printSummary(); + size_t enabledCount() const; + size_t disabledCount() const; + + private: + std::vector> environments_; + std::vector> initializers_; + std::vector> testCases_; + + //! ordered list of tests as offsets into testCases_ + std::vector activeTests_; + + int repeats_; + bool verbose_; + bool printProgress_; + bool printSummaryDetails_; + + TestInfo* currentTestCase_; + size_t currentCount_; + size_t successCount_; + int failCount_; + std::vector failures_; +}; + +template +inline void Test::logf(const char* fmt, Args... args) { + UnitTest::instance()->logf(fmt, args...); +} + +} // namespace regex_dfa::util::testing From a0f0605c4f7724c2e9fdd0099ad36c13826d1690 Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Sat, 12 Aug 2023 10:43:43 +0200 Subject: [PATCH 3/5] Adapt to Contour's best code quality & maintenance practice. 
Signed-off-by: Christian Parpart --- .github/workflows/build.yml | 14 + src/regex_dfa/Alphabet.cpp | 6 +- src/regex_dfa/Alphabet.h | 12 +- src/regex_dfa/CMakeLists.txt | 8 +- src/regex_dfa/CharStream.h | 30 +- src/regex_dfa/Compiler.cpp | 26 +- src/regex_dfa/Compiler.h | 14 +- src/regex_dfa/DFA.cpp | 6 +- src/regex_dfa/DFA.h | 30 +- src/regex_dfa/DFABuilder.cpp | 6 +- src/regex_dfa/DFABuilder.h | 13 +- src/regex_dfa/DFABuilder_test.cpp | 12 +- src/regex_dfa/DFAMinimizer.cpp | 14 +- src/regex_dfa/DFAMinimizer.h | 18 +- src/regex_dfa/DotVisitor.h | 2 +- src/regex_dfa/DotWriter.cpp | 4 +- src/regex_dfa/DotWriter_test.cpp | 14 +- src/regex_dfa/Lexable.h | 63 +-- src/regex_dfa/Lexer-inl.h | 20 +- src/regex_dfa/Lexer.h | 61 +-- src/regex_dfa/LexerDef.h | 6 +- src/regex_dfa/Lexer_test.cpp | 506 +++++++++++----------- src/regex_dfa/MultiDFA.cpp | 2 +- src/regex_dfa/NFA.cpp | 24 +- src/regex_dfa/NFA.h | 43 +- src/regex_dfa/NFABuilder.cpp | 36 +- src/regex_dfa/NFABuilder.h | 4 +- src/regex_dfa/NFA_test.cpp | 71 ++-- src/regex_dfa/RegExpr.cpp | 50 +-- src/regex_dfa/RegExpr.h | 29 +- src/regex_dfa/RegExprParser.cpp | 34 +- src/regex_dfa/RegExprParser.h | 47 ++- src/regex_dfa/RegExprParser_test.cpp | 338 ++++++++------- src/regex_dfa/Report.cpp | 6 +- src/regex_dfa/Report.h | 8 +- src/regex_dfa/Rule.h | 44 +- src/regex_dfa/RuleParser.cpp | 115 +++-- src/regex_dfa/RuleParser.h | 102 ++--- src/regex_dfa/RuleParser_test.cpp | 182 ++++---- src/regex_dfa/SourceLocation.cpp | 4 +- src/regex_dfa/State.h | 4 +- src/regex_dfa/State_test.cpp | 6 +- src/regex_dfa/Symbols.h | 36 +- src/regex_dfa/Symbols_test.cpp | 74 ++-- src/regex_dfa/TransitionMap-inl.h | 4 +- src/regex_dfa/TransitionMap.h | 6 +- src/regex_dfa/regex_dfa_test.cpp | 25 ++ src/regex_dfa/util/AnsiColor.h | 153 ------- src/regex_dfa/util/Flags.cpp | 578 ------------------------- src/regex_dfa/util/Flags.h | 171 -------- src/regex_dfa/util/IntVector.h | 40 -- src/regex_dfa/util/UnboxedRange.h | 94 ----- 
src/regex_dfa/util/iterator-detail.h | 288 +++++++------ src/regex_dfa/util/iterator.h | 75 ++-- src/regex_dfa/util/iterator_test.cpp | 115 ++--- src/regex_dfa/util/literals.h | 78 ++-- src/regex_dfa/util/overloaded.h | 21 - src/regex_dfa/util/testing.cpp | 610 --------------------------- src/regex_dfa/util/testing.h | 425 ------------------- 59 files changed, 1397 insertions(+), 3430 deletions(-) create mode 100644 src/regex_dfa/regex_dfa_test.cpp delete mode 100644 src/regex_dfa/util/AnsiColor.h delete mode 100644 src/regex_dfa/util/Flags.cpp delete mode 100644 src/regex_dfa/util/Flags.h delete mode 100644 src/regex_dfa/util/IntVector.h delete mode 100644 src/regex_dfa/util/UnboxedRange.h delete mode 100644 src/regex_dfa/util/overloaded.h delete mode 100644 src/regex_dfa/util/testing.cpp delete mode 100644 src/regex_dfa/util/testing.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7b9da0c303..42dfbe2923 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -215,6 +215,7 @@ jobs: cmake -DCMAKE_BUILD_TYPE=Debug -DLIBTERMINAL_BUILD_BENCH_HEADLESS=ON -DCONTOUR_QT_VERSION=6 -S . 
-B build cmake --build build/ -j2 ./build/src/crispy/crispy_test + ./build/src/regex_dfa/regex_dfa_test ./build/src/vtparser/vtparser_test ./build/src/vtbackend/vtbackend_test rm -rf _deps build @@ -257,6 +258,8 @@ jobs: run: cmake --build build/ - name: "test: crispy" run: ./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser" run: ./build/src/vtparser/vtparser_test - name: "test: vtbackend" @@ -326,6 +329,8 @@ jobs: run: cmake --build build/ --config Release - name: "test: crispy" run: .\build\src\crispy\Release\crispy_test.exe + - name: "test: regex_dfa" + run: .\build\src\regex_dfa\Release\regex_dfa_test.exe - name: "test: vtparser" run: .\build\src\vtparser\Release\vtparser_test.exe - name: "test: vtbackend" @@ -450,6 +455,8 @@ jobs: run: cmake --build build/ -- -j3 - name: "test: crispy" run: ./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser" run: ./build/src/vtparser/vtparser_test - name: "test: vtbackend" @@ -461,6 +468,7 @@ jobs: name: contour-ubuntu2204-tests path: | build/src/crispy/crispy_test + build/src/regex_dfa/regex_dfa_test build/src/vtparser/vtparser_test build/src/vtbackend/vtbackend_test build/src/vtbackend/bench-headless @@ -516,6 +524,8 @@ jobs: # run: cmake --build build/ -- -j3 # - name: "test: crispy" # run: ./build/src/crispy/crispy_test + # - name: "test: regex_dfa" + # run: ./build/src/regex_dfa/regex_dfa_test # - name: "test: vtparser" # run: ./build/src/vtparser/vtparser_test # - name: "test: vtbackend" @@ -664,6 +674,8 @@ jobs: run: cmake --build build/ -- -j3 - name: "test: crispy" run: ./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser" run: ./build/src/vtparser/vtparser_test - name: "test: vtbackend" @@ -728,6 +740,8 @@ jobs: valgrind - name: "test: crispy (via valgrind)" run: valgrind --error-exitcode=64 
./build/src/crispy/crispy_test + - name: "test: regex_dfa" + run: valgrind --error-exitcode=64 ./build/src/regex_dfa/regex_dfa_test - name: "test: vtparser (via valgrind)" run: valgrind --error-exitcode=64 ./build/src/vtparser/vtparser_test - name: "test: vtbackend (via valgrind)" diff --git a/src/regex_dfa/Alphabet.cpp b/src/regex_dfa/Alphabet.cpp index 704c59c7f6..8dccced1e1 100644 --- a/src/regex_dfa/Alphabet.cpp +++ b/src/regex_dfa/Alphabet.cpp @@ -32,10 +32,10 @@ namespace regex_dfa void Alphabet::insert(Symbol ch) { - if (alphabet_.find(ch) == alphabet_.end()) + if (_alphabet.find(ch) == _alphabet.end()) { DEBUG("Alphabet: insert '{:}'", prettySymbol(ch)); - alphabet_.insert(ch); + _alphabet.insert(ch); } } @@ -45,7 +45,7 @@ string Alphabet::to_string() const sstr << '{'; - for (Symbol c: alphabet_) + for (Symbol c: _alphabet) sstr << prettySymbol(c); sstr << '}'; diff --git a/src/regex_dfa/Alphabet.h b/src/regex_dfa/Alphabet.h index eb6e7bf6df..ec6d37cd1a 100644 --- a/src/regex_dfa/Alphabet.h +++ b/src/regex_dfa/Alphabet.h @@ -25,17 +25,17 @@ class Alphabet using set_type = std::set; using iterator = set_type::iterator; - size_t size() const noexcept { return alphabet_.size(); } + [[nodiscard]] size_t size() const noexcept { return _alphabet.size(); } void insert(Symbol ch); - std::string to_string() const; + [[nodiscard]] std::string to_string() const; - const iterator begin() const { return alphabet_.begin(); } - const iterator end() const { return alphabet_.end(); } + [[nodiscard]] iterator begin() const { return _alphabet.begin(); } + [[nodiscard]] iterator end() const { return _alphabet.end(); } private: - set_type alphabet_; + set_type _alphabet; }; } // namespace regex_dfa @@ -54,7 +54,7 @@ struct formatter template constexpr auto format(const regex_dfa::Alphabet& v, FormatContext& ctx) { - return format_to(ctx.out(), "{}", v.to_string()); + return fmt::format_to(ctx.out(), "{}", v.to_string()); } }; } // namespace fmt diff --git 
a/src/regex_dfa/CMakeLists.txt b/src/regex_dfa/CMakeLists.txt index 3b4c594f96..a415336887 100644 --- a/src/regex_dfa/CMakeLists.txt +++ b/src/regex_dfa/CMakeLists.txt @@ -21,7 +21,9 @@ target_include_directories(regex_dfa PUBLIC ${PROJECT_SOURCE_DIR}/src ${CMAKE_SO target_link_libraries(regex_dfa PUBLIC fmt::fmt-header-only) # ---------------------------------------------------------------------------- -if(TESTS) +option(REGEX_DFA_TESTING "Enables building of unittests for regex_dfa library [default: ON]" ON) +if(REGEX_DFA_TESTING) + enable_testing() add_executable(regex_dfa_test regex_dfa_test.cpp DFABuilder_test.cpp @@ -33,9 +35,9 @@ if(TESTS) State_test.cpp Symbols_test.cpp util/iterator_test.cpp - util/testing.cpp ) target_link_libraries(regex_dfa_test PUBLIC regex_dfa) + target_link_libraries(regex_dfa_test PUBLIC Catch2::Catch2) target_link_libraries(regex_dfa_test PUBLIC fmt::fmt-header-only) -endif(TESTS) +endif(REGEX_DFA_TESTING) diff --git a/src/regex_dfa/CharStream.h b/src/regex_dfa/CharStream.h index 79f087eec6..d0d0e2d96f 100644 --- a/src/regex_dfa/CharStream.h +++ b/src/regex_dfa/CharStream.h @@ -27,16 +27,16 @@ class CharStream class StringStream: public CharStream { public: - explicit StringStream(std::string&& s): source_ { std::move(s) } {} + explicit StringStream(std::string&& s): _source { std::move(s) } {} - [[nodiscard]] bool isEof() const noexcept override { return pos_ >= source_.size(); } - char get() override { return source_[pos_++]; } - void rollback(int count) override { pos_ -= count; } - void rewind() override { pos_ = 0; } + [[nodiscard]] bool isEof() const noexcept override { return _pos >= _source.size(); } + char get() override { return _source[_pos++]; } + void rollback(int count) override { _pos -= count; } + void rewind() override { _pos = 0; } private: - std::string source_; - size_t pos_ = 0; + std::string _source; + size_t _pos = 0; }; class StandardStream: public CharStream @@ -44,24 +44,24 @@ class StandardStream: public 
CharStream public: explicit StandardStream(std::istream* source); - [[nodiscard]] bool isEof() const noexcept override { return !source_->good(); } - char get() override { return static_cast(source_->get()); } + [[nodiscard]] bool isEof() const noexcept override { return !_source->good(); } + char get() override { return static_cast(_source->get()); } void rollback(int count) override { - source_->clear(); - source_->seekg(-count, std::ios::cur); + _source->clear(); + _source->seekg(-count, std::ios::cur); } void rewind() override { - source_->clear(); - source_->seekg(initialOffset_, std::ios::beg); + _source->clear(); + _source->seekg(_initialOffset, std::ios::beg); } private: - std::istream* source_; - std::streamoff initialOffset_; + std::istream* _source; + std::streamoff _initialOffset; }; } // namespace regex_dfa diff --git a/src/regex_dfa/Compiler.cpp b/src/regex_dfa/Compiler.cpp index 42e7dca814..676ef26ac0 100644 --- a/src/regex_dfa/Compiler.cpp +++ b/src/regex_dfa/Compiler.cpp @@ -27,12 +27,12 @@ namespace regex_dfa void Compiler::parse(string text) { - parse(make_unique(move(text))); + parse(make_unique(std::move(text))); } void Compiler::parse(unique_ptr stream) { - declareAll(RuleParser { move(stream) }.parseRules()); + declareAll(RuleParser { std::move(stream) }.parseRules()); } void Compiler::declareAll(RuleList rules) @@ -81,7 +81,7 @@ void Compiler::declareAll(RuleList rules) else names_[rule.tag] = rule.name; - rules_.emplace_back(move(rule)); + rules_.emplace_back(std::move(rule)); } } @@ -118,7 +118,7 @@ MultiDFA Compiler::compileMultiDFA(OvershadowMap* overshadows) for (const auto& fa: fa_) dfaMap[fa.first] = DFABuilder { fa.second.clone() }.construct(overshadows); - return constructMultiDFA(move(dfaMap)); + return constructMultiDFA(std::move(dfaMap)); } DFA Compiler::compileDFA(OvershadowMap* overshadows) @@ -134,7 +134,7 @@ DFA Compiler::compileMinimalDFA() LexerDef Compiler::compile() { - return generateTables(compileMinimalDFA(), 
containsBeginOfLine_, move(names_)); + return generateTables(compileMinimalDFA(), containsBeginOfLine_, std::move(names_)); } LexerDef Compiler::compileMulti(OvershadowMap* overshadows) @@ -144,7 +144,7 @@ LexerDef Compiler::compileMulti(OvershadowMap* overshadows) return generateTables(multiDFA, containsBeginOfLine_, names()); } -LexerDef Compiler::generateTables(const DFA& dfa, bool requiresBeginOfLine, const map& names) +LexerDef Compiler::generateTables(const DFA& dfa, bool requiresBeginOfLine, map names) { const Alphabet alphabet = dfa.alphabet(); TransitionMap transitionMap; @@ -161,15 +161,13 @@ LexerDef Compiler::generateTables(const DFA& dfa, bool requiresBeginOfLine, cons // TODO: many initial states ! return LexerDef { { { "INITIAL", dfa.initialState() } }, requiresBeginOfLine, - move(transitionMap), - move(acceptStates), + std::move(transitionMap), + std::move(acceptStates), dfa.backtracking(), - move(names) }; + std::move(names) }; } -LexerDef Compiler::generateTables(const MultiDFA& multiDFA, - bool requiresBeginOfLine, - const map& names) +LexerDef Compiler::generateTables(const MultiDFA& multiDFA, bool requiresBeginOfLine, map names) { const Alphabet alphabet = multiDFA.dfa.alphabet(); TransitionMap transitionMap; @@ -184,8 +182,8 @@ LexerDef Compiler::generateTables(const MultiDFA& multiDFA, acceptStates.emplace(s, *multiDFA.dfa.acceptTag(s)); // TODO: many initial states ! 
- return LexerDef { multiDFA.initialStates, requiresBeginOfLine, move(transitionMap), - move(acceptStates), multiDFA.dfa.backtracking(), move(names) }; + return LexerDef { multiDFA.initialStates, requiresBeginOfLine, std::move(transitionMap), + std::move(acceptStates), multiDFA.dfa.backtracking(), std::move(names) }; } } // namespace regex_dfa diff --git a/src/regex_dfa/Compiler.h b/src/regex_dfa/Compiler.h index 88d2160b81..9e8f1846d2 100644 --- a/src/regex_dfa/Compiler.h +++ b/src/regex_dfa/Compiler.h @@ -48,9 +48,9 @@ class Compiler */ void declareAll(RuleList rules); - const RuleList& rules() const noexcept { return rules_; } - const TagNameMap& names() const noexcept { return names_; } - size_t size() const; + [[nodiscard]] const RuleList& rules() const noexcept { return rules_; } + [[nodiscard]] const TagNameMap& names() const noexcept { return names_; } + [[nodiscard]] size_t size() const; /** * Compiles all previousely parsed rules into a DFA. @@ -81,12 +81,12 @@ class Compiler * * @see Lexer */ - static LexerDef generateTables(const DFA& dfa, bool requiresBeginOfLine, const TagNameMap& names); - static LexerDef generateTables(const MultiDFA& dfa, bool requiresBeginOfLine, const TagNameMap& names); + static LexerDef generateTables(const DFA& dfa, bool requiresBeginOfLine, TagNameMap names); + static LexerDef generateTables(const MultiDFA& dfa, bool requiresBeginOfLine, TagNameMap names); - const std::map& automata() const { return fa_; } + [[nodiscard]] const std::map& automata() const { return fa_; } - bool containsBeginOfLine() const noexcept { return containsBeginOfLine_; } + [[nodiscard]] bool containsBeginOfLine() const noexcept { return containsBeginOfLine_; } private: /** diff --git a/src/regex_dfa/DFA.cpp b/src/regex_dfa/DFA.cpp index e0ee1f12d1..f0df7a8eac 100644 --- a/src/regex_dfa/DFA.cpp +++ b/src/regex_dfa/DFA.cpp @@ -37,7 +37,7 @@ Alphabet DFA::alphabet() const { Alphabet alphabet; for (const State& state: states_) - for (const pair& t: 
state.transitions) + for (pair const t: state.transitions) alphabet.insert(t.first); return alphabet; @@ -118,12 +118,12 @@ void DFA::prepareStateIds(StateId baseId, StateId q0) AcceptMap remapped; for (auto& a: acceptTags_) remapped[transformId(a.first)] = a.second; - acceptTags_ = move(remapped); + acceptTags_ = std::move(remapped); BacktrackingMap backtracking; for (const auto& bt: backtrackStates_) backtracking[transformId(bt.first)] = transformId(bt.second); - backtrackStates_ = move(backtracking); + backtrackStates_ = std::move(backtracking); initialState_ = q0; } diff --git a/src/regex_dfa/DFA.h b/src/regex_dfa/DFA.h index ceb82c4018..a2d4881fab 100644 --- a/src/regex_dfa/DFA.h +++ b/src/regex_dfa/DFA.h @@ -57,16 +57,16 @@ class DFA } //! Retrieves the alphabet of this finite automaton. - Alphabet alphabet() const; + [[nodiscard]] Alphabet alphabet() const; //! Retrieves the initial state. - StateId initialState() const { return initialState_; } + [[nodiscard]] StateId initialState() const { return initialState_; } //! Retrieves the list of available states. - const StateVec& states() const { return states_; } - StateVec& states() { return states_; } + [[nodiscard]] const StateVec& states() const { return states_; } + [[nodiscard]] StateVec& states() { return states_; } - StateIdVec stateIds() const + [[nodiscard]] StateIdVec stateIds() const { StateIdVec v; v.reserve(states_.size()); @@ -76,7 +76,7 @@ class DFA } //! Retrieves the list of accepting states. - std::vector acceptStates() const; + [[nodiscard]] std::vector acceptStates() const; /** * Traverses all states and edges in this NFA and calls @p visitor for each state & edge. 
@@ -89,7 +89,7 @@ class DFA void setInitialState(StateId state); - const TransitionMap& stateTransitions(StateId id) const + [[nodiscard]] const TransitionMap& stateTransitions(StateId id) const { return states_[static_cast(id)].transitions; } @@ -97,7 +97,7 @@ class DFA // {{{ backtracking (for lookahead) void setBacktrack(StateId from, StateId to) { backtrackStates_[from] = to; } - std::optional backtrack(StateId acceptState) const + [[nodiscard]] std::optional backtrack(StateId acceptState) const { if (auto i = backtrackStates_.find(acceptState); i != backtrackStates_.end()) return i->second; @@ -105,15 +105,15 @@ class DFA return std::nullopt; } - const BacktrackingMap& backtracking() const noexcept { return backtrackStates_; } + [[nodiscard]] const BacktrackingMap& backtracking() const noexcept { return backtrackStates_; } // }}} //! Flags given state as accepting-state with given Tag @p acceptTag. void setAccept(StateId state, Tag acceptTag) { acceptTags_[state] = acceptTag; } - bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } + [[nodiscard]] bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } - std::optional acceptTag(StateId s) const + [[nodiscard]] std::optional acceptTag(StateId s) const { if (auto i = acceptTags_.find(s); i != acceptTags_.end()) return i->second; @@ -121,7 +121,7 @@ class DFA return std::nullopt; } - std::optional delta(StateId state, Symbol symbol) const + [[nodiscard]] std::optional delta(StateId state, Symbol symbol) const { const auto& T = states_[state].transitions; if (auto i = T.find(symbol); i != T.end()) @@ -133,7 +133,7 @@ class DFA void setTransition(StateId from, Symbol symbol, StateId to); void removeTransition(StateId from, Symbol symbol); - StateIdVec nonAcceptStates() const + [[nodiscard]] StateIdVec nonAcceptStates() const { StateIdVec result; result.reserve( @@ -146,9 +146,9 @@ class DFA return result; } - bool isAcceptor(Tag t) const + 
[[nodiscard]] bool isAcceptor(Tag t) const { - for (const std::pair& p: acceptTags_) + for (std::pair p: acceptTags_) if (p.second == t) return true; diff --git a/src/regex_dfa/DFABuilder.cpp b/src/regex_dfa/DFABuilder.cpp index 1c2ef725cf..aa8fd393ff 100644 --- a/src/regex_dfa/DFABuilder.cpp +++ b/src/regex_dfa/DFABuilder.cpp @@ -93,7 +93,7 @@ DFA DFABuilder::construct(OvershadowMap* overshadows) while (!workList.empty()) { const StateIdVec q = - move(workList.front()); // each set q represents a valid configuration from the NFA + std::move(workList.front()); // each set q represents a valid configuration from the NFA workList.pop_front(); const StateId q_i = *configurationNumber(Q, q); @@ -109,7 +109,7 @@ DFA DFABuilder::construct(OvershadowMap* overshadows) Q.emplace_back(eclosure); t_i = StateId { Q.size() - 1 }; // equal to configurationNumber(Q, eclosure); T.insert(q_i, c, *t_i); // T[q][c] = eclosure; - workList.emplace_back(move(eclosure)); + workList.emplace_back(std::move(eclosure)); } eclosure.clear(); } @@ -166,7 +166,7 @@ DFA DFABuilder::constructDFA(const vector& Q, // observe mapping from q_i to d_i for (auto const& [q_i, branch]: T.transitions) - for (auto const [c, t_i]: branch) + for (auto&& [c, t_i]: branch) dfa.setTransition(q_i, c, t_i); // q_0 becomes d_0 (initial state) diff --git a/src/regex_dfa/DFABuilder.h b/src/regex_dfa/DFABuilder.h index 6f3eb6138e..0cbaf5adeb 100644 --- a/src/regex_dfa/DFABuilder.h +++ b/src/regex_dfa/DFABuilder.h @@ -32,19 +32,20 @@ class DFABuilder * @param overshadows if not nullptr, it will be used to store semantic information about * which rule tags have been overshadowed by which. 
*/ - DFA construct(OvershadowMap* overshadows = nullptr); + [[nodiscard]] DFA construct(OvershadowMap* overshadows = nullptr); private: struct TransitionTable; - DFA constructDFA(const std::vector& Q, - const TransitionTable& T, - OvershadowMap* overshadows) const; + [[nodiscard]] DFA constructDFA(const std::vector& Q, + const TransitionTable& T, + OvershadowMap* overshadows) const; /** * Finds @p t in @p Q and returns its offset (aka configuration number) or -1 if not found. */ - static std::optional configurationNumber(const std::vector& Q, const StateIdVec& t); + [[nodiscard]] static std::optional configurationNumber(const std::vector& Q, + const StateIdVec& t); /** * Determines the tag to use for the deterministic state representing @p q from non-deterministic FA @p @@ -54,7 +55,7 @@ class DFABuilder * * @returns the determined tag or std::nullopt if none */ - std::optional determineTag(const StateIdVec& q, std::map* overshadows) const; + [[nodiscard]] std::optional determineTag(const StateIdVec& q, std::map* overshadows) const; private: const NFA nfa_; diff --git a/src/regex_dfa/DFABuilder_test.cpp b/src/regex_dfa/DFABuilder_test.cpp index 5ff7a6a9a1..86a9613e0d 100644 --- a/src/regex_dfa/DFABuilder_test.cpp +++ b/src/regex_dfa/DFABuilder_test.cpp @@ -10,14 +10,14 @@ #include #include +#include + #include #include -#include - using namespace regex_dfa; -TEST(regex_DFABuilder, shadowing) +TEST_CASE("regex_DFABuilder.shadowing") { Compiler cc; cc.parse(std::make_unique(R"( @@ -27,7 +27,7 @@ TEST(regex_DFABuilder, shadowing) // rule 2 is overshadowed by rule 1 Compiler::OvershadowMap overshadows; DFA dfa = cc.compileDFA(&overshadows); - ASSERT_EQ(1, overshadows.size()); - EXPECT_EQ(2, overshadows[0].first); // overshadowee - EXPECT_EQ(1, overshadows[0].second); // overshadower + REQUIRE(1 == overshadows.size()); + CHECK(2 == overshadows[0].first); // overshadowee + CHECK(1 == overshadows[0].second); // overshadower } diff --git a/src/regex_dfa/DFAMinimizer.cpp 
b/src/regex_dfa/DFAMinimizer.cpp index f2c37af674..9c5a58e53d 100644 --- a/src/regex_dfa/DFAMinimizer.cpp +++ b/src/regex_dfa/DFAMinimizer.cpp @@ -101,9 +101,9 @@ DFAMinimizer::PartitionVec DFAMinimizer::split(const StateIdVec& S) const { DEBUG("split: {} on character '{}' into {} sets", to_string(S), (char) c, t_i.size()); PartitionVec result; - for (const pair& t: t_i) + for (auto&& t: t_i) { - result.emplace_back(move(t.second)); + result.emplace_back(std::move(t.second)); DEBUG(" partition {}: {}", t.first, t.second); } return result; @@ -125,7 +125,7 @@ DFAMinimizer::PartitionVec DFAMinimizer::split(const StateIdVec& S) const main.emplace_back(s); if (!main.empty()) - result.emplace_back(move(main)); + result.emplace_back(std::move(main)); } } @@ -136,7 +136,7 @@ DFAMinimizer::PartitionVec DFAMinimizer::split(const StateIdVec& S) const void DFAMinimizer::dumpGroups(const PartitionVec& T) { DEBUG("dumping groups ({})", T.size()); - int groupNr = 0; + [[maybe_unused]] int groupNr = 0; for (const auto& t: T) { stringstream sstr; @@ -176,7 +176,7 @@ MultiDFA DFAMinimizer::constructMultiDFA() dfamin.setTransition(dfamin.initialState(), static_cast(t), t); } - return MultiDFA { move(initialStates), move(dfamin) }; + return MultiDFA { std::move(initialStates), std::move(dfamin) }; } void DFAMinimizer::constructPartitions() @@ -253,9 +253,9 @@ DFA DFAMinimizer::constructFromPartitions(const PartitionVec& P) const for (const StateIdVec& p: P) { const StateId s = *p.begin(); - for (const pair& transition: dfa_.stateTransitions(s)) + for (pair const transition: dfa_.stateTransitions(s)) { - const int t_i = partitionId(transition.second); + auto const t_i = partitionId(transition.second); DEBUG("map p{} --({})--> p{}", p_i, prettySymbol(transition.first), t_i); dfamin.setTransition(p_i, transition.first, t_i); } diff --git a/src/regex_dfa/DFAMinimizer.h b/src/regex_dfa/DFAMinimizer.h index 40647044c3..0f30d06267 100644 --- a/src/regex_dfa/DFAMinimizer.h +++ 
b/src/regex_dfa/DFAMinimizer.h @@ -35,18 +35,18 @@ class DFAMinimizer using PartitionVec = std::list; void constructPartitions(); - StateIdVec nonAcceptStates() const; - bool containsInitialState(const StateIdVec& S) const; - bool isMultiInitialState(StateId s) const; - PartitionVec::iterator findGroup(StateId s); - int partitionId(StateId s) const; - PartitionVec split(const StateIdVec& S) const; - DFA constructFromPartitions(const PartitionVec& P) const; - std::optional containsBacktrackState(const StateIdVec& Q) const; + [[nodiscard]] StateIdVec nonAcceptStates() const; + [[nodiscard]] bool containsInitialState(const StateIdVec& S) const; + [[nodiscard]] bool isMultiInitialState(StateId s) const; + [[nodiscard]] PartitionVec::iterator findGroup(StateId s); + [[nodiscard]] int partitionId(StateId s) const; + [[nodiscard]] PartitionVec split(const StateIdVec& S) const; + [[nodiscard]] DFA constructFromPartitions(const PartitionVec& P) const; + [[nodiscard]] std::optional containsBacktrackState(const StateIdVec& Q) const; static void dumpGroups(const PartitionVec& T); - StateId targetStateId(StateId oldId) const + [[nodiscard]] StateId targetStateId(StateId oldId) const { auto i = targetStateIdMap_.find(oldId); assert(i != targetStateIdMap_.end()); diff --git a/src/regex_dfa/DotVisitor.h b/src/regex_dfa/DotVisitor.h index 6eb4a62cc1..303dec8373 100644 --- a/src/regex_dfa/DotVisitor.h +++ b/src/regex_dfa/DotVisitor.h @@ -17,7 +17,7 @@ namespace regex_dfa class DotVisitor { public: - virtual ~DotVisitor() {} + virtual ~DotVisitor() = default; virtual void start(StateId initialState) = 0; virtual void visitNode(StateId number, bool start, bool accept) = 0; diff --git a/src/regex_dfa/DotWriter.cpp b/src/regex_dfa/DotWriter.cpp index c5883a93ea..36d98e4681 100644 --- a/src/regex_dfa/DotWriter.cpp +++ b/src/regex_dfa/DotWriter.cpp @@ -65,7 +65,7 @@ void DotWriter::visitNode(StateId number, bool start, bool accept) } } -void DotWriter::visitEdge(StateId from, StateId to, 
Symbol s) +void DotWriter::visitEdge(StateId /*from*/, StateId to, Symbol s) { transitionGroups_[to].push_back(s); } @@ -95,7 +95,7 @@ void DotWriter::endVisitEdge(StateId from, StateId to) } else { - string label = groupCharacterClassRanges(move(tgroup)); + string label = groupCharacterClassRanges(std::move(tgroup)); stream_ << fmt::format(" {}{} -> {}{} [label=\"{}\"];\n", stateLabelPrefix_, from, diff --git a/src/regex_dfa/DotWriter_test.cpp b/src/regex_dfa/DotWriter_test.cpp index 78b24eff74..4c659a1cfa 100644 --- a/src/regex_dfa/DotWriter_test.cpp +++ b/src/regex_dfa/DotWriter_test.cpp @@ -7,14 +7,14 @@ #include -#include +#include -#include +#include using namespace std; using namespace regex_dfa; -TEST(regex_DotWriter, simple) +TEST_CASE("regex_DotWriter.simple") { stringstream sstr; DotWriter dw(sstr, "n"); @@ -33,12 +33,11 @@ TEST(regex_DotWriter, simple) dw.endVisitEdge(1, 1); dw.end(); - log(sstr.str()); - ASSERT_TRUE(!sstr.str().empty()); + REQUIRE(!sstr.str().empty()); // just make sure it processes } -TEST(regex_DotWriter, multidfa_simple) +TEST_CASE("regex_DotWriter.multidfa_simple") { stringstream sstr; const MultiDFA::InitialStateMap mis { { "foo", 1 }, { "bar", 2 } }; @@ -63,7 +62,6 @@ TEST(regex_DotWriter, multidfa_simple) dw.end(); - log(sstr.str()); - ASSERT_TRUE(!sstr.str().empty()); + REQUIRE(!sstr.str().empty()); // just make sure it processes } diff --git a/src/regex_dfa/Lexable.h b/src/regex_dfa/Lexable.h index f9988522df..6524896b81 100644 --- a/src/regex_dfa/Lexable.h +++ b/src/regex_dfa/Lexable.h @@ -29,9 +29,9 @@ namespace regex_dfa //! Runtime exception that is getting thrown when a word could not be recognized. 
struct LexerError: public std::runtime_error { - explicit LexerError(unsigned int _offset): - std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", _offset) }, - offset { _offset } + explicit LexerError(unsigned int offset): + std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", offset) }, + offset { offset } { } @@ -72,7 +72,7 @@ class LexerIterator /** * Retrieves the default DFA machine that is used to recognize words. */ - Machine defaultMachine() const noexcept; + [[nodiscard]] Machine defaultMachine() const noexcept; /** * Sets the active deterministic finite automaton to use for recognizing words. @@ -82,29 +82,29 @@ class LexerIterator */ Machine setMachine(Machine machine); - const TokenInfo& operator*() const noexcept { return currentToken_; } - auto offset() const noexcept { return currentToken_.offset; } - auto literal() const noexcept -> const std::string& { return currentToken_.literal; } - auto token() const noexcept { return currentToken_.token; } - auto name() const noexcept { return name(token()); } + [[nodiscard]] const TokenInfo& operator*() const noexcept { return currentToken_; } + [[nodiscard]] auto offset() const noexcept { return currentToken_.offset; } + [[nodiscard]] auto literal() const noexcept -> const std::string& { return currentToken_.literal; } + [[nodiscard]] auto token() const noexcept { return currentToken_.token; } + [[nodiscard]] auto name() const noexcept { return name(token()); } - bool operator==(const LexerIterator& rhs) const noexcept; - bool operator!=(const LexerIterator& rhs) const noexcept; + [[nodiscard]] bool operator==(const LexerIterator& rhs) const noexcept; + [[nodiscard]] bool operator!=(const LexerIterator& rhs) const noexcept; LexerIterator& operator++(); LexerIterator& operator++(int); private: void recognize(); - Token recognizeOne(); + [[nodiscard]] Token recognizeOne(); // --------------------------------------------------------------------------------- 
// state helpers static constexpr StateId BadState = std::numeric_limits::max(); - StateId getInitialState() const noexcept; - bool isAcceptState(StateId state) const; + [[nodiscard]] StateId getInitialState() const noexcept; + [[nodiscard]] bool isAcceptState(StateId state) const; /** * Retrieves the next state for given input state and input symbol. @@ -114,13 +114,13 @@ class LexerIterator * state. * @returns the next state to transition to. */ - StateId delta(StateId currentState, Symbol inputSymbol) const; + [[nodiscard]] StateId delta(StateId currentState, Symbol inputSymbol) const; // --------------------------------------------------------------------------------- // stream helpers - int currentChar() const noexcept { return currentChar_; } - bool eof() const noexcept { return !source_->good(); } + [[nodiscard]] int currentChar() const noexcept { return currentChar_; } + [[nodiscard]] bool eof() const noexcept { return !source_->good(); } Symbol nextChar(); void rollback(); @@ -128,13 +128,13 @@ class LexerIterator // debugging helpers template - void tracef(const char* msg, Args&&... args) const; + void tracef(fmt::format_string msg, Args&&... 
args) const; - const std::string& name(Token t) const; + [[nodiscard]] const std::string& name(Token t) const; - std::string toString(const std::deque& stack); - Token token(StateId s) const; - static std::string stateName(StateId s); + [[nodiscard]] std::string toString(const std::deque& stack); + [[nodiscard]] Token token(StateId s) const; + [[nodiscard]] static std::string stateName(StateId s); private: const LexerDef* def_ = nullptr; @@ -243,7 +243,7 @@ template ::LexerIterator(const LexerDef& ld, std::istream& source, TraceFn trace): - def_ { &ld }, trace_ { trace }, source_ { &source } + def_ { &ld }, trace_ { std::move(trace) }, source_ { &source } { recognize(); } @@ -322,7 +322,7 @@ inline Token LexerIterator::recogniz stack.push_back(BadState); if constexpr (Trace) - tracef("recognize: startState {}, offset {} {}", + tracef("recognizeOne: startState {}, offset {} {}", stateName(state), offset_, isBeginOfLine_ ? "BOL" : "no-BOL"); @@ -344,7 +344,7 @@ inline Token LexerIterator::recogniz while (state != BadState && !isAcceptState(state)) { if constexpr (Trace) - tracef("recognize: backtrack: current state {} {}; stack: {}", + tracef("recognizeOne: backtrack: current state {} {}; stack: {}", stateName(state), isAcceptState(state) ? 
"accepting" : "non-accepting", toString(stack)); @@ -391,7 +391,7 @@ inline Token LexerIterator::recogniz currentToken_.offset, offset_, quotedString(currentToken_.literal), - quoted(currentChar_)); + prettySymbol(currentChar_)); if (!isAcceptState(state)) throw LexerError { offset_ }; @@ -451,7 +451,7 @@ inline Symbol LexerIterator::nextCha currentChar_ = ch; buffered_.resize(buffered_.size() - 1); if constexpr (Trace) - tracef("Lexer:{}: advance '{}'", offset_, prettySymbol(ch)); + tracef("Lexer:{}: advance (buffered) '{}'", offset_, prettySymbol(ch)); offset_++; return ch; } @@ -459,7 +459,7 @@ inline Symbol LexerIterator::nextCha if (!source_->good()) { // EOF or I/O error if constexpr (Trace) - tracef("Lexer:{}: advance '{}'", offset_, "EOF"); + tracef("Lexer:{}: advance '<<{}>>'", offset_, "EOF"); return Symbols::EndOfFile; } @@ -487,7 +487,8 @@ inline void LexerIterator::rollback( if (currentToken_.literal.back() != -1) { offset_--; - buffered_.push_back(currentToken_.literal.back()); + buffered_.push_back(static_cast(static_cast(currentToken_.literal.back()))); + tracef("Lexer:{}: rollback '{}'", offset_, prettySymbol(buffered_.back())); } } @@ -495,7 +496,7 @@ inline void LexerIterator::rollback( template template -inline void LexerIterator::tracef(const char* msg, +inline void LexerIterator::tracef(fmt::format_string msg, Args&&... 
args) const { if constexpr (Trace) @@ -584,7 +585,7 @@ struct formatter constexpr auto format(const LexerIterator& v, FormatContext& ctx) { - return format_to(ctx.out(), "{} ({})", v.literal(), v.name()); + return fmt::format_to(ctx.out(), "{} ({})", v.literal(), v.name()); } }; } // namespace fmt diff --git a/src/regex_dfa/Lexer-inl.h b/src/regex_dfa/Lexer-inl.h index fbc1521c99..25b2aaae11 100644 --- a/src/regex_dfa/Lexer-inl.h +++ b/src/regex_dfa/Lexer-inl.h @@ -17,17 +17,6 @@ namespace regex_dfa { -static inline std::string quoted(char ch) -{ - if (ch < 0) - return "<>"; - if (ch == '\n') - return "\\n"; - if (ch == ' ') - return "\\s"; - return fmt::format("{}", ch); -} - static inline std::string quotedString(const std::string& s) { std::stringstream sstr; @@ -115,8 +104,7 @@ inline size_t Lexer::getFileSize() } template -inline std::string Lexer::stateName(StateId s, - const std::string_view& n) +inline std::string Lexer::stateName(StateId s, std::string_view n) { switch (s) { @@ -247,7 +235,7 @@ inline Token Lexer::recognizeOne() oldOffset_, offset_, quotedString(word_), - quoted(currentChar_)); + prettySymbol(currentChar_)); if (!isAcceptState(state)) throw LexerError { offset_ }; @@ -286,9 +274,9 @@ inline StateId Lexer::delta(StateId } template -inline bool Lexer::isAcceptState(StateId id) const +inline bool Lexer::isAcceptState(StateId state) const noexcept { - return def_.acceptStates.find(id) != def_.acceptStates.end(); + return def_.acceptStates.find(state) != def_.acceptStates.end(); } template diff --git a/src/regex_dfa/Lexer.h b/src/regex_dfa/Lexer.h index db75097717..769ce34cad 100644 --- a/src/regex_dfa/Lexer.h +++ b/src/regex_dfa/Lexer.h @@ -39,25 +39,25 @@ struct TokenInfo }; template -inline Token token(const TokenInfo& it) +[[nodiscard]] inline Token token(const TokenInfo& it) { return it.token; } template -inline size_t offset(const TokenInfo& it) +[[nodiscard]] inline size_t offset(const TokenInfo& it) { return it.offset; } template 
-inline const std::string& literal(const TokenInfo& it) +[[nodiscard]] inline const std::string& literal(const TokenInfo& it) { return it.literal; } template -inline const std::string& to_string(const TokenInfo& info) noexcept +[[nodiscard]] inline const std::string& to_string(const TokenInfo& info) noexcept { return info.literal; } @@ -97,27 +97,30 @@ class Lexer /** * Recognizes one token (ignored patterns are skipped). */ - TokenInfo recognize(); + [[nodiscard]] TokenInfo recognize(); /** * Recognizes one token, regardless of it is to be ignored or not. */ - Token recognizeOne(); + [[nodiscard]] Token recognizeOne(); //! the underlying word of the currently recognized token - const std::string& word() const { return word_; } + [[nodiscard]] const std::string& word() const { return word_; } //! @returns the absolute offset of the file the lexer is currently reading from. - std::pair offset() const noexcept { return std::make_pair(oldOffset_, offset_); } + [[nodiscard]] std::pair offset() const noexcept + { + return std::make_pair(oldOffset_, offset_); + } //! @returns the last recognized token. - Token token() const noexcept { return token_; } + [[nodiscard]] Token token() const noexcept { return token_; } //! @returns the name of the current token. - const std::string& name() const { return name(token_); } + [[nodiscard]] const std::string& name() const { return name(token_); } //! @returns the name of the token represented by Token @p t. - const std::string& name(Token t) const + [[nodiscard]] const std::string& name(Token t) const { auto i = def_.tagNames.find(static_cast(t)); assert(i != def_.tagNames.end()); @@ -132,7 +135,7 @@ class Lexer * state. * @returns the next state to transition to. */ - inline StateId delta(StateId currentState, Symbol inputSymbol) const; + [[nodiscard]] inline StateId delta(StateId currentState, Symbol inputSymbol) const; /** * Sets the active deterministic finite automaton to use for recognizing words. 
@@ -141,14 +144,16 @@ class Lexer */ Machine setMachine(Machine machine) { + auto const oldMachine = initialStateId_; // since Machine is a 1:1 mapping into the State's ID, we can simply cast here. initialStateId_ = static_cast(machine); + return oldMachine; } /** * Retrieves the default DFA machine that is used to recognize words. */ - Machine defaultMachine() const + [[nodiscard]] Machine defaultMachine() const { auto i = def_.initialStates.find("INITIAL"); assert(i != def_.initialStates.end()); @@ -160,16 +165,16 @@ class Lexer */ struct LexerError: public std::runtime_error { - LexerError(unsigned int _offset): - std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", _offset) }, - offset { _offset } + LexerError(unsigned int offset): + std::runtime_error { fmt::format("[{}] Failed to lexically recognize a word.", offset) }, + offset { offset } { } unsigned int offset; }; - struct iterator + struct iterator // NOLINT(readability-identifier-naming) { Lexer& lx; int end; @@ -200,9 +205,9 @@ class Lexer iterator end() { return iterator { *this, 2, TokenInfo { 0, "" } }; } - bool eof() const { return !stream_->good(); } + [[nodiscard]] bool eof() const { return !stream_->good(); } - size_t fileSize() const noexcept { return fileSize_; } + [[nodiscard]] size_t fileSize() const noexcept { return fileSize_; } private: template @@ -213,24 +218,24 @@ class Lexer debug_(fmt::format(msg, args...)); } - Symbol nextChar(); + [[nodiscard]] Symbol nextChar(); void rollback(); - StateId getInitialState() const noexcept; - bool isAcceptState(StateId state) const; - static std::string stateName(StateId s, const std::string_view& n = "n"); + [[nodiscard]] StateId getInitialState() const noexcept; + [[nodiscard]] bool isAcceptState(StateId state) const noexcept; + [[nodiscard]] static std::string stateName(StateId s, std::string_view n = "n"); static constexpr StateId BadState = 101010; - std::string toString(const std::deque& stack); + [[nodiscard]] 
std::string toString(const std::deque& stack); - int currentChar() const noexcept { return currentChar_; } + [[nodiscard]] int currentChar() const noexcept { return currentChar_; } - Token token(StateId s) const + [[nodiscard]] Token token(StateId s) const { auto i = def_.acceptStates.find(s); assert(i != def_.acceptStates.end()); return static_cast(i->second); } - size_t getFileSize(); + [[nodiscard]] size_t getFileSize(); private: const LexerDef& def_; @@ -277,7 +282,7 @@ struct formatter> template constexpr auto format(const TokenInfo& v, FormatContext& ctx) { - return format_to(ctx.out(), "{}", v.literal); + return fmt::format_to(ctx.out(), "{}", v.literal); } }; } // namespace fmt diff --git a/src/regex_dfa/LexerDef.h b/src/regex_dfa/LexerDef.h index 90a1c01803..3a774827c2 100644 --- a/src/regex_dfa/LexerDef.h +++ b/src/regex_dfa/LexerDef.h @@ -34,11 +34,11 @@ struct LexerDef BacktrackingMap backtrackingStates; std::map tagNames; - std::string to_string() const; + [[nodiscard]] std::string to_string() const; - bool isValidTag(Tag t) const noexcept { return tagNames.find(t) != tagNames.end(); } + [[nodiscard]] bool isValidTag(Tag t) const noexcept { return tagNames.find(t) != tagNames.end(); } - std::string tagName(Tag t) const + [[nodiscard]] std::string tagName(Tag t) const { auto i = tagNames.find(t); assert(i != tagNames.end()); diff --git a/src/regex_dfa/Lexer_test.cpp b/src/regex_dfa/Lexer_test.cpp index 41b4624355..1bc732eb28 100644 --- a/src/regex_dfa/Lexer_test.cpp +++ b/src/regex_dfa/Lexer_test.cpp @@ -10,9 +10,9 @@ #include #include #include +#include -#include -#include +#include using namespace std; using namespace regex_dfa; @@ -49,12 +49,13 @@ enum class LookaheadToken { Eof = 1, ABBA, - AB_CD, + AB_CD, // NOLINT(readability-identifier-naming) CD, CDEF, - EOL_LF, + EOL_LF, // NOLINT(readability-identifier-naming) XAnyLine }; + namespace fmt { // it sucks that I've to specify that here template <> @@ -71,84 +72,84 @@ struct formatter { switch (v) 
{ - case LookaheadToken::Eof: return format_to(ctx.out(), "Eof"); - case LookaheadToken::ABBA: return format_to(ctx.out(), "abba"); - case LookaheadToken::AB_CD: return format_to(ctx.out(), "ab/cd"); - case LookaheadToken::CD: return format_to(ctx.out(), "cd"); - case LookaheadToken::CDEF: return format_to(ctx.out(), "cdef"); - case LookaheadToken::EOL_LF: return format_to(ctx.out(), "eol$"); - case LookaheadToken::XAnyLine: return format_to(ctx.out(), ""); - default: return format_to(ctx.out(), "<{}>", static_cast(v)); + case LookaheadToken::Eof: return fmt::format_to(ctx.out(), "Eof"); + case LookaheadToken::ABBA: return fmt::format_to(ctx.out(), "abba"); + case LookaheadToken::AB_CD: return fmt::format_to(ctx.out(), "ab/cd"); + case LookaheadToken::CD: return fmt::format_to(ctx.out(), "cd"); + case LookaheadToken::CDEF: return fmt::format_to(ctx.out(), "cdef"); + case LookaheadToken::EOL_LF: return fmt::format_to(ctx.out(), "eol$"); + case LookaheadToken::XAnyLine: return fmt::format_to(ctx.out(), ""); + default: return fmt::format_to(ctx.out(), "<{}>", static_cast(v)); } } }; } // namespace fmt -TEST(regex_Lexer, lookahead) +TEST_CASE("regex_Lexer.lookahead") { Compiler cc; cc.parse(RULES); - const LexerDef lexerDef = cc.compile(); - logf("LexerDef:\n{}", lexerDef.to_string()); - Lexable ls { lexerDef, "abba abcdef", [this](const string& msg) { - log(msg); + LexerDef const lexerDef = cc.compile(); + CAPTURE(lexerDef.to_string()); + Lexable ls { lexerDef, "abba abcdef", [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(LookaheadToken::ABBA, *lexer); - ASSERT_EQ(LookaheadToken::AB_CD, *++lexer); - ASSERT_EQ(LookaheadToken::CDEF, *++lexer); - ASSERT_EQ(LookaheadToken::Eof, *++lexer); - ASSERT_EQ(end(ls), ++lexer); + REQUIRE(LookaheadToken::ABBA == *lexer); + REQUIRE(LookaheadToken::AB_CD == *++lexer); + REQUIRE(LookaheadToken::CDEF == *++lexer); + REQUIRE(LookaheadToken::Eof == *++lexer); + REQUIRE(end(ls) == ++lexer); } 
-TEST(regex_Lexable, one) +TEST_CASE("regex_Lexable.one") { Compiler cc; cc.parse(RULES); - const LexerDef ld = cc.compile(); - logf("LexerDef:\n{}", ld.to_string()); + LexerDef const ld = cc.compile(); + CAPTURE(ld.to_string()); auto src = Lexable { ld, make_unique("abba abcdef"), - [this](const string& msg) { - log(msg); + [](const string& msg) { + INFO(msg); } }; auto lexer = begin(src); auto eof = end(src); - ASSERT_TRUE(lexer != eof); - EXPECT_EQ(LookaheadToken::ABBA, token(lexer)); - EXPECT_EQ(0, offset(lexer)); + REQUIRE(lexer != eof); + CHECK(LookaheadToken::ABBA == token(lexer)); + CHECK(0 == offset(lexer)); ++lexer; - EXPECT_EQ(LookaheadToken::AB_CD, token(lexer)); - EXPECT_EQ(5, offset(lexer)); + CHECK(LookaheadToken::AB_CD == token(lexer)); + CHECK(5 == offset(lexer)); ++lexer; - EXPECT_EQ(LookaheadToken::CDEF, token(lexer)); - EXPECT_EQ(7, offset(lexer)); + CHECK(LookaheadToken::CDEF == token(lexer)); + CHECK(7 == offset(lexer)); ++lexer; - EXPECT_EQ(LookaheadToken::Eof, token(lexer)); - EXPECT_EQ(11, offset(lexer)); + CHECK(LookaheadToken::Eof == token(lexer)); + CHECK(11 == offset(lexer)); ++lexer; - ASSERT_FALSE(lexer != eof); // TODO: make that work + REQUIRE(!(lexer != eof)); // TODO: make that work } -TEST(regex_Lexer, LexerError) +TEST_CASE("regex_Lexer.LexerError") { Compiler cc; cc.parse(RULES); const LexerDef ld = cc.compile(); Lexable ls { ld, "invalid" }; - EXPECT_THROW(begin(ls), LexerError); + CHECK_THROWS_AS((void) begin(ls), LexerError); } -TEST(regex_Lexer, evaluateDotToken) +TEST_CASE("regex_Lexer.evaluateDotToken") { Compiler cc; cc.parse(RULES); @@ -157,179 +158,180 @@ TEST(regex_Lexer, evaluateDotToken) Lexable ls { ld, "xanything" }; auto lexer = begin(ls); - ASSERT_EQ(LookaheadToken::XAnyLine, *lexer); - ASSERT_EQ(LookaheadToken::Eof, *++lexer); + REQUIRE(LookaheadToken::XAnyLine == *lexer); + REQUIRE(LookaheadToken::Eof == *++lexer); } -TEST(regex_Lexer, match_eol) +TEST_CASE("regex_Lexer.match_eol") { Compiler cc; 
cc.parse(RULES); LexerDef ld = cc.compile(); - Lexable ls { ld, "abba eol\nabba", [this](const string& msg) { - log(msg); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); + Lexable ls { ld, "abba eol\nabba", [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(LookaheadToken::ABBA, *lexer); - EXPECT_EQ(0, offset(lexer)); + REQUIRE(LookaheadToken::ABBA == *lexer); + CHECK(0 == offset(lexer)); - ASSERT_EQ(LookaheadToken::EOL_LF, *++lexer); - EXPECT_EQ(5, offset(lexer)); + REQUIRE(LookaheadToken::EOL_LF == *++lexer); + CHECK(5 == offset(lexer)); - ASSERT_EQ(LookaheadToken::ABBA, *++lexer); - EXPECT_EQ(9, offset(lexer)); + REQUIRE(LookaheadToken::ABBA == *++lexer); + CHECK(9 == offset(lexer)); - ASSERT_EQ(LookaheadToken::Eof, *++lexer); + REQUIRE(LookaheadToken::Eof == *++lexer); } -TEST(regex_Lexer, bol) +TEST_CASE("regex_Lexer.bol") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Pragma ::= ^pragma - |Test ::= test - |Unknown ::= . - |Eof ::= <> - |)"_multiline); + |Pragma ::= ^pragma + |Test ::= test + |Unknown ::= . + |Eof ::= <> + |)"_multiline); LexerDef ld = cc.compileMulti(); - Lexable ls { ld, "pragma", [this](const string& msg) { - log(msg); + Lexable ls { ld, "pragma", [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(1, *lexer); // ^pragma - ASSERT_EQ(4, *++lexer); // EOS + REQUIRE(1 == *lexer); // ^pragma + REQUIRE(4 == *++lexer); // EOS } -TEST(regex_Lexer, bol_no_match) +TEST_CASE("regex_Lexer.bol_no_match") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Pragma ::= ^pragma - |Test ::= test - |Unknown ::= . - |Eof ::= <> - |)"_multiline); + |Pragma ::= ^pragma + |Test ::= test + |Unknown ::= . 
+ |Eof ::= <> + |)"_multiline); LexerDef ld = cc.compileMulti(); - logf("LexerDef:\n{}", ld.to_string()); - Lexable ls { ld, "test pragma", [this](const string& msg) { - log(msg); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); + Lexable ls { ld, "test pragma", [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(2, *lexer); // test + REQUIRE(2 == *lexer); // test // pragma (char-wise) - must not be recognized as ^pragma - ASSERT_EQ(3, *++lexer); - ASSERT_EQ(3, *++lexer); - ASSERT_EQ(3, *++lexer); - ASSERT_EQ(3, *++lexer); - ASSERT_EQ(3, *++lexer); - ASSERT_EQ(3, *++lexer); - - ASSERT_EQ(4, *++lexer); // EOS + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + REQUIRE(3 == *++lexer); + + REQUIRE(4 == *++lexer); // EOS } -TEST(regex_Lexer, bol_line2) +TEST_CASE("regex_Lexer.bol_line2") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Pragma ::= ^pragma - |Test ::= test - |Eof ::= <> - |)"_multiline); + |Pragma ::= ^pragma + |Test ::= test + |Eof ::= <> + |)"_multiline); LexerDef ld = cc.compileMulti(); - logf("LexerDef:\n{}", ld.to_string()); - Lexable ls { ld, "test\npragma", [this](const string& msg) { - log(msg); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); + Lexable ls { ld, "test\npragma", [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(2, *lexer); // test - ASSERT_EQ(1, *++lexer); // ^pragma - ASSERT_EQ(3, *++lexer); // EOS + REQUIRE(2 == *lexer); // test + REQUIRE(1 == *++lexer); // ^pragma + REQUIRE(3 == *++lexer); // EOS } -TEST(regex_Lexer, bol_and_other_conditions) +TEST_CASE("regex_Lexer.bol_and_other_conditions") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Pragma ::= ^pragma - |Test ::= test - |Eof ::= <> - |Jump ::= jmp)"_multiline); + |Pragma ::= ^pragma + |Test ::= test + |Eof ::= <> + |Jump ::= jmp)"_multiline); LexerDef ld = cc.compileMulti(); - logf("LexerDef:\n{}", 
ld.to_string()); + INFO(fmt::format("LexerDef:\n{}", ld.to_string())); - Lexable ls { ld, "pragma test", [this](const string& msg) { - log(msg); + Lexable ls { ld, "pragma test", [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(1, *lexer); // ^pragma - ASSERT_EQ(2, *++lexer); // test - ASSERT_EQ(3, *++lexer); // <> + REQUIRE(1 == *lexer); // ^pragma + REQUIRE(2 == *++lexer); // test + REQUIRE(3 == *++lexer); // <> } -TEST(regex_Lexer, bol_rules_on_non_bol_lexer) +TEST_CASE("regex_Lexer.bol_rules_on_non_bol_lexer") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Eof ::= <> - |Test ::= "test" - |Pragma ::= ^"pragma" - |Unknown ::= . - |)"_multiline); + |Eof ::= <> + |Test ::= "test" + |Pragma ::= ^"pragma" + |Unknown ::= . + |)"_multiline); LexerDef ld = cc.compile(); using SimpleLexer = Lexable; - ASSERT_THROW(SimpleLexer(ld, "pragma"), invalid_argument); + CHECK_THROWS_AS(SimpleLexer(ld, "pragma"), std::invalid_argument); } -TEST(regex_Lexer, non_bol_rules_on_non_bol_lexer) +TEST_CASE("regex_Lexer.non_bol_rules_on_non_bol_lexer") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Eof ::= <> - |Test ::= "test" - |Unknown ::= . - |)"_multiline); + |Eof ::= <> + |Test ::= "test" + |Unknown ::= . + |)"_multiline); LexerDef ld = cc.compile(); Lexable ls { ld, " test " }; auto lexer = begin(ls); - ASSERT_EQ(2, *lexer); // "test" - ASSERT_EQ(1, *++lexer); // <> + REQUIRE(2 == *lexer); // "test" + REQUIRE(1 == *++lexer); // <> } -TEST(regex_Lexer, non_bol_rules_on_bol_lexer) +TEST_CASE("regex_Lexer.non_bol_rules_on_bol_lexer") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Eof ::= <> - |Test ::= "test" - |Unknown ::= . - |)"_multiline); + |Eof ::= <> + |Test ::= "test" + |Unknown ::= . 
+ |)"_multiline); LexerDef ld = cc.compile(); Lexable ls { ld, " test " }; auto lexer = begin(ls); - ASSERT_EQ(2, *lexer); // "test" - ASSERT_EQ(1, *++lexer); // <> + REQUIRE(2 == *lexer); // "test" + REQUIRE(1 == *++lexer); // <> } -TEST(regex_Lexer, iterator) +TEST_CASE("regex_Lexer.iterator") { Compiler cc; cc.parse(make_unique(R"( - Spacing(ignore) ::= [\s\t\n]+ - A ::= a - B ::= b - Eof ::= <> - )")); + Spacing(ignore) ::= [\s\t\n]+ + A ::= a + B ::= b + Eof ::= <> + )")); auto const ld = cc.compile(); auto const ls = Lexable { ld, make_unique("a b b a") }; @@ -337,63 +339,63 @@ TEST(regex_Lexer, iterator) auto i = ls.begin(); // a - ASSERT_EQ(1, *i); - ASSERT_TRUE(i != e); + REQUIRE(1 == *i); + REQUIRE(i != e); // b i++; - ASSERT_EQ(2, *i); - ASSERT_TRUE(i != e); + REQUIRE(2 == *i); + REQUIRE(i != e); // b i++; - ASSERT_EQ(2, *i); - ASSERT_TRUE(i != e); + REQUIRE(2 == *i); + REQUIRE(i != e); // a i++; - ASSERT_EQ(1, *i); - ASSERT_TRUE(i != e); + REQUIRE(1 == *i); + REQUIRE(i != e); // <> i++; - ASSERT_EQ(3, *i); - ASSERT_TRUE(i != e); + REQUIRE(3 == *i); + REQUIRE(i != e); i++; - ASSERT_EQ(3, *i); // still EOF - ASSERT_TRUE(i == e); + REQUIRE(3 == *i); // still EOF + REQUIRE(i == e); } -TEST(regex_Lexer, empty_alt) +TEST_CASE("regex_Lexer.empty_alt") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - |Test ::= aa(bb|) - |Eof ::= <> - |)"_multiline); + |Test ::= aa(bb|) + |Eof ::= <> + |)"_multiline); LexerDef ld = cc.compileMulti(); - Lexable ls { ld, "aabb aa aabb", [this](const string& msg) { - log(msg); + Lexable ls { ld, "aabb aa aabb", [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(1, *lexer); - ASSERT_EQ(1, *++lexer); - ASSERT_EQ(1, *++lexer); - ASSERT_EQ(2, *++lexer); // EOF + REQUIRE(1 == *lexer); + REQUIRE(1 == *++lexer); + REQUIRE(1 == *++lexer); + REQUIRE(2 == *++lexer); // EOF } -TEST(regex_Lexer, ignore_many) +TEST_CASE("regex_Lexer.ignore_many") { Compiler cc; cc.parse(R"(|Spacing(ignore) ::= [\s\t\n]+ - 
|Comment(ignore) ::= #.* - |Eof ::= <> - |Foo ::= foo - |Bar ::= bar - |)"_multiline); + |Comment(ignore) ::= #.* + |Eof ::= <> + |Foo ::= foo + |Bar ::= bar + |)"_multiline); LexerDef ld = cc.compileMulti(); Lexable ls { ld, @@ -403,52 +405,52 @@ TEST(regex_Lexer, ignore_many) |# some bar |bar |)"_multiline, - [this](const string& msg) { - log(msg); + [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(2, *lexer); - ASSERT_EQ("foo", literal(lexer)); + REQUIRE(2 == *lexer); + REQUIRE("foo" == literal(lexer)); - ASSERT_EQ(3, *++lexer); - ASSERT_EQ("bar", literal(lexer)); + REQUIRE(3 == *++lexer); + REQUIRE("bar" == literal(lexer)); - ASSERT_EQ(1, *++lexer); // EOF + REQUIRE(1 == *++lexer); // EOF } -TEST(regex_Lexer, realworld_ipv4) +TEST_CASE("regex_Lexer.realworld_ipv4") { Compiler cc; cc.parse(R"(| - |Spacing(ignore) ::= [\s\t\n]+ - |Eof ::= <> - |IPv4Octet(ref) ::= [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] - |IPv4(ref) ::= {IPv4Octet}(\.{IPv4Octet}){3} - |IPv4Literal ::= {IPv4} - |)"_multiline); + |Spacing(ignore) ::= [\s\t\n]+ + |Eof ::= <> + |IPv4Octet(ref) ::= [0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5] + |IPv4(ref) ::= {IPv4Octet}(\.{IPv4Octet}){3} + |IPv4Literal ::= {IPv4} + |)"_multiline); auto ld = cc.compile(); auto ls = Lexable { ld, R"(0.0.0.0 4.2.2.1 10.10.40.199 255.255.255.255)", - [this](const string& msg) { - log(msg); + [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(2, *lexer); - ASSERT_EQ("0.0.0.0", literal(lexer)); + REQUIRE(2 == *lexer); + REQUIRE("0.0.0.0" == literal(lexer)); - ASSERT_EQ(2, *++lexer); - ASSERT_EQ("4.2.2.1", literal(lexer)); + REQUIRE(2 == *++lexer); + REQUIRE("4.2.2.1" == literal(lexer)); - ASSERT_EQ(2, *++lexer); - ASSERT_EQ("10.10.40.199", literal(lexer)); + REQUIRE(2 == *++lexer); + REQUIRE("10.10.40.199" == literal(lexer)); - ASSERT_EQ(2, *++lexer); - ASSERT_EQ("255.255.255.255", literal(lexer)); + REQUIRE(2 == *++lexer); + REQUIRE("255.255.255.255" == 
literal(lexer)); - ASSERT_EQ(1, *++lexer); + REQUIRE(1 == *++lexer); } enum class RealWorld @@ -473,16 +475,16 @@ struct formatter { switch (v) { - case RealWorld::Eof: return format_to(ctx.out(), "Eof"); - case RealWorld::IPv4: return format_to(ctx.out(), "IPv4"); - case RealWorld::IPv6: return format_to(ctx.out(), "IPv6"); - default: return format_to(ctx.out(), "<{}>", static_cast(v)); + case RealWorld::Eof: return fmt::format_to(ctx.out(), "Eof"); + case RealWorld::IPv4: return fmt::format_to(ctx.out(), "IPv4"); + case RealWorld::IPv6: return fmt::format_to(ctx.out(), "IPv6"); + default: return fmt::format_to(ctx.out(), "<{}>", static_cast(v)); } } }; } // namespace fmt -TEST(regex_Lexer, realworld_ipv6) +TEST_CASE("regex_Lexer.realworld_ipv6") { Compiler cc; cc.parse(R"(| @@ -508,93 +510,93 @@ TEST(regex_Lexer, realworld_ipv6) )"_multiline); static const string TEXT = R"(|0:0:0:0:0:0:0:0 - |1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd - |2001:0db8:85a3:0000:0000:8a2e:0370:7334 - |1234:5678:: - |0:: - |::0 - |:: - |1::3:4:5:6:7:8 - |1::4:5:6:7:8 - |1::5:6:7:8 - |1::8 - |1:2::4:5:6:7:8 - |1:2::5:6:7:8 - |1:2::8 - |::ffff:127.0.0.1 - |::ffff:c000:0280 - |)"_multiline; + |1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd + |2001:0db8:85a3:0000:0000:8a2e:0370:7334 + |1234:5678:: + |0:: + |::0 + |:: + |1::3:4:5:6:7:8 + |1::4:5:6:7:8 + |1::5:6:7:8 + |1::8 + |1:2::4:5:6:7:8 + |1:2::5:6:7:8 + |1:2::8 + |::ffff:127.0.0.1 + |::ffff:c000:0280 + |)"_multiline; auto ld = cc.compileMulti(); - auto ls = Lexable { ld, TEXT, [this](const string& msg) { - log(msg); + auto ls = Lexable { ld, TEXT, [](const string& msg) { + INFO(msg); } }; auto lexer = begin(ls); - ASSERT_EQ(RealWorld::IPv6, *lexer); - ASSERT_EQ("0:0:0:0:0:0:0:0", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *lexer); + REQUIRE("0:0:0:0:0:0:0:0" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + 
REQUIRE("1234:5678:90ab:cdef:aaaa:bbbb:cccc:dddd" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("2001:0db8:85a3:0000:0000:8a2e:0370:7334", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("2001:0db8:85a3:0000:0000:8a2e:0370:7334" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1234:5678::", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1234:5678::" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("0::", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("0::" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("::0", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::0" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("::", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1::3:4:5:6:7:8", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::3:4:5:6:7:8" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1::4:5:6:7:8", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::4:5:6:7:8" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1::5:6:7:8", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::5:6:7:8" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1::8", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1::8" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1:2::4:5:6:7:8", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1:2::4:5:6:7:8" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1:2::5:6:7:8", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1:2::5:6:7:8" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("1:2::8", 
literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("1:2::8" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("::ffff:127.0.0.1", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::ffff:127.0.0.1" == literal(lexer)); - ASSERT_EQ(RealWorld::IPv6, *++lexer); - ASSERT_EQ("::ffff:c000:0280", literal(lexer)); + REQUIRE(RealWorld::IPv6 == *++lexer); + REQUIRE("::ffff:c000:0280" == literal(lexer)); - ASSERT_EQ(RealWorld::Eof, *++lexer); + REQUIRE(RealWorld::Eof == *++lexer); } -TEST(regex_Lexer, internal) +TEST_CASE("regex_Lexer.internal") { - ASSERT_EQ("Eof", fmt::format("{}", LookaheadToken::Eof)); - ASSERT_EQ("abba", fmt::format("{}", LookaheadToken::ABBA)); - ASSERT_EQ("ab/cd", fmt::format("{}", LookaheadToken::AB_CD)); - ASSERT_EQ("cd", fmt::format("{}", LookaheadToken::CD)); - ASSERT_EQ("cdef", fmt::format("{}", LookaheadToken::CDEF)); - ASSERT_EQ("eol$", fmt::format("{}", LookaheadToken::EOL_LF)); - ASSERT_EQ("", fmt::format("{}", LookaheadToken::XAnyLine)); - ASSERT_EQ("<724>", fmt::format("{}", static_cast(724))); - - ASSERT_EQ("Eof", fmt::format("{}", RealWorld::Eof)); - ASSERT_EQ("IPv4", fmt::format("{}", RealWorld::IPv4)); - ASSERT_EQ("IPv6", fmt::format("{}", RealWorld::IPv6)); - ASSERT_EQ("<724>", fmt::format("{}", static_cast(724))); + REQUIRE("Eof" == fmt::format("{}", LookaheadToken::Eof)); + REQUIRE("abba" == fmt::format("{}", LookaheadToken::ABBA)); + REQUIRE("ab/cd" == fmt::format("{}", LookaheadToken::AB_CD)); + REQUIRE("cd" == fmt::format("{}", LookaheadToken::CD)); + REQUIRE("cdef" == fmt::format("{}", LookaheadToken::CDEF)); + REQUIRE("eol$" == fmt::format("{}", LookaheadToken::EOL_LF)); + REQUIRE("" == fmt::format("{}", LookaheadToken::XAnyLine)); + REQUIRE("<724>" == fmt::format("{}", static_cast(724))); + + REQUIRE("Eof" == fmt::format("{}", RealWorld::Eof)); + REQUIRE("IPv4" == fmt::format("{}", RealWorld::IPv4)); + REQUIRE("IPv6" == fmt::format("{}", RealWorld::IPv6)); + REQUIRE("<724>" 
== fmt::format("{}", static_cast(724))); } diff --git a/src/regex_dfa/MultiDFA.cpp b/src/regex_dfa/MultiDFA.cpp index 3cd02d8d5a..208ce7f207 100644 --- a/src/regex_dfa/MultiDFA.cpp +++ b/src/regex_dfa/MultiDFA.cpp @@ -21,7 +21,7 @@ MultiDFA constructMultiDFA(map many) StateId q0 = 1; for (pair& p: many) { - multiDFA.dfa.append(move(p.second), q0); + multiDFA.dfa.append(std::move(p.second), q0); multiDFA.initialStates[p.first] = q0; multiDFA.dfa.setTransition(0, static_cast(q0), q0); q0++; diff --git a/src/regex_dfa/NFA.cpp b/src/regex_dfa/NFA.cpp index 29ea460cc9..f8674b980a 100644 --- a/src/regex_dfa/NFA.cpp +++ b/src/regex_dfa/NFA.cpp @@ -40,7 +40,7 @@ Alphabet NFA::alphabet() const for (const TransitionMap& transitions: states_) { - for (const pair& t: transitions) + for (auto const& t: transitions) { switch (t.first) { @@ -96,9 +96,9 @@ StateIdVec NFA::epsilonTransitions(StateId s) const StateIdVec t; const TransitionMap& transitions = stateTransitions(s); - for (const pair& p: transitions) - if (p.first == Symbols::Epsilon) - t.insert(t.end(), p.second.begin(), p.second.end()); + for (auto&& [p, q]: transitions) + if (p == Symbols::Epsilon) + t.insert(t.end(), q.begin(), q.end()); return t; } @@ -170,12 +170,12 @@ void NFA::prepareStateIds(StateId baseId) AcceptMap remapped; for (auto& a: acceptTags_) remapped[baseId + a.first] = a.second; - acceptTags_ = move(remapped); + acceptTags_ = std::move(remapped); BacktrackingMap backtracking; for (const auto& bt: backtrackStates_) backtracking[baseId + bt.first] = baseId + bt.second; - backtrackStates_ = move(backtracking); + backtrackStates_ = std::move(backtracking); } NFA NFA::join(const map& mappings) @@ -186,7 +186,7 @@ NFA NFA::join(const map& mappings) NFA multi; for (size_t i = 0; i <= mappings.size(); ++i) - multi.createState(); + (void) multi.createState(); Symbol transitionSymbol = 0; for (const auto& mapping: mappings) @@ -212,7 +212,7 @@ NFA& NFA::lookahead(NFA&& rhs) { if (empty()) { - *this = 
move(rhs); + *this = std::move(rhs); backtrackStates_[acceptState_] = initialState_; } else @@ -301,7 +301,7 @@ NFA& NFA::recurring() NFA& NFA::positive() { - return concatenate(move(clone().recurring())); + return concatenate(std::move(clone().recurring())); } NFA& NFA::times(unsigned factor) @@ -328,7 +328,7 @@ NFA& NFA::repeat(unsigned minimum, unsigned maximum) times(minimum); for (unsigned n = minimum + 1; n <= maximum; n++) - alternate(move(factor.clone().times(n))); + alternate(std::move(factor.clone().times(n))); if (minimum == 0) optional(); @@ -357,11 +357,11 @@ void NFA::visit(DotVisitor& v) const for (StateId sourceState = 0, sE = size(); sourceState != sE; ++sourceState) { map> reversed; - for (const pair& transitions: states_[sourceState]) + for (pair transitions: states_[sourceState]) for (StateId targetState: transitions.second) reversed[targetState].push_back(transitions.first /* symbol */); - for (const pair>& tr: reversed) + for (pair> tr: reversed) { StateId targetState = tr.first; const vector& T = tr.second; diff --git a/src/regex_dfa/NFA.h b/src/regex_dfa/NFA.h index 7cb776c664..7380de8333 100644 --- a/src/regex_dfa/NFA.h +++ b/src/regex_dfa/NFA.h @@ -7,7 +7,6 @@ #pragma once #include -#include #include #include @@ -79,7 +78,7 @@ class NFA void addTransition(StateId from, Symbol s, StateId to) { states_[from][s].push_back(to); } - static NFA join(const std::map& mappings); + [[nodiscard]] static NFA join(const std::map& mappings); /** * Traverses all states and edges in this NFA and calls @p visitor for each state & edge. @@ -89,26 +88,26 @@ class NFA void visit(DotVisitor& visitor) const; //! Tests whether or not this is an empty NFA. - bool empty() const noexcept { return states_.empty(); } + [[nodiscard]] bool empty() const noexcept { return states_.empty(); } //! Retrieves the number of states of this NFA. - size_t size() const noexcept { return states_.size(); } + [[nodiscard]] size_t size() const noexcept { return states_.size(); } //! 
Retrieves the one and only initial state. This value is nullptr iff the NFA is empty. - StateId initialStateId() const noexcept { return initialState_; } + [[nodiscard]] StateId initialStateId() const noexcept { return initialState_; } //! Retrieves the one and only accept state. This value is nullptr iff the NFA is empty. - StateId acceptStateId() const noexcept { return acceptState_; } + [[nodiscard]] StateId acceptStateId() const noexcept { return acceptState_; } //! Retrieves the list of states this FA contains. - const StateVec& states() const { return states_; } + [[nodiscard]] const StateVec& states() const { return states_; } StateVec& states() { return states_; } //! Retrieves the alphabet of this finite automaton. - Alphabet alphabet() const; + [[nodiscard]] Alphabet alphabet() const; //! Clones this NFA. - NFA clone() const; + [[nodiscard]] NFA clone() const; /** * Constructs an NFA where @p rhs is following but backtracking to @c acceptState(this) when @@ -140,24 +139,24 @@ class NFA NFA& repeat(unsigned minimum, unsigned maximum); //! Retrieves transitions for state with the ID @p id. - const TransitionMap& stateTransitions(StateId id) const { return states_[id]; } + [[nodiscard]] TransitionMap const& stateTransitions(StateId id) const { return states_[id]; } //! Retrieves all states that can be reached from @p S with one single input Symbol @p c. - StateIdVec delta(const StateIdVec& S, Symbol c) const; + [[nodiscard]] StateIdVec delta(const StateIdVec& S, Symbol c) const; StateIdVec* delta(const StateIdVec& S, Symbol c, StateIdVec* result) const; //! Retrieves all states that can be directly or indirectly accessed via epsilon-transitions exclusively. 
- StateIdVec epsilonClosure(const StateIdVec& S) const; + [[nodiscard]] StateIdVec epsilonClosure(const StateIdVec& S) const; void epsilonClosure(const StateIdVec& S, StateIdVec* result) const; - TransitionMap& stateTransitions(StateId s) { return states_[s]; } + [[nodiscard]] TransitionMap& stateTransitions(StateId s) { return states_[s]; } //! Flags given state as accepting-state with given Tag @p acceptTag. void setAccept(Tag acceptTag) { acceptTags_[acceptState_] = acceptTag; } void setAccept(StateId state, Tag tag) { acceptTags_[state] = tag; } - std::optional acceptTag(StateId s) const + [[nodiscard]] std::optional acceptTag(StateId s) const { if (auto i = acceptTags_.find(s); i != acceptTags_.end()) return i->second; @@ -165,12 +164,12 @@ class NFA return std::nullopt; } - bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } + [[nodiscard]] bool isAccepting(StateId s) const { return acceptTags_.find(s) != acceptTags_.end(); } /** * Returns whether or not the StateSet @p Q contains at least one State that is also "accepting". */ - bool isAnyAccepting(const StateIdVec& Q) const + [[nodiscard]] bool isAnyAccepting(const StateIdVec& Q) const { for (StateId q: Q) if (isAccepting(q)) @@ -179,10 +178,10 @@ class NFA return false; } - const AcceptMap& acceptMap() const noexcept { return acceptTags_; } - AcceptMap& acceptMap() noexcept { return acceptTags_; } + [[nodiscard]] const AcceptMap& acceptMap() const noexcept { return acceptTags_; } + [[nodiscard]] AcceptMap& acceptMap() noexcept { return acceptTags_; } - std::optional backtrack(StateId s) const + [[nodiscard]] std::optional backtrack(StateId s) const { if (auto i = backtrackStates_.find(s); i != backtrackStates_.end()) return i->second; @@ -194,7 +193,7 @@ class NFA * Checks if @p Q contains a state that is flagged as backtracking state in the NFA and returns * the target state within the NFA or @c std::nullopt if not a backtracking state. 
*/ - std::optional containsBacktrackState(const StateIdVec& Q) const + [[nodiscard]] std::optional containsBacktrackState(const StateIdVec& Q) const { for (StateId q: Q) if (std::optional t = backtrack(q); t.has_value()) @@ -204,12 +203,12 @@ class NFA } private: - StateId createState(); + [[nodiscard]] StateId createState(); void visit(DotVisitor& v, StateId s, std::unordered_map& registry) const; void prepareStateIds(StateId baseId); //! Retrieves all epsilon-transitions directly connected to State @p s. - StateIdVec epsilonTransitions(StateId s) const; + [[nodiscard]] StateIdVec epsilonTransitions(StateId s) const; private: StateVec states_; diff --git a/src/regex_dfa/NFABuilder.cpp b/src/regex_dfa/NFABuilder.cpp index 912d875470..0e07d6f342 100644 --- a/src/regex_dfa/NFABuilder.cpp +++ b/src/regex_dfa/NFABuilder.cpp @@ -23,13 +23,13 @@ NFA NFABuilder::construct(const RegExpr& re, Tag tag) else fa_.setAccept(tag); - return move(fa_); + return std::move(fa_); } NFA NFABuilder::construct(const RegExpr& re) { visit(*this, re); - return move(fa_); + return std::move(fa_); } void NFABuilder::operator()(const LookAheadExpr& lookaheadExpr) @@ -37,24 +37,24 @@ void NFABuilder::operator()(const LookAheadExpr& lookaheadExpr) // fa_ = move(construct(lookaheadExpr.leftExpr()).lookahead(construct(lookaheadExpr.rightExpr()))); NFA lhs = construct(*lookaheadExpr.left); NFA rhs = construct(*lookaheadExpr.right); - lhs.lookahead(move(rhs)); - fa_ = move(lhs); + lhs.lookahead(std::move(rhs)); + fa_ = std::move(lhs); } void NFABuilder::operator()(const AlternationExpr& alternationExpr) { NFA lhs = construct(*alternationExpr.left); NFA rhs = construct(*alternationExpr.right); - lhs.alternate(move(rhs)); - fa_ = move(lhs); + lhs.alternate(std::move(rhs)); + fa_ = std::move(lhs); } void NFABuilder::operator()(const ConcatenationExpr& concatenationExpr) { NFA lhs = construct(*concatenationExpr.left); NFA rhs = construct(*concatenationExpr.right); - lhs.concatenate(move(rhs)); - fa_ = 
move(lhs); + lhs.concatenate(std::move(rhs)); + fa_ = std::move(lhs); } void NFABuilder::operator()(const CharacterExpr& characterExpr) @@ -74,15 +74,15 @@ void NFABuilder::operator()(const ClosureExpr& closureExpr) constexpr unsigned Infinity = numeric_limits::max(); if (xmin == 0 && xmax == 1) - fa_ = move(construct(*closureExpr.subExpr).optional()); + fa_ = std::move(construct(*closureExpr.subExpr).optional()); else if (xmin == 0 && xmax == Infinity) - fa_ = move(construct(*closureExpr.subExpr).recurring()); + fa_ = std::move(construct(*closureExpr.subExpr).recurring()); else if (xmin == 1 && xmax == Infinity) - fa_ = move(construct(*closureExpr.subExpr).positive()); + fa_ = std::move(construct(*closureExpr.subExpr).positive()); else if (xmin < xmax) - fa_ = move(construct(*closureExpr.subExpr).repeat(xmin, xmax)); + fa_ = std::move(construct(*closureExpr.subExpr).repeat(xmin, xmax)); else if (xmin == xmax) - fa_ = move(construct(*closureExpr.subExpr).times(xmin)); + fa_ = std::move(construct(*closureExpr.subExpr).times(xmin)); else throw invalid_argument { "closureExpr" }; } @@ -92,21 +92,21 @@ void NFABuilder::operator()(const BeginOfLineExpr&) fa_ = NFA { Symbols::Epsilon }; } -void NFABuilder::operator()(const EndOfLineExpr& eolExpr) +void NFABuilder::operator()(const EndOfLineExpr&) { // NFA lhs; // NFA rhs{'\n'}; // lhs.lookahead(move(rhs)); // fa_ = move(lhs); - fa_ = move(NFA {}.lookahead(NFA { '\n' })); + fa_ = std::move(NFA {}.lookahead(NFA { '\n' })); } -void NFABuilder::operator()(const EndOfFileExpr& eofExpr) +void NFABuilder::operator()(const EndOfFileExpr&) { fa_ = NFA { Symbols::EndOfFile }; } -void NFABuilder::operator()(const DotExpr& dotExpr) +void NFABuilder::operator()(const DotExpr&) { // any character except LF fa_ = NFA { '\t' }; @@ -116,7 +116,7 @@ void NFABuilder::operator()(const DotExpr& dotExpr) } } -void NFABuilder::operator()(const EmptyExpr& emptyExpr) +void NFABuilder::operator()(const EmptyExpr&) { fa_ = NFA { Symbols::Epsilon 
}; } diff --git a/src/regex_dfa/NFABuilder.h b/src/regex_dfa/NFABuilder.h index 646ca9ad4d..4ec4892856 100644 --- a/src/regex_dfa/NFABuilder.h +++ b/src/regex_dfa/NFABuilder.h @@ -33,8 +33,8 @@ class NFABuilder public: explicit NFABuilder(): fa_ {} {} - NFA construct(const RegExpr& re, Tag tag); - NFA construct(const RegExpr& re); + [[nodiscard]] NFA construct(const RegExpr& re, Tag tag); + [[nodiscard]] NFA construct(const RegExpr& re); void operator()(const LookAheadExpr& lookaheadExpr); void operator()(const ConcatenationExpr& concatenationExpr); void operator()(const AlternationExpr& alternationExpr); diff --git a/src/regex_dfa/NFA_test.cpp b/src/regex_dfa/NFA_test.cpp index 1ef09727c4..734e19581f 100644 --- a/src/regex_dfa/NFA_test.cpp +++ b/src/regex_dfa/NFA_test.cpp @@ -9,76 +9,77 @@ #include #include -#include +#include using namespace std; using namespace regex_dfa; -TEST(regex_NFA, emptyCtor) +TEST_CASE("regex_NFA.emptyCtor") { const NFA nfa; - ASSERT_EQ(0, nfa.size()); - ASSERT_TRUE(nfa.empty()); + REQUIRE(0 == nfa.size()); + REQUIRE(nfa.empty()); } -TEST(regex_NFA, characterCtor) +TEST_CASE("regex_NFA.characterCtor") { const NFA nfa { 'a' }; - ASSERT_EQ(2, nfa.size()); - ASSERT_EQ(0, nfa.initialStateId()); - ASSERT_EQ(1, nfa.acceptStateId()); - ASSERT_EQ(StateIdVec { 1 }, nfa.delta(StateIdVec { 0 }, 'a')); + REQUIRE(2 == nfa.size()); + REQUIRE(0 == nfa.initialStateId()); + REQUIRE(1 == nfa.acceptStateId()); + REQUIRE(StateIdVec { 1 } == nfa.delta(StateIdVec { 0 }, 'a')); } -TEST(regex_NFA, concatenate) +TEST_CASE("regex_NFA.concatenate") { - const NFA ab = move(NFA { 'a' }.concatenate(NFA { 'b' })); - ASSERT_EQ(4, ab.size()); - ASSERT_EQ(0, ab.initialStateId()); - ASSERT_EQ(3, ab.acceptStateId()); + const NFA ab = std::move(NFA { 'a' }.concatenate(NFA { 'b' })); + REQUIRE(4 == ab.size()); + REQUIRE(0 == ab.initialStateId()); + REQUIRE(3 == ab.acceptStateId()); // TODO: check ab.initial == A.initial // TODO: check A.accept == B.initial // TODO: check 
ab.accept == B.accept } -TEST(regex_NFA, alternate) +TEST_CASE("regex_NFA.alternate") { - const NFA ab = move(NFA { 'a' }.alternate(NFA { 'b' })); - ASSERT_EQ(6, ab.size()); - ASSERT_EQ(2, ab.initialStateId()); - ASSERT_EQ(3, ab.acceptStateId()); + const NFA ab = std::move(NFA { 'a' }.alternate(NFA { 'b' })); + REQUIRE(6 == ab.size()); + REQUIRE(2 == ab.initialStateId()); + REQUIRE(3 == ab.acceptStateId()); // TODO: check acceptState transitions to A and B // TODO: check A and B's outgoing edges to final acceptState } -TEST(regex_NFA, epsilonClosure) +TEST_CASE("regex_NFA.epsilonClosure") { const NFA nfa { 'a' }; - ASSERT_EQ(0, nfa.initialStateId()); - ASSERT_EQ(1, nfa.acceptStateId()); - ASSERT_EQ(StateIdVec { 0 }, nfa.epsilonClosure(StateIdVec { 0 })); + REQUIRE(0 == nfa.initialStateId()); + REQUIRE(1 == nfa.acceptStateId()); + REQUIRE(StateIdVec { 0 } == nfa.epsilonClosure(StateIdVec { 0 })); - const NFA abc = move(NFA { 'a' }.concatenate(move(NFA { 'b' }.alternate(NFA { 'c' }).recurring()))); - ASSERT_EQ(StateIdVec { 0 }, abc.epsilonClosure(StateIdVec { 0 })); + const NFA abc = + std::move(NFA { 'a' }.concatenate(std::move(NFA { 'b' }.alternate(NFA { 'c' }).recurring()))); + REQUIRE(StateIdVec { 0 } == abc.epsilonClosure(StateIdVec { 0 })); const StateIdVec e1 { 1, 2, 4, 6, 8, 9 }; - ASSERT_EQ(e1, abc.epsilonClosure(StateIdVec { 1 })); + REQUIRE(e1 == abc.epsilonClosure(StateIdVec { 1 })); } -TEST(regex_NFA, delta) +TEST_CASE("regex_NFA.delta") { const NFA nfa { 'a' }; - ASSERT_EQ(0, nfa.initialStateId()); - ASSERT_EQ(1, nfa.acceptStateId()); - ASSERT_EQ(StateIdVec { 1 }, nfa.delta(StateIdVec { 0 }, 'a')); + REQUIRE(0 == nfa.initialStateId()); + REQUIRE(1 == nfa.acceptStateId()); + REQUIRE(StateIdVec { 1 } == nfa.delta(StateIdVec { 0 }, 'a')); } -TEST(regex_NFA, alphabet) +TEST_CASE("regex_NFA.alphabet") { - ASSERT_EQ("{}", NFA {}.alphabet().to_string()); - ASSERT_EQ("{a}", NFA { 'a' }.alphabet().to_string()); - ASSERT_EQ("{ab}", NFA { 'a' }.concatenate(NFA { 
'b' }).alphabet().to_string()); - ASSERT_EQ("{abc}", NFA { 'a' }.concatenate(NFA { 'b' }).alternate(NFA { 'c' }).alphabet().to_string()); + REQUIRE("{}" == NFA {}.alphabet().to_string()); + REQUIRE("{a}" == NFA { 'a' }.alphabet().to_string()); + REQUIRE("{ab}" == NFA { 'a' }.concatenate(NFA { 'b' }).alphabet().to_string()); + REQUIRE("{abc}" == NFA { 'a' }.concatenate(NFA { 'b' }).alternate(NFA { 'c' }).alphabet().to_string()); } diff --git a/src/regex_dfa/RegExpr.cpp b/src/regex_dfa/RegExpr.cpp index d087705e6d..b7ba9c70af 100644 --- a/src/regex_dfa/RegExpr.cpp +++ b/src/regex_dfa/RegExpr.cpp @@ -45,7 +45,7 @@ std::string to_string(const RegExpr& re) { return visit( overloaded { - [&](const ClosureExpr& e) { + [&](ClosureExpr const& e) { stringstream sstr; sstr << embrace(re, *e.subExpr); if (e.minimumOccurrences == 0 && e.maximumOccurrences == 1) @@ -62,12 +62,12 @@ std::string to_string(const RegExpr& re) [&](const ConcatenationExpr& e) { return embrace(re, *e.left) + embrace(re, *e.right); }, [&](const LookAheadExpr& e) { return embrace(re, *e.left) + "/" + embrace(re, *e.right); }, [](const CharacterExpr& e) { return string(1, e.value); }, - [](const EndOfFileExpr& e) { return string { "<>" }; }, - [](const BeginOfLineExpr& e) { return string { "^" }; }, - [](const EndOfLineExpr& e) { return string { "$" }; }, - [](const CharacterClassExpr& e) { return e.symbols.to_string(); }, - [](const DotExpr& e) { return string { "." }; }, - [](const EmptyExpr& e) { return string {}; }, + [](EndOfFileExpr) { return string { "<>" }; }, + [](BeginOfLineExpr) { return string { "^" }; }, + [](EndOfLineExpr) { return string { "$" }; }, + [](CharacterClassExpr const& e) { return e.symbols.to_string(); }, + [](DotExpr) { return string { "." 
}; }, + [](EmptyExpr) { return string {}; }, }, re); } @@ -75,17 +75,17 @@ std::string to_string(const RegExpr& re) int precedence(const RegExpr& regex) { return visit(overloaded { - [](const AlternationExpr& e) { return 1; }, - [](const BeginOfLineExpr& e) { return 4; }, - [](const CharacterClassExpr& e) { return 4; }, - [](const CharacterExpr& e) { return 4; }, - [](const ClosureExpr& e) { return 3; }, - [](const ConcatenationExpr& e) { return 2; }, - [](const DotExpr& e) { return 4; }, - [](const EmptyExpr& e) { return 4; }, - [](const EndOfFileExpr& e) { return 4; }, - [](const EndOfLineExpr& e) { return 4; }, - [](const LookAheadExpr& e) { return 0; }, + [](const AlternationExpr&) { return 1; }, + [](const BeginOfLineExpr&) { return 4; }, + [](const CharacterClassExpr&) { return 4; }, + [](const CharacterExpr&) { return 4; }, + [](const ClosureExpr&) { return 3; }, + [](const ConcatenationExpr&) { return 2; }, + [](const DotExpr&) { return 4; }, + [](const EmptyExpr&) { return 4; }, + [](const EndOfFileExpr&) { return 4; }, + [](const EndOfLineExpr&) { return 4; }, + [](const LookAheadExpr&) { return 0; }, }, regex); } @@ -96,17 +96,17 @@ bool containsBeginOfLine(const RegExpr& regex) [](const AlternationExpr& e) { return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); }, - [](const BeginOfLineExpr& e) { return true; }, - [](const CharacterClassExpr& e) { return false; }, - [](const CharacterExpr& e) { return false; }, + [](const BeginOfLineExpr&) { return true; }, + [](const CharacterClassExpr&) { return false; }, + [](const CharacterExpr&) { return false; }, [](const ClosureExpr& e) { return containsBeginOfLine(*e.subExpr); }, [](const ConcatenationExpr& e) { return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); }, - [](const DotExpr& e) { return false; }, - [](const EmptyExpr& e) { return false; }, - [](const EndOfFileExpr& e) { return false; }, - [](const EndOfLineExpr& e) { return false; }, + [](const DotExpr&) { return 
false; }, + [](const EmptyExpr&) { return false; }, + [](const EndOfFileExpr&) { return false; }, + [](const EndOfLineExpr&) { return false; }, [](const LookAheadExpr& e) { return containsBeginOfLine(*e.left) || containsBeginOfLine(*e.right); }, diff --git a/src/regex_dfa/RegExpr.h b/src/regex_dfa/RegExpr.h index c8ca1fdce7..02e892baa4 100644 --- a/src/regex_dfa/RegExpr.h +++ b/src/regex_dfa/RegExpr.h @@ -73,30 +73,21 @@ struct CharacterExpr { Symbol value; }; - struct CharacterClassExpr { SymbolSet symbols; }; -struct DotExpr -{ -}; -struct BeginOfLineExpr -{ -}; -struct EndOfLineExpr -{ -}; -struct EndOfFileExpr -{ -}; -struct EmptyExpr -{ -}; +// clang-format off +struct DotExpr {}; +struct BeginOfLineExpr {}; +struct EndOfLineExpr {}; +struct EndOfFileExpr {}; +struct EmptyExpr {}; +// clang-format on -std::string to_string(const RegExpr& regex); -int precedence(const RegExpr& regex); -bool containsBeginOfLine(const RegExpr& regex); +[[nodiscard]] std::string to_string(const RegExpr& regex); +[[nodiscard]] int precedence(const RegExpr& regex); +[[nodiscard]] bool containsBeginOfLine(const RegExpr& regex); } // namespace regex_dfa diff --git a/src/regex_dfa/RegExprParser.cpp b/src/regex_dfa/RegExprParser.cpp index b338be1a19..6c66dcfb0e 100644 --- a/src/regex_dfa/RegExprParser.cpp +++ b/src/regex_dfa/RegExprParser.cpp @@ -63,7 +63,7 @@ int RegExprParser::currentChar() const if (currentChar_ != input_.end()) return *currentChar_; else - return -1; + return std::char_traits::eof(); } bool RegExprParser::consumeIf(int ch) @@ -78,7 +78,7 @@ bool RegExprParser::consumeIf(int ch) int RegExprParser::consume() { if (currentChar_ == input_.end()) - return -1; + return std::char_traits::eof(); int ch = *currentChar_; if (ch == '\n') @@ -105,9 +105,9 @@ void RegExprParser::consume(int expected) } } -RegExpr RegExprParser::parse(string_view expr, int line, int column) +RegExpr RegExprParser::parse(string_view expr, unsigned line, unsigned column) { - input_ = move(expr); + 
input_ = expr; currentChar_ = input_.begin(); line_ = line; column_ = column; @@ -128,7 +128,7 @@ RegExpr RegExprParser::parseLookAheadExpr() { consume(); RegExpr rhs = parseAlternation(); - lhs = LookAheadExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + lhs = LookAheadExpr { make_unique(std::move(lhs)), make_unique(std::move(rhs)) }; } return lhs; @@ -142,7 +142,7 @@ RegExpr RegExprParser::parseAlternation() { consume(); RegExpr rhs = parseConcatenation(); - lhs = AlternationExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + lhs = AlternationExpr { make_unique(std::move(lhs)), make_unique(std::move(rhs)) }; } return lhs; @@ -154,10 +154,11 @@ RegExpr RegExprParser::parseConcatenation() static const string_view follow = "/|)"; RegExpr lhs = parseClosure(); - while (!eof() && follow.find(currentChar()) == follow.npos) + while (!eof() && follow.find(currentChar()) == std::string_view::npos) { RegExpr rhs = parseClosure(); - lhs = ConcatenationExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + lhs = + ConcatenationExpr { make_unique(std::move(lhs)), make_unique(std::move(rhs)) }; } return lhs; @@ -169,9 +170,9 @@ RegExpr RegExprParser::parseClosure() switch (currentChar()) { - case '?': consume(); return ClosureExpr { make_unique(move(subExpr)), 0, 1 }; - case '*': consume(); return ClosureExpr { make_unique(move(subExpr)), 0 }; - case '+': consume(); return ClosureExpr { make_unique(move(subExpr)), 1 }; + case '?': consume(); return ClosureExpr { make_unique(std::move(subExpr)), 0, 1 }; + case '*': consume(); return ClosureExpr { make_unique(std::move(subExpr)), 0 }; + case '+': consume(); return ClosureExpr { make_unique(std::move(subExpr)), 1 }; case '{': { consume(); unsigned int m = parseInt(); @@ -180,12 +181,12 @@ RegExpr RegExprParser::parseClosure() consume(); unsigned int n = parseInt(); consume('}'); - return ClosureExpr { make_unique(move(subExpr)), m, n }; + return ClosureExpr { make_unique(std::move(subExpr)), m, n }; } else { 
consume('}'); - return ClosureExpr { make_unique(move(subExpr)), m, m }; + return ClosureExpr { make_unique(std::move(subExpr)), m, m }; } } default: return subExpr; @@ -212,7 +213,7 @@ RegExpr RegExprParser::parseAtom() switch (currentChar()) { - case -1: // EOF + case std::char_traits::eof(): // EOF case ')': return EmptyExpr {}; case '<': consume(); @@ -235,7 +236,8 @@ RegExpr RegExprParser::parseAtom() while (!eof() && currentChar() != '"') { RegExpr rhs = CharacterExpr { consume() }; - lhs = ConcatenationExpr { make_unique(move(lhs)), make_unique(move(rhs)) }; + lhs = ConcatenationExpr { make_unique(std::move(lhs)), + make_unique(std::move(rhs)) }; } consume('"'); return lhs; @@ -262,7 +264,7 @@ RegExpr RegExprParser::parseCharacterClass() ss.complement(); consume(']'); - return CharacterClassExpr { move(ss) }; + return CharacterClassExpr { std::move(ss) }; } void RegExprParser::parseNamedCharacterClass(SymbolSet& ss) diff --git a/src/regex_dfa/RegExprParser.h b/src/regex_dfa/RegExprParser.h index d7dccb2302..8484087af8 100644 --- a/src/regex_dfa/RegExprParser.h +++ b/src/regex_dfa/RegExprParser.h @@ -24,9 +24,9 @@ class RegExprParser public: RegExprParser(); - RegExpr parse(std::string_view expr, int line, int column); + [[nodiscard]] RegExpr parse(std::string_view expr, unsigned line, unsigned column); - RegExpr parse(std::string_view expr) { return parse(std::move(expr), 1, 1); } + [[nodiscard]] RegExpr parse(std::string_view expr) { return parse(expr, 1, 1); } class UnexpectedToken: public std::runtime_error { @@ -44,15 +44,17 @@ class RegExprParser UnexpectedToken(unsigned int line, unsigned int column, int actual, int expected): UnexpectedToken { line, column, - actual == -1 ? "EOF" : fmt::format("{}", static_cast(actual)), + std::char_traits::eq(actual, std::char_traits::eof()) + ? 
"EOF" + : fmt::format("{}", static_cast(actual)), std::string(1, static_cast(expected)) } { } - unsigned int line() const noexcept { return line_; } - unsigned int column() const noexcept { return column_; } - const std::string& actual() const noexcept { return actual_; } - const std::string& expected() const noexcept { return expected_; } + [[nodiscard]] unsigned int line() const noexcept { return line_; } + [[nodiscard]] unsigned int column() const noexcept { return column_; } + [[nodiscard]] const std::string& actual() const noexcept { return actual_; } + [[nodiscard]] const std::string& expected() const noexcept { return expected_; } private: unsigned int line_; @@ -62,24 +64,27 @@ class RegExprParser }; private: - int currentChar() const; - bool eof() const noexcept { return currentChar() == -1; } - bool consumeIf(int ch); + [[nodiscard]] int currentChar() const; + [[nodiscard]] bool eof() const noexcept + { + return std::char_traits::eq(currentChar(), std::char_traits::eof()); + } + [[nodiscard]] bool consumeIf(int ch); void consume(int ch); int consume(); - unsigned parseInt(); - - RegExpr parse(); // expr - RegExpr parseExpr(); // lookahead - RegExpr parseLookAheadExpr(); // alternation ('/' alternation)? - RegExpr parseAlternation(); // concatenation ('|' concatenation)* - RegExpr parseConcatenation(); // closure (closure)* - RegExpr parseClosure(); // atom ['*' | '?' | '{' NUM [',' NUM] '}'] - RegExpr parseAtom(); // character | characterClass | '(' expr ')' - RegExpr parseCharacterClass(); // '[' characterClassFragment+ ']' + [[nodiscard]] unsigned parseInt(); + + [[nodiscard]] RegExpr parse(); // expr + [[nodiscard]] RegExpr parseExpr(); // lookahead + [[nodiscard]] RegExpr parseLookAheadExpr(); // alternation ('/' alternation)? + [[nodiscard]] RegExpr parseAlternation(); // concatenation ('|' concatenation)* + [[nodiscard]] RegExpr parseConcatenation(); // closure (closure)* + [[nodiscard]] RegExpr parseClosure(); // atom ['*' | '?' 
| '{' NUM [',' NUM] '}'] + [[nodiscard]] RegExpr parseAtom(); // character | characterClass | '(' expr ')' + [[nodiscard]] RegExpr parseCharacterClass(); // '[' characterClassFragment+ ']' void parseCharacterClassFragment(SymbolSet& ss); // namedClass | character | character '-' character void parseNamedCharacterClass(SymbolSet& ss); // '[' ':' NAME ':' ']' - Symbol parseSingleCharacter(); + [[nodiscard]] Symbol parseSingleCharacter(); private: std::string_view input_; diff --git a/src/regex_dfa/RegExprParser_test.cpp b/src/regex_dfa/RegExprParser_test.cpp index d2281e5ed1..e668143206 100644 --- a/src/regex_dfa/RegExprParser_test.cpp +++ b/src/regex_dfa/RegExprParser_test.cpp @@ -8,292 +8,302 @@ #include #include -#include +#include -#include +#include using namespace std; using namespace regex_dfa; -TEST(regex_RegExprParser, namedCharacterClass_graph) +namespace +{ + +RegExpr parseRegExpr(string const& s) +{ + return RegExprParser {}.parse(s); +} + +} // namespace + +TEST_CASE("regex_RegExprParser.namedCharacterClass_graph") { - RegExpr re = RegExprParser {}.parse("[[:graph:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("!-~", to_string(re)); + RegExpr re = parseRegExpr("[[:graph:]]"); + REQUIRE(holds_alternative(re)); + CHECK("!-~" == to_string(re)); } -TEST(regex_RegExprParser, whitespaces_concatination) +TEST_CASE("regex_RegExprParser.whitespaces_concatination") { - RegExpr re = RegExprParser {}.parse("a b"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("ab", to_string(re)); + RegExpr re = parseRegExpr("a b"); + REQUIRE(holds_alternative(re)); + CHECK("ab" == to_string(re)); } -TEST(regex_RegExprParser, whitespaces_alternation) +TEST_CASE("regex_RegExprParser.whitespaces_alternation") { - RegExpr re = RegExprParser {}.parse("a | b"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("a|b", to_string(re)); + RegExpr re = parseRegExpr("a | b"); + REQUIRE(holds_alternative(re)); + CHECK("a|b" == to_string(re)); } -TEST(regex_RegExprParser, 
namedCharacterClass_digit) +TEST_CASE("regex_RegExprParser.namedCharacterClass_digit") { - RegExpr re = RegExprParser {}.parse("[[:digit:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("0-9", to_string(re)); + RegExpr re = parseRegExpr("[[:digit:]]"); + REQUIRE(holds_alternative(re)); + CHECK("0-9" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_alnum) +TEST_CASE("regex_RegExprParser.namedCharacterClass_alnum") { - RegExpr re = RegExprParser {}.parse("[[:alnum:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("0-9A-Za-z", to_string(re)); + RegExpr re = parseRegExpr("[[:alnum:]]"); + REQUIRE(holds_alternative(re)); + CHECK("0-9A-Za-z" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_alpha) +TEST_CASE("regex_RegExprParser.namedCharacterClass_alpha") { - RegExpr re = RegExprParser {}.parse("[[:alpha:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("A-Za-z", to_string(re)); + RegExpr re = parseRegExpr("[[:alpha:]]"); + REQUIRE(holds_alternative(re)); + CHECK("A-Za-z" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_blank) +TEST_CASE("regex_RegExprParser.namedCharacterClass_blank") { - RegExpr re = RegExprParser {}.parse("[[:blank:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("\\t\\s", to_string(re)); + RegExpr re = parseRegExpr("[[:blank:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\t\\s" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_cntrl) +TEST_CASE("regex_RegExprParser.namedCharacterClass_cntrl") { - RegExpr re = RegExprParser {}.parse("[[:cntrl:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("\\0-\\x1f\\x7f", to_string(re)); + RegExpr re = parseRegExpr("[[:cntrl:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\0-\\x1f\\x7f" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_print) +TEST_CASE("regex_RegExprParser.namedCharacterClass_print") { - RegExpr re = RegExprParser {}.parse("[[:print:]]"); - 
ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("\\s-~", to_string(re)); + RegExpr re = parseRegExpr("[[:print:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\s-~" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_punct) +TEST_CASE("regex_RegExprParser.namedCharacterClass_punct") { - RegExpr re = RegExprParser {}.parse("[[:punct:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("!-/:-@[-`{-~", to_string(re)); + RegExpr re = parseRegExpr("[[:punct:]]"); + REQUIRE(holds_alternative(re)); + CHECK("!-/:-@[-`{-~" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_space) +TEST_CASE("regex_RegExprParser.namedCharacterClass_space") { - RegExpr re = RegExprParser {}.parse("[[:space:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("\\0\\t-\\r", to_string(re)); + RegExpr re = parseRegExpr("[[:space:]]"); + REQUIRE(holds_alternative(re)); + CHECK("\\0\\t-\\r" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_unknown) +TEST_CASE("regex_RegExprParser.namedCharacterClass_unknown") { - EXPECT_THROW(RegExprParser {}.parse("[[:unknown:]]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[[:unknown:]]"), RegExprParser::UnexpectedToken); } -TEST(regex_RegExprParser, namedCharacterClass_upper) +TEST_CASE("regex_RegExprParser.namedCharacterClass_upper") { - RegExpr re = RegExprParser {}.parse("[[:upper:]]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("A-Z", to_string(re)); + RegExpr re = parseRegExpr("[[:upper:]]"); + REQUIRE(holds_alternative(re)); + CHECK("A-Z" == to_string(re)); } -TEST(regex_RegExprParser, namedCharacterClass_mixed) +TEST_CASE("regex_RegExprParser.namedCharacterClass_mixed") { - RegExpr re = RegExprParser {}.parse("[[:lower:]0-9]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("0-9a-z", to_string(re)); + RegExpr re = parseRegExpr("[[:lower:]0-9]"); + REQUIRE(holds_alternative(re)); + CHECK("0-9a-z" == to_string(re)); } -TEST(regex_RegExprParser, 
characterClass_complement) +TEST_CASE("regex_RegExprParser.characterClass_complement") { - RegExpr re = RegExprParser {}.parse("[^\\n]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_TRUE(get(re).symbols.isDot()); - EXPECT_EQ(".", get(re).symbols.to_string()); + RegExpr re = parseRegExpr("[^\\n]"); + REQUIRE(holds_alternative(re)); + CHECK(get(re).symbols.isDot()); + CHECK("." == get(re).symbols.to_string()); } -TEST(regex_RegExprParser, escapeSequences_invalid) +TEST_CASE("regex_RegExprParser.escapeSequences_invalid") { - EXPECT_THROW(RegExprParser {}.parse("[\\z]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\z]"), RegExprParser::UnexpectedToken); } -TEST(regex_RegExprParser, escapeSequences_abfnrstv) +TEST_CASE("regex_RegExprParser.escapeSequences_abfnrstv") { - EXPECT_EQ("\\a", to_string(RegExprParser {}.parse("[\\a]"))); - EXPECT_EQ("\\b", to_string(RegExprParser {}.parse("[\\b]"))); - EXPECT_EQ("\\f", to_string(RegExprParser {}.parse("[\\f]"))); - EXPECT_EQ("\\n", to_string(RegExprParser {}.parse("[\\n]"))); - EXPECT_EQ("\\r", to_string(RegExprParser {}.parse("[\\r]"))); - EXPECT_EQ("\\s", to_string(RegExprParser {}.parse("[\\s]"))); - EXPECT_EQ("\\t", to_string(RegExprParser {}.parse("[\\t]"))); - EXPECT_EQ("\\v", to_string(RegExprParser {}.parse("[\\v]"))); + CHECK("\\a" == to_string(parseRegExpr("[\\a]"))); + CHECK("\\b" == to_string(parseRegExpr("[\\b]"))); + CHECK("\\f" == to_string(parseRegExpr("[\\f]"))); + CHECK("\\n" == to_string(parseRegExpr("[\\n]"))); + CHECK("\\r" == to_string(parseRegExpr("[\\r]"))); + CHECK("\\s" == to_string(parseRegExpr("[\\s]"))); + CHECK("\\t" == to_string(parseRegExpr("[\\t]"))); + CHECK("\\v" == to_string(parseRegExpr("[\\v]"))); } -TEST(regex_RegExprParser, newline) +TEST_CASE("regex_RegExprParser.newline") { - RegExpr re = RegExprParser {}.parse("\n"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ('\n', get(re).value); + RegExpr re = parseRegExpr("\n"); + REQUIRE(holds_alternative(re)); + 
CHECK('\n' == get(re).value); } -TEST(regex_RegExprParser, escapeSequences_hex) +TEST_CASE("regex_RegExprParser.escapeSequences_hex") { - RegExpr re = RegExprParser {}.parse("[\\x20]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("\\s", get(re).symbols.to_string()); + RegExpr re = parseRegExpr("[\\x20]"); + REQUIRE(holds_alternative(re)); + CHECK("\\s" == get(re).symbols.to_string()); - EXPECT_THROW(RegExprParser {}.parse("[\\xZZ]"), RegExprParser::UnexpectedToken); - EXPECT_THROW(RegExprParser {}.parse("[\\xAZ]"), RegExprParser::UnexpectedToken); - EXPECT_THROW(RegExprParser {}.parse("[\\xZA]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\xZZ]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\xAZ]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\xZA]"), RegExprParser::UnexpectedToken); } -TEST(regex_RegExprParser, escapeSequences_nul) +TEST_CASE("regex_RegExprParser.escapeSequences_nul") { - RegExpr re = RegExprParser {}.parse("[\\0]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("\\0", get(re).symbols.to_string()); + RegExpr re = parseRegExpr("[\\0]"); + REQUIRE(holds_alternative(re)); + CHECK("\\0" == get(re).symbols.to_string()); } -TEST(regex_RegExprParser, escapeSequences_octal) +TEST_CASE("regex_RegExprParser.escapeSequences_octal") { // with leading zero - RegExpr re = RegExprParser {}.parse("[\\040]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("\\s", get(re).symbols.to_string()); + RegExpr re = parseRegExpr("[\\040]"); + REQUIRE(holds_alternative(re)); + CHECK("\\s" == get(re).symbols.to_string()); // with leading non-zero - re = RegExprParser {}.parse("[\\172]"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("z", get(re).symbols.to_string()); + re = parseRegExpr("[\\172]"); + REQUIRE(holds_alternative(re)); + CHECK("z" == get(re).symbols.to_string()); // invalids - EXPECT_THROW(RegExprParser {}.parse("[\\822]"), RegExprParser::UnexpectedToken); - 
EXPECT_THROW(RegExprParser {}.parse("[\\282]"), RegExprParser::UnexpectedToken); - EXPECT_THROW(RegExprParser {}.parse("[\\228]"), RegExprParser::UnexpectedToken); - EXPECT_THROW(RegExprParser {}.parse("[\\082]"), RegExprParser::UnexpectedToken); - EXPECT_THROW(RegExprParser {}.parse("[\\028]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\822]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\282]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\228]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\082]"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("[\\028]"), RegExprParser::UnexpectedToken); } -TEST(regex_RegExprParser, doubleQuote) +TEST_CASE("regex_RegExprParser.doubleQuote") { // as concatenation character - RegExpr re = RegExprParser {}.parse(R"(\")"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ('"', get(re).value); + RegExpr re = parseRegExpr(R"(\")"); + REQUIRE(holds_alternative(re)); + CHECK('"' == get(re).value); // as character class - re = RegExprParser {}.parse(R"([\"])"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ(R"(")", get(re).symbols.to_string()); + re = parseRegExpr(R"([\"])"); + REQUIRE(holds_alternative(re)); + CHECK(R"(")" == get(re).symbols.to_string()); } -TEST(regex_RegExprParser, dot) +TEST_CASE("regex_RegExprParser.dot") { - RegExpr re = RegExprParser {}.parse("."); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ(".", to_string(re)); + RegExpr re = parseRegExpr("."); + REQUIRE(holds_alternative(re)); + CHECK("." == to_string(re)); } -TEST(regex_RegExprParser, optional) +TEST_CASE("regex_RegExprParser.optional") { - RegExpr re = RegExprParser {}.parse("a?"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("a?", to_string(re)); + RegExpr re = parseRegExpr("a?"); + REQUIRE(holds_alternative(re)); + CHECK("a?" 
== to_string(re)); } -TEST(regex_RegExprParser, bol) +TEST_CASE("regex_RegExprParser.bol") { - RegExpr re = RegExprParser {}.parse("^a"); - ASSERT_TRUE(holds_alternative(re)); + RegExpr re = parseRegExpr("^a"); + REQUIRE(holds_alternative(re)); const ConcatenationExpr& cat = get(re); - ASSERT_TRUE(holds_alternative(*cat.left)); - EXPECT_EQ("^", to_string(*cat.left)); - EXPECT_EQ("a", to_string(*cat.right)); + REQUIRE(holds_alternative(*cat.left)); + CHECK("^" == to_string(*cat.left)); + CHECK("a" == to_string(*cat.right)); } -TEST(regex_RegExprParser, eol) +TEST_CASE("regex_RegExprParser.eol") { - RegExpr re = RegExprParser {}.parse("a$"); - ASSERT_TRUE(holds_alternative(re)); + RegExpr re = parseRegExpr("a$"); + REQUIRE(holds_alternative(re)); const ConcatenationExpr& cat = get(re); - ASSERT_TRUE(holds_alternative(*cat.right)); - EXPECT_EQ("a$", to_string(re)); + REQUIRE(holds_alternative(*cat.right)); + CHECK("a$" == to_string(re)); } -TEST(regex_RegExprParser, eof) +TEST_CASE("regex_RegExprParser.eof") { - RegExpr re = RegExprParser {}.parse("<>"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("<>", to_string(re)); + RegExpr re = parseRegExpr("<>"); + REQUIRE(holds_alternative(re)); + CHECK("<>" == to_string(re)); } -TEST(regex_RegExprParser, alternation) +TEST_CASE("regex_RegExprParser.alternation") { - EXPECT_EQ("a|b", to_string(RegExprParser {}.parse("a|b"))); - EXPECT_EQ("(a|b)c", to_string(RegExprParser {}.parse("(a|b)c"))); - EXPECT_EQ("a(b|c)", to_string(RegExprParser {}.parse("a(b|c)"))); + CHECK("a|b" == to_string(parseRegExpr("a|b"))); + CHECK("(a|b)c" == to_string(parseRegExpr("(a|b)c"))); + CHECK("a(b|c)" == to_string(parseRegExpr("a(b|c)"))); } -TEST(regex_RegExprParser, lookahead) +TEST_CASE("regex_RegExprParser.lookahead") { - RegExpr re = RegExprParser {}.parse("ab/cd"); - ASSERT_TRUE(holds_alternative(re)); - EXPECT_EQ("ab/cd", to_string(re)); - EXPECT_EQ("(a/b)|b", to_string(RegExprParser {}.parse("(a/b)|b"))); - EXPECT_EQ("a|(b/c)", 
to_string(RegExprParser {}.parse("a|(b/c)"))); + RegExpr re = parseRegExpr("ab/cd"); + REQUIRE(holds_alternative(re)); + CHECK("ab/cd" == to_string(re)); + CHECK("(a/b)|b" == to_string(parseRegExpr("(a/b)|b"))); + CHECK("a|(b/c)" == to_string(parseRegExpr("a|(b/c)"))); } -TEST(regex_RegExprParser, closure) +TEST_CASE("regex_RegExprParser.closure") { - RegExpr re = RegExprParser {}.parse("(abc)*"); - ASSERT_TRUE(holds_alternative(re)); + RegExpr re = parseRegExpr("(abc)*"); + REQUIRE(holds_alternative(re)); const ClosureExpr& e = get(re); - EXPECT_EQ(0, e.minimumOccurrences); - EXPECT_EQ(numeric_limits::max(), e.maximumOccurrences); - EXPECT_EQ("(abc)*", to_string(re)); + CHECK(0 == e.minimumOccurrences); + CHECK(numeric_limits::max() == e.maximumOccurrences); + CHECK("(abc)*" == to_string(re)); } -TEST(regex_RegExprParser, positive) +TEST_CASE("regex_RegExprParser.positive") { - auto re = RegExprParser {}.parse("(abc)+"); - ASSERT_TRUE(holds_alternative(re)); + auto re = parseRegExpr("(abc)+"); + REQUIRE(holds_alternative(re)); const ClosureExpr& e = get(re); - EXPECT_EQ(1, e.minimumOccurrences); - EXPECT_EQ(numeric_limits::max(), e.maximumOccurrences); - EXPECT_EQ("(abc)+", to_string(re)); + CHECK(1 == e.minimumOccurrences); + CHECK(numeric_limits::max() == e.maximumOccurrences); + CHECK("(abc)+" == to_string(re)); } -TEST(regex_RegExprParser, closure_range) +TEST_CASE("regex_RegExprParser.closure_range") { - auto re = RegExprParser {}.parse("a{2,4}"); - ASSERT_TRUE(holds_alternative(re)); + auto re = parseRegExpr("a{2,4}"); + REQUIRE(holds_alternative(re)); const ClosureExpr& e = get(re); - EXPECT_EQ(2, e.minimumOccurrences); - EXPECT_EQ(4, e.maximumOccurrences); - EXPECT_EQ("a{2,4}", to_string(re)); + CHECK(2 == e.minimumOccurrences); + CHECK(4 == e.maximumOccurrences); + CHECK("a{2,4}" == to_string(re)); } -TEST(regex_RegExprParser, empty) +TEST_CASE("regex_RegExprParser.empty") { - auto re = RegExprParser {}.parse("(a|)"); - EXPECT_EQ("a|", to_string(re)); // 
grouping '(' & ')' is not preserved as node in the parse tree. + auto re = parseRegExpr("(a|)"); + CHECK("a|" == to_string(re)); // grouping '(' & ')' is not preserved as node in the parse tree. } -TEST(regex_RegExprParser, UnexpectedToken_grouping) +TEST_CASE("regex_RegExprParser.UnexpectedToken_grouping") { - EXPECT_THROW(RegExprParser {}.parse("(a"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("(a"), RegExprParser::UnexpectedToken); } -TEST(regex_RegExprParser, UnexpectedToken_literal) +TEST_CASE("regex_RegExprParser.UnexpectedToken_literal") { - EXPECT_THROW(RegExprParser {}.parse("\"a"), RegExprParser::UnexpectedToken); + CHECK_THROWS_AS(parseRegExpr("\"a"), RegExprParser::UnexpectedToken); } diff --git a/src/regex_dfa/Report.cpp b/src/regex_dfa/Report.cpp index d7a1b5d281..9f2b9b51cd 100644 --- a/src/regex_dfa/Report.cpp +++ b/src/regex_dfa/Report.cpp @@ -5,11 +5,11 @@ // file except in compliance with the License. You may obtain a copy of // the License at: http://opensource.org/licenses/MIT +#include + #include #include -#include - using namespace std; using namespace regex_dfa; @@ -43,7 +43,7 @@ void ConsoleReport::onMessage(Message&& message) // {{{ BufferedReport void BufferedReport::onMessage(Message&& msg) { - messages_.emplace_back(move(msg)); + messages_.emplace_back(std::move(msg)); } void BufferedReport::clear() diff --git a/src/regex_dfa/Report.h b/src/regex_dfa/Report.h index 86efcdf8e2..0fc9bc71e7 100644 --- a/src/regex_dfa/Report.h +++ b/src/regex_dfa/Report.h @@ -7,6 +7,8 @@ #pragma once +#include + #include #include @@ -15,8 +17,6 @@ #include #include -#include - namespace regex_dfa { @@ -196,7 +196,7 @@ struct formatter template constexpr auto format(const regex_dfa::SourceLocation& sloc, FormatContext& ctx) { - return format_to(ctx.out(), "{} ({}-{})", sloc.filename, sloc.offset, sloc.offset + sloc.count); + return fmt::format_to(ctx.out(), "{} ({}-{})", sloc.filename, sloc.offset, sloc.offset + sloc.count); } }; } // 
namespace fmt @@ -217,7 +217,7 @@ struct formatter template constexpr auto format(const Message& v, FormatContext& ctx) { - return format_to(ctx.out(), "{}", v.to_string()); + return fmt::format_to(ctx.out(), "{}", v.to_string()); } }; } // namespace fmt diff --git a/src/regex_dfa/Rule.h b/src/regex_dfa/Rule.h index abbc0d244a..0c97764494 100644 --- a/src/regex_dfa/Rule.h +++ b/src/regex_dfa/Rule.h @@ -29,9 +29,9 @@ struct Rule std::string pattern; std::unique_ptr regexpr = nullptr; - bool isIgnored() const noexcept { return tag == IgnoreTag; } + [[nodiscard]] bool isIgnored() const noexcept { return tag == IgnoreTag; } - Rule clone() const + [[nodiscard]] Rule clone() const { return regexpr ? Rule { line, column, @@ -45,20 +45,20 @@ struct Rule Rule() = default; - Rule(unsigned _line, - unsigned _column, - Tag _tag, - std::vector _conditions, - std::string _name, - std::string _pattern, - std::unique_ptr _regexpr = nullptr): - line { _line }, - column { _column }, - tag { _tag }, - conditions { _conditions }, - name { _name }, - pattern { _pattern }, - regexpr { std::move(_regexpr) } + Rule(unsigned line, + unsigned column, + Tag tag, + std::vector conditions, + std::string name, + std::string pattern, + std::unique_ptr regexpr = nullptr): + line { line }, + column { column }, + tag { tag }, + conditions { std::move(conditions) }, + name { std::move(name) }, + pattern { std::move(pattern) }, + regexpr { std::move(regexpr) } { } @@ -120,18 +120,18 @@ struct formatter { if (!v.conditions.empty()) { - format_to(ctx.out(), "<"); + fmt::format_to(ctx.out(), "<"); for (size_t i = 0; i < v.conditions.size(); ++i) if (i != 0) - format_to(ctx.out(), ", {}", v.conditions[i]); + fmt::format_to(ctx.out(), ", {}", v.conditions[i]); else - format_to(ctx.out(), "{}", v.conditions[i]); - format_to(ctx.out(), ">"); + fmt::format_to(ctx.out(), "{}", v.conditions[i]); + fmt::format_to(ctx.out(), ">"); } if (v.tag == regex_dfa::IgnoreTag) - return format_to(ctx.out(), "{}({}) ::= 
{}", v.name, "ignore", v.pattern); + return fmt::format_to(ctx.out(), "{}({}) ::= {}", v.name, "ignore", v.pattern); else - return format_to(ctx.out(), "{}({}) ::= {}", v.name, v.tag, v.pattern); + return fmt::format_to(ctx.out(), "{}({}) ::= {}", v.name, v.tag, v.pattern); } }; } // namespace fmt diff --git a/src/regex_dfa/RuleParser.cpp b/src/regex_dfa/RuleParser.cpp index 1c95f67175..dda518404b 100644 --- a/src/regex_dfa/RuleParser.cpp +++ b/src/regex_dfa/RuleParser.cpp @@ -20,22 +20,21 @@ using namespace std; namespace regex_dfa { -RuleParser::RuleParser(unique_ptr input, int firstTag): - stream_ { move(input) }, - refRules_ {}, - lastParsedRule_ { nullptr }, - lastParsedRuleIsRef_ { false }, - currentChar_ { 0 }, - line_ { 1 }, - column_ { 0 }, - offset_ { 0 }, - nextTag_ { firstTag } +RuleParser::RuleParser(unique_ptr input, int firstTerminalId): + _stream { std::move(input) }, + _lastParsedRule { nullptr }, + _lastParsedRuleIsRef { false }, + _currentChar { 0 }, + _line { 1 }, + _column { 0 }, + _offset { 0 }, + _nextTag { firstTerminalId } { consumeChar(); } -RuleParser::RuleParser(string input, int firstTag): - RuleParser { make_unique(move(input)), firstTag } +RuleParser::RuleParser(string input, int firstTerminalId): + RuleParser { make_unique(std::move(input)), firstTerminalId } { } @@ -95,18 +94,18 @@ void RuleParser::parseRule(RuleList& rules) // RuleOption ::= ignore consumeSP(); - if (currentChar_ == '|' && lastParsedRule_ != nullptr) + if (_currentChar == '|' && _lastParsedRule != nullptr) { consumeChar(); consumeSP(); const string pattern = parseExpression(); - lastParsedRule_->pattern += '|' + pattern; + _lastParsedRule->pattern += '|' + pattern; return; } // finalize ref-rule by surrounding it with round braces - if (lastParsedRuleIsRef_) - lastParsedRule_->pattern = fmt::format("({})", lastParsedRule_->pattern); + if (_lastParsedRuleIsRef) + _lastParsedRule->pattern = fmt::format("({})", _lastParsedRule->pattern); vector conditions = 
parseRuleConditions(); consumeSP(); @@ -124,11 +123,11 @@ void RuleParser::parseRule(RuleList& rules) if (currentChar() == '\n') consumeChar(); else if (!eof()) - throw UnexpectedChar { line_, column_, currentChar_, '\n' }; + throw UnexpectedChar { _line, _column, _currentChar, '\n' }; } else { - parseBasicRule(rules, move(conditions)); + parseBasicRule(rules, std::move(conditions)); } } @@ -140,16 +139,16 @@ struct TestRuleForName void RuleParser::parseBasicRule(RuleList& rules, vector&& conditions) { - const unsigned int beginLine = line_; - const unsigned int beginColumn = column_; + const unsigned int beginLine = _line; + const unsigned int beginColumn = _column; string token = consumeToken(); bool ignore = false; bool ref = false; - if (currentChar_ == '(') + if (_currentChar == '(') { consumeChar(); - unsigned optionOffset = offset_; + unsigned optionOffset = _offset; string option = consumeToken(); consumeChar(')'); @@ -163,13 +162,13 @@ void RuleParser::parseBasicRule(RuleList& rules, vector&& conditions) consumeSP(); consumeAssoc(); consumeSP(); - const unsigned int line = line_; - const unsigned int column = column_; + const unsigned int line = _line; + const unsigned int column = _column; const string pattern = parseExpression(); if (currentChar() == '\n') consumeChar(); else if (!eof()) - throw UnexpectedChar { line_, column_, currentChar_, '\n' }; + throw UnexpectedChar { _line, _column, _currentChar, '\n' }; const Tag tag = [&] { if (ignore || ref) @@ -177,13 +176,13 @@ void RuleParser::parseBasicRule(RuleList& rules, vector&& conditions) else if (auto i = find_if(rules.begin(), rules.end(), TestRuleForName { token }); i != rules.end()) return i->tag; else - return nextTag_++; + return _nextTag++; }(); if (ref && !conditions.empty()) - throw InvalidRefRuleWithConditions { beginLine, - beginColumn, - Rule { line, column, tag, move(conditions), token, pattern } }; + throw InvalidRefRuleWithConditions { + beginLine, beginColumn, Rule { line, column, tag, 
std::move(conditions), token, pattern } + }; if (conditions.empty()) conditions.emplace_back("INITIAL"); @@ -194,25 +193,25 @@ void RuleParser::parseBasicRule(RuleList& rules, vector&& conditions) { if (auto i = find_if(rules.begin(), rules.end(), TestRuleForName { token }); i != rules.end()) { - throw DuplicateRule { Rule { line, column, tag, move(conditions), token, pattern }, *i }; + throw DuplicateRule { Rule { line, column, tag, std::move(conditions), token, pattern }, *i }; } else { rules.emplace_back(Rule { line, column, tag, conditions, token, pattern }); - lastParsedRule_ = &rules.back(); - lastParsedRuleIsRef_ = false; + _lastParsedRule = &rules.back(); + _lastParsedRuleIsRef = false; } } - else if (auto i = refRules_.find(token); i != refRules_.end()) + else if (auto i = _refRules.find(token); i != _refRules.end()) { - throw DuplicateRule { Rule { line, column, tag, move(conditions), token, pattern }, i->second }; + throw DuplicateRule { Rule { line, column, tag, std::move(conditions), token, pattern }, i->second }; } else { // TODO: throw if !conditions.empty(); - refRules_[token] = { line, column, tag, {}, token, pattern }; - lastParsedRule_ = &refRules_[token]; - lastParsedRuleIsRef_ = true; + _refRules[token] = { line, column, tag, {}, token, pattern }; + _lastParsedRule = &_refRules[token]; + _lastParsedRuleIsRef = true; } } @@ -254,9 +253,9 @@ string RuleParser::parseExpression() size_t i = 0; size_t lastGraph = 0; - while (!eof() && currentChar_ != '\n') + while (!eof() && _currentChar != '\n') { - if (isgraph(currentChar_)) + if (isgraph(_currentChar)) lastGraph = i + 1; i++; sstr << consumeChar(); @@ -264,7 +263,7 @@ string RuleParser::parseExpression() string pattern = sstr.str().substr(0, lastGraph); // skips trailing spaces // replace all occurrences of {ref} - for (const pair& ref: refRules_) + for (const pair& ref: _refRules) { const Rule& rule = ref.second; const string name = fmt::format("{{{}}}", rule.name); @@ -287,13 +286,13 @@ void 
RuleParser::consumeSpace() { for (;;) { - switch (currentChar_) + switch (_currentChar) { case ' ': case '\t': case '\r': consumeChar(); break; case '#': - while (!eof() && currentChar_ != '\n') + while (!eof() && _currentChar != '\n') { consumeChar(); } @@ -305,33 +304,33 @@ void RuleParser::consumeSpace() char RuleParser::currentChar() const noexcept { - return currentChar_; + return _currentChar; } char RuleParser::consumeChar(char ch) { - if (currentChar_ != ch) - throw UnexpectedChar { line_, column_, currentChar_, ch }; + if (_currentChar != ch) + throw UnexpectedChar { _line, _column, _currentChar, ch }; return consumeChar(); } char RuleParser::consumeChar() { - char t = currentChar_; + char t = _currentChar; - currentChar_ = stream_->get(); - if (!stream_->eof()) + _currentChar = _stream->get(); + if (!_stream->eof()) { - offset_++; + _offset++; if (t == '\n') { - line_++; - column_ = 1; + _line++; + _column = 1; } else { - column_++; + _column++; } } @@ -340,32 +339,32 @@ char RuleParser::consumeChar() bool RuleParser::eof() const noexcept { - return currentChar_ < 0 || stream_->eof(); + return std::char_traits::eq(_currentChar, std::char_traits::eof()) || _stream->eof(); } string RuleParser::consumeToken() { stringstream sstr; - if (!isalpha(currentChar_) || currentChar_ == '_') - throw UnexpectedToken { offset_, currentChar_, "Token" }; + if (!isalpha(_currentChar) || _currentChar == '_') + throw UnexpectedToken { _offset, _currentChar, "Token" }; do sstr << consumeChar(); - while (isalnum(currentChar_) || currentChar_ == '_'); + while (isalnum(_currentChar) || _currentChar == '_'); return sstr.str(); } void RuleParser::consumeAnySP() { - while (currentChar_ == ' ' || currentChar_ == '\t' || currentChar_ == '\n') + while (_currentChar == ' ' || _currentChar == '\t' || _currentChar == '\n') consumeChar(); } void RuleParser::consumeSP() { - while (currentChar_ == ' ' || currentChar_ == '\t') + while (_currentChar == ' ' || _currentChar == '\t') 
consumeChar(); } diff --git a/src/regex_dfa/RuleParser.h b/src/regex_dfa/RuleParser.h index 817e945fa9..393e26ec8c 100644 --- a/src/regex_dfa/RuleParser.h +++ b/src/regex_dfa/RuleParser.h @@ -46,41 +46,41 @@ class RuleParser void consumeSP(); void consumeAssoc(); void consumeSpace(); - char currentChar() const noexcept; + [[nodiscard]] char currentChar() const noexcept; char consumeChar(char ch); char consumeChar(); - bool eof() const noexcept; - std::string replaceRefs(const std::string& pattern); + [[nodiscard]] bool eof() const noexcept; + [[nodiscard]] std::string replaceRefs(const std::string& pattern); private: - std::unique_ptr stream_; - std::map refRules_; - Rule* lastParsedRule_; - bool lastParsedRuleIsRef_; - char currentChar_; - unsigned int line_; - unsigned int column_; - unsigned int offset_; - int nextTag_; + std::unique_ptr _stream; + std::map _refRules; + Rule* _lastParsedRule; + bool _lastParsedRuleIsRef; + char _currentChar; + unsigned int _line; + unsigned int _column; + unsigned int _offset; + int _nextTag; }; class RuleParser::InvalidRefRuleWithConditions: public std::runtime_error { public: - InvalidRefRuleWithConditions(unsigned line, unsigned column, Rule&& rule): + InvalidRefRuleWithConditions(unsigned line, unsigned column, Rule rule): std::runtime_error { fmt::format( "{}:{}: Invalid rule \"{}\". 
Reference rules must not be labelled with conditions.", line, column, rule.name) }, - rule_ { std::move(rule) } + _rule { std::move(rule) } { } - const Rule& rule() const noexcept { return rule_; } + [[nodiscard]] Rule const& rule() const noexcept { return _rule; } private: - const Rule rule_; + Rule _rule; }; class RuleParser::DuplicateRule: public std::runtime_error @@ -94,17 +94,17 @@ class RuleParser::DuplicateRule: public std::runtime_error duplicate.name, other.line, other.column) }, - duplicate_ { std::move(duplicate) }, - other_ { other } + _duplicate { std::move(duplicate) }, + _other { other } { } - const Rule& duplicate() const noexcept { return duplicate_; } - const Rule& other() const noexcept { return other_; } + [[nodiscard]] Rule const& duplicate() const noexcept { return _duplicate; } + [[nodiscard]] Rule const& other() const noexcept { return _other; } private: - const Rule duplicate_; - const Rule& other_; + Rule _duplicate; + Rule const& _other; }; class RuleParser::UnexpectedToken: public std::runtime_error @@ -113,20 +113,20 @@ class RuleParser::UnexpectedToken: public std::runtime_error UnexpectedToken(unsigned offset, char actual, std::string expected): std::runtime_error { fmt::format( "{}: Unexpected token {}, expected <{}> instead.", offset, actual, expected) }, - offset_ { offset }, - actual_ { std::move(actual) }, - expected_ { std::move(expected) } + _offset { offset }, + _actual { actual }, + _expected { std::move(expected) } { } - unsigned offset() const noexcept { return offset_; } - char actual() const noexcept { return actual_; } - const std::string& expected() const noexcept { return expected_; } + [[nodiscard]] unsigned offset() const noexcept { return _offset; } + [[nodiscard]] char actual() const noexcept { return _actual; } + [[nodiscard]] const std::string& expected() const noexcept { return _expected; } private: - unsigned offset_; - char actual_; - std::string expected_; + unsigned _offset; + char _actual; + std::string 
_expected; }; class RuleParser::UnexpectedChar: public std::runtime_error @@ -138,32 +138,32 @@ class RuleParser::UnexpectedChar: public std::runtime_error column, quoted(actual), quoted(expected)) }, - line_ { line }, - column_ { column }, - actual_ { actual }, - expected_ { expected } + _line { line }, + _column { column }, + _actual { actual }, + _expected { expected } { } - unsigned int line() const noexcept { return line_; } - unsigned int column() const noexcept { return column_; } - char actual() const noexcept { return actual_; } - char expected() const noexcept { return expected_; } + [[nodiscard]] unsigned int line() const noexcept { return _line; } + [[nodiscard]] unsigned int column() const noexcept { return _column; } + [[nodiscard]] char actual() const noexcept { return _actual; } + [[nodiscard]] char expected() const noexcept { return _expected; } private: static std::string quoted(char ch) { - if (ch < 0) + if (std::char_traits::eq(ch, std::char_traits::eof())) return "<>"; else - return fmt::format("'{}'", ch); + return fmt::format("'{}'", static_cast(ch)); } private: - unsigned int line_; - unsigned int column_; - char actual_; - char expected_; + unsigned int _line; + unsigned int _column; + char _actual; + char _expected; }; class RuleParser::InvalidRuleOption: public std::runtime_error @@ -171,17 +171,17 @@ class RuleParser::InvalidRuleOption: public std::runtime_error public: InvalidRuleOption(unsigned offset, std::string option): std::runtime_error { fmt::format("{}: Invalid rule option \"{}\".", offset, option) }, - offset_ { offset }, - option_ { option } + _offset { offset }, + _option { option } { } - unsigned offset() const noexcept { return offset_; } - const std::string& option() const noexcept { return option_; } + [[nodiscard]] unsigned offset() const noexcept { return _offset; } + [[nodiscard]] const std::string& option() const noexcept { return _option; } private: - unsigned offset_; - std::string option_; + unsigned _offset; + 
std::string _option; }; } // namespace regex_dfa diff --git a/src/regex_dfa/RuleParser_test.cpp b/src/regex_dfa/RuleParser_test.cpp index b41669ee27..aae7fdc58f 100644 --- a/src/regex_dfa/RuleParser_test.cpp +++ b/src/regex_dfa/RuleParser_test.cpp @@ -7,86 +7,86 @@ #include +#include + #include #include -#include - using namespace regex_dfa; -TEST(regex_RuleParser, simple) +TEST_CASE("regex_RuleParser.simple") { RuleParser rp { "main ::= blah\n" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ("blah", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK("blah" == rules[0].pattern); } -TEST(regex_RuleParser, whitespaces) +TEST_CASE("regex_RuleParser.whitespaces") { RuleParser rp { "main ::= a\n\t| b | c\n" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ("a|b | c", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK("a|b | c" == rules[0].pattern); } -TEST(regex_RuleParser, rule_at_eof) +TEST_CASE("regex_RuleParser.rule_at_eof") { RuleParser rp { "main ::= blah" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ("blah", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK("blah" == rules[0].pattern); } -TEST(regex_RuleParser, simple_trailing_spaces) +TEST_CASE("regex_RuleParser.simple_trailing_spaces") { RuleParser rp { "main ::= blah\n " }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ("blah", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK("blah" == rules[0].pattern); } -TEST(regex_RuleParser, quotedPattern) +TEST_CASE("regex_RuleParser.quotedPattern") { RuleParser rp { "main ::= \"blah\"" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ("\"blah\"", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK("\"blah\"" == rules[0].pattern); } -TEST(regex_RuleParser, multiQuotedPattern) +TEST_CASE("regex_RuleParser.multiQuotedPattern") { RuleParser rp { R"(rule ::= "b"la"h")" }; RuleList rules = 
rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ(R"("b"la"h")", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK(R"("b"la"h")" == rules[0].pattern); } -TEST(regex_RuleParser, doubleQuote) +TEST_CASE("regex_RuleParser.doubleQuote") { RuleParser rp { R"(rule ::= \")" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ(R"(\")", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK(R"(\")" == rules[0].pattern); } -TEST(regex_RuleParser, spaceRule) +TEST_CASE("regex_RuleParser.spaceRule") { RuleParser rp { R"(rule ::= [ \n\t]+)" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ(R"([ \n\t]+)", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK(R"([ \n\t]+)" == rules[0].pattern); } -TEST(regex_RuleParser, stringRule) +TEST_CASE("regex_RuleParser.stringRule") { RuleParser rp { R"(rule ::= \"[^\"]*\")" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ(R"(\"[^\"]*\")", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK(R"(\"[^\"]*\")" == rules[0].pattern); } -TEST(regex_RuleParser, ref) +TEST_CASE("regex_RuleParser.ref") { RuleParser rp { R"( Foo(ref) ::= foo @@ -94,21 +94,21 @@ TEST(regex_RuleParser, ref) FooBar ::= {Foo}_{Bar} )" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(1, rules.size()); - EXPECT_EQ("(foo)_(bar)", rules[0].pattern); + REQUIRE(1 == rules.size()); + CHECK("(foo)_(bar)" == rules[0].pattern); } -TEST(regex_RuleParser, ref_duplicated) +TEST_CASE("regex_RuleParser.ref_duplicated") { RuleParser rp { R"( Foo(ref) ::= foo Foo(ref) ::= bar FooBar ::= {Foo} )" }; - EXPECT_THROW(rp.parseRules(), RuleParser::DuplicateRule); + CHECK_THROWS_AS(rp.parseRules(), RuleParser::DuplicateRule); } -TEST(regex_RuleParser, multiline_alt) +TEST_CASE("regex_RuleParser.multiline_alt") { RuleParser rp { R"( Rule1 ::= foo @@ -119,12 +119,12 @@ TEST(regex_RuleParser, multiline_alt) | {Rule2} )" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(2, rules.size()); - 
EXPECT_EQ("foo|bar", rules[0].pattern); - EXPECT_EQ("(fnord|hard)|(fnord|hard)", rules[1].pattern); + REQUIRE(2 == rules.size()); + CHECK("foo|bar" == rules[0].pattern); + CHECK("(fnord|hard)|(fnord|hard)" == rules[1].pattern); } -TEST(regex_RuleParser, condition1) +TEST_CASE("regex_RuleParser.condition1") { RuleParser rp { R"( Rule1 ::= foo @@ -132,18 +132,18 @@ TEST(regex_RuleParser, condition1) )" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(2, rules.size()); - EXPECT_EQ("foo", rules[0].pattern); - EXPECT_EQ("bar", rules[1].pattern); + REQUIRE(2 == rules.size()); + CHECK("foo" == rules[0].pattern); + CHECK("bar" == rules[1].pattern); - ASSERT_EQ(1, rules[0].conditions.size()); - EXPECT_EQ("foo", rules[0].conditions[0]); + REQUIRE(1 == rules[0].conditions.size()); + CHECK("foo" == rules[0].conditions[0]); - ASSERT_EQ(1, rules[1].conditions.size()); - EXPECT_EQ("bar", rules[1].conditions[0]); + REQUIRE(1 == rules[1].conditions.size()); + CHECK("bar" == rules[1].conditions[0]); } -TEST(regex_RuleParser, condition2) +TEST_CASE("regex_RuleParser.condition2") { RuleParser rp { R"( Rule1 ::= foo @@ -151,20 +151,20 @@ TEST(regex_RuleParser, condition2) )" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(2, rules.size()); - EXPECT_EQ("foo", rules[0].pattern); - EXPECT_EQ("bar", rules[1].pattern); + REQUIRE(2 == rules.size()); + CHECK("foo" == rules[0].pattern); + CHECK("bar" == rules[1].pattern); - ASSERT_EQ(1, rules[0].conditions.size()); - EXPECT_EQ("foo", rules[0].conditions[0]); + REQUIRE(1 == rules[0].conditions.size()); + CHECK("foo" == rules[0].conditions[0]); - ASSERT_EQ(2, rules[1].conditions.size()); + REQUIRE(2 == rules[1].conditions.size()); // in sorted order - EXPECT_EQ("bar", rules[1].conditions[0]); - EXPECT_EQ("foo", rules[1].conditions[1]); + CHECK("bar" == rules[1].conditions[0]); + CHECK("foo" == rules[1].conditions[1]); } -TEST(regex_RuleParser, conditional_star) +TEST_CASE("regex_RuleParser.conditional_star") { RuleParser rp { R"( Zero ::= zero 
@@ -174,28 +174,28 @@ TEST(regex_RuleParser, conditional_star) )" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(4, rules.size()); + REQUIRE(4 == rules.size()); - EXPECT_EQ("zero", rules[0].pattern); - ASSERT_EQ(1, rules[0].conditions.size()); - EXPECT_EQ("INITIAL", rules[0].conditions[0]); + CHECK("zero" == rules[0].pattern); + REQUIRE(1 == rules[0].conditions.size()); + CHECK("INITIAL" == rules[0].conditions[0]); - EXPECT_EQ("one", rules[1].pattern); - ASSERT_EQ(1, rules[1].conditions.size()); - EXPECT_EQ("one", rules[1].conditions[0]); + CHECK("one" == rules[1].pattern); + REQUIRE(1 == rules[1].conditions.size()); + CHECK("one" == rules[1].conditions[0]); - EXPECT_EQ("two", rules[2].pattern); - ASSERT_EQ(1, rules[2].conditions.size()); - EXPECT_EQ("two", rules[2].conditions[0]); + CHECK("two" == rules[2].pattern); + REQUIRE(1 == rules[2].conditions.size()); + CHECK("two" == rules[2].conditions[0]); - EXPECT_EQ("tri", rules[3].pattern); - ASSERT_EQ(3, rules[3].conditions.size()); - EXPECT_EQ("INITIAL", rules[3].conditions[0]); - EXPECT_EQ("one", rules[3].conditions[1]); - EXPECT_EQ("two", rules[3].conditions[2]); + CHECK("tri" == rules[3].pattern); + REQUIRE(3 == rules[3].conditions.size()); + CHECK("INITIAL" == rules[3].conditions[0]); + CHECK("one" == rules[3].conditions[1]); + CHECK("two" == rules[3].conditions[2]); } -TEST(regex_RuleParser, grouped_conditions) +TEST_CASE("regex_RuleParser.grouped_conditions") { RuleParser rp { R"( Rule1 ::= foo @@ -205,43 +205,43 @@ TEST(regex_RuleParser, grouped_conditions) )" }; RuleList rules = rp.parseRules(); - ASSERT_EQ(2, rules.size()); - EXPECT_EQ("foo", rules[0].pattern); - EXPECT_EQ("bar", rules[1].pattern); + REQUIRE(2 == rules.size()); + CHECK("foo" == rules[0].pattern); + CHECK("bar" == rules[1].pattern); - ASSERT_EQ(1, rules[1].conditions.size()); - EXPECT_EQ("blah", rules[1].conditions[0]); + REQUIRE(1 == rules[1].conditions.size()); + CHECK("blah" == rules[1].conditions[0]); } -TEST(regex_RuleParser, 
InvalidRefRuleWithConditions) +TEST_CASE("regex_RuleParser.InvalidRefRuleWithConditions") { - ASSERT_THROW(RuleParser { "main(ref) ::= blah\n" }.parseRules(), - RuleParser::InvalidRefRuleWithConditions); + CHECK_THROWS_AS(RuleParser { "main(ref) ::= blah\n" }.parseRules(), + RuleParser::InvalidRefRuleWithConditions); } -TEST(regex_RuleParser, InvalidRuleOption) +TEST_CASE("regex_RuleParser.InvalidRuleOption") { - ASSERT_THROW(RuleParser { "A(invalid) ::= a\n" }.parseRules(), RuleParser::InvalidRuleOption); + CHECK_THROWS_AS(RuleParser { "A(invalid) ::= a\n" }.parseRules(), RuleParser::InvalidRuleOption); } -TEST(regex_RuleParser, DuplicateRule) +TEST_CASE("regex_RuleParser.DuplicateRule") { RuleParser rp { R"( foo ::= abc foo ::= def )" }; - ASSERT_THROW(rp.parseRules(), RuleParser::DuplicateRule); + CHECK_THROWS_AS(rp.parseRules(), RuleParser::DuplicateRule); } -TEST(regex_RuleParser, UnexpectedChar) +TEST_CASE("regex_RuleParser.UnexpectedChar") { - ASSERT_THROW(RuleParser { "A :=" }.parseRules(), RuleParser::UnexpectedChar); - ASSERT_THROW(RuleParser { " A ::= a" }.parseRules(), RuleParser::UnexpectedToken); - ASSERT_THROW(RuleParser { "<> A ::= a" }.parseRules(), RuleParser::UnexpectedToken); - ASSERT_THROW(RuleParser { " ::= a" }.parseRules(), RuleParser::UnexpectedToken); + CHECK_THROWS_AS(RuleParser { " A ::= a" }.parseRules(), RuleParser::UnexpectedToken); + CHECK_THROWS_AS(RuleParser { "<> A ::= a" }.parseRules(), RuleParser::UnexpectedToken); + CHECK_THROWS_AS(RuleParser { " ::= a" }.parseRules(), RuleParser::UnexpectedToken); } diff --git a/src/regex_dfa/SourceLocation.cpp b/src/regex_dfa/SourceLocation.cpp index 67b6c986da..c9e6cd8267 100644 --- a/src/regex_dfa/SourceLocation.cpp +++ b/src/regex_dfa/SourceLocation.cpp @@ -5,10 +5,10 @@ // file except in compliance with the License. 
You may obtain a copy of // the License at: http://opensource.org/licenses/MIT -#include - #include +#include + using namespace std; namespace regex_dfa diff --git a/src/regex_dfa/State.h b/src/regex_dfa/State.h index 82e0162336..975dd8851e 100644 --- a/src/regex_dfa/State.h +++ b/src/regex_dfa/State.h @@ -29,7 +29,7 @@ using AcceptMap = std::map; /** * Returns a human readable string of @p S, such as "{n0, n1, n2}". */ -std::string to_string(const StateIdVec& S, std::string_view stateLabelPrefix = "n"); +[[nodiscard]] std::string to_string(const StateIdVec& S, std::string_view stateLabelPrefix = "n"); } // namespace regex_dfa @@ -47,7 +47,7 @@ struct formatter template constexpr auto format(const regex_dfa::StateIdVec& v, FormatContext& ctx) { - return format_to(ctx.out(), "{}", regex_dfa::to_string(v)); + return fmt::format_to(ctx.out(), "{}", regex_dfa::to_string(v)); } }; } // namespace fmt diff --git a/src/regex_dfa/State_test.cpp b/src/regex_dfa/State_test.cpp index c2e34dad67..4cb9074f1e 100644 --- a/src/regex_dfa/State_test.cpp +++ b/src/regex_dfa/State_test.cpp @@ -9,10 +9,10 @@ #include -#include +#include -TEST(regex_State, to_string) +TEST_CASE("regex_State.to_string") { regex_dfa::StateIdVec v { 1, 2, 3 }; - EXPECT_EQ("{n1, n2, n3}", fmt::format("{}", v)); + CHECK("{n1, n2, n3}" == fmt::format("{}", v)); } diff --git a/src/regex_dfa/Symbols.h b/src/regex_dfa/Symbols.h index d76a2307ff..bb8a5488e1 100644 --- a/src/regex_dfa/Symbols.h +++ b/src/regex_dfa/Symbols.h @@ -24,22 +24,24 @@ namespace regex_dfa //! 
input symbol as used for transitions using Symbol = int; -std::string prettySymbol(Symbol input); -std::string prettyCharRange(Symbol ymin, Symbol ymax); -std::string groupCharacterClassRanges(const std::vector& syms); -std::string groupCharacterClassRanges(std::vector syms); +[[nodiscard]] std::string prettySymbol(Symbol input); +[[nodiscard]] std::string prettyCharRange(Symbol ymin, Symbol ymax); +[[nodiscard]] std::string groupCharacterClassRanges(const std::vector& syms); +[[nodiscard]] std::string groupCharacterClassRanges(std::vector syms); // new way of wrapping up Symbols struct Symbols { + // NOLINTBEGIN(readability-identifier-naming) constexpr static Symbol Epsilon = -1; constexpr static Symbol Error = -2; constexpr static Symbol BeginOfLine = -3; constexpr static Symbol EndOfLine = -4; constexpr static Symbol EndOfFile = -5; constexpr static Symbol Character(char ch) { return Symbol(ch); } + // NOLINTEND(readability-identifier-naming) - constexpr static bool isSpecial(Symbol s) + [[nodiscard]] constexpr static bool isSpecial(Symbol s) { switch (s) { @@ -72,8 +74,8 @@ class SymbolSet std::for_each(list.begin(), list.end(), [this](Symbol s) { insert(s); }); } - bool empty() const noexcept { return size_ == 0; } - size_t size() const noexcept { return size_; } + [[nodiscard]] bool empty() const noexcept { return size_ == 0; } + [[nodiscard]] size_t size() const noexcept { return size_; } //! Transforms into the complement set. void complement(); @@ -99,28 +101,28 @@ class SymbolSet } //! @returns whether or not given Symbol @p s is in this set. - bool contains(Symbol s) const + [[nodiscard]] bool contains(Symbol s) const { assert(s >= 0 && s <= 255 && "Only ASCII allowed."); return set_[(size_t) s]; } //! Tests whether or not this SymbolSet can be represented as dot (.), i.e. all but \n. - bool isDot() const noexcept; + [[nodiscard]] bool isDot() const noexcept; //! 
@returns a human readable representation of this set - std::string to_string() const; + [[nodiscard]] std::string to_string() const; bool operator==(const SymbolSet& rhs) const noexcept { return hash_ == rhs.hash_ && set_ == rhs.set_; } bool operator!=(const SymbolSet& rhs) const noexcept { return !(*this == rhs); } - class const_iterator - { // {{{ + class const_iterator // NOLINT(readability-identifier-naming) + { // {{{ public: const_iterator(std::vector::const_iterator beg, std::vector::const_iterator end, size_t n): - beg_ { std::move(beg) }, end_ { std::move(end) }, offset_ { n } + beg_ { beg }, end_ { end }, offset_ { n } { while (beg_ != end_ && !*beg_) { @@ -160,10 +162,10 @@ class SymbolSet size_t offset_; }; // }}} - const_iterator begin() const { return const_iterator(set_.begin(), set_.end(), 0); } - const_iterator end() const { return const_iterator(set_.end(), set_.end(), set_.size()); } + [[nodiscard]] const_iterator begin() const { return const_iterator(set_.begin(), set_.end(), 0); } + [[nodiscard]] const_iterator end() const { return const_iterator(set_.end(), set_.end(), set_.size()); } - size_t hash() const noexcept { return hash_; } + [[nodiscard]] size_t hash() const noexcept { return hash_; } private: void recalculateHash(); @@ -191,7 +193,7 @@ struct formatter template constexpr auto format(const regex_dfa::SymbolSet& v, FormatContext& ctx) { - return format_to(ctx.out(), "{}", v.to_string()); + return fmt::format_to(ctx.out(), "{}", v.to_string()); } }; } // namespace fmt diff --git a/src/regex_dfa/Symbols_test.cpp b/src/regex_dfa/Symbols_test.cpp index 4cfb69e112..3374865b1b 100644 --- a/src/regex_dfa/Symbols_test.cpp +++ b/src/regex_dfa/Symbols_test.cpp @@ -7,86 +7,86 @@ #include -#include +#include using namespace std; using regex_dfa::SymbolSet; -TEST(regex_SymbolSet, s0) +TEST_CASE("regex_SymbolSet.s0") { SymbolSet s0; - ASSERT_EQ(0, s0.size()); - ASSERT_TRUE(s0.empty()); + REQUIRE(0 == s0.size()); // 
NOLINT(readability-container-size-empty) + REQUIRE(s0.empty()); } -TEST(regex_SymbolSet, s1) +TEST_CASE("regex_SymbolSet.s1") { SymbolSet s1; // first add s1.insert('a'); - ASSERT_EQ(1, s1.size()); - ASSERT_FALSE(s1.empty()); + CHECK(1 == s1.size()); + REQUIRE_FALSE(s1.empty()); // overwrite s1.insert('a'); - ASSERT_EQ(1, s1.size()); - ASSERT_FALSE(s1.empty()); + CHECK(1 == s1.size()); + REQUIRE_FALSE(s1.empty()); } -TEST(regex_SymbolSet, initializer_list) +TEST_CASE("regex_SymbolSet.initializer_list") { SymbolSet a { 'a' }; - EXPECT_EQ(1, a.size()); - EXPECT_TRUE(a.contains('a')); + CHECK(1 == a.size()); + CHECK(a.contains('a')); SymbolSet s2 { 'a', 'b', 'b', 'c' }; - EXPECT_EQ(3, s2.size()); - EXPECT_EQ("abc", s2.to_string()); + CHECK(3 == s2.size()); + CHECK("abc" == s2.to_string()); } -TEST(regex_SymbolSet, dot) +TEST_CASE("regex_SymbolSet.dot") { SymbolSet dot(SymbolSet::Dot); - EXPECT_FALSE(dot.contains('\n')); - EXPECT_TRUE(dot.contains('\0')); - EXPECT_TRUE(dot.contains(' ')); - EXPECT_TRUE(dot.isDot()); - EXPECT_EQ(".", dot.to_string()); + REQUIRE(!dot.contains('\n')); + CHECK(dot.contains('\0')); + CHECK(dot.contains(' ')); + CHECK(dot.isDot()); + CHECK("." == dot.to_string()); } -TEST(regex_SymbolSet, complement) +TEST_CASE("regex_SymbolSet.complement") { SymbolSet s; s.insert('\n'); - EXPECT_EQ("\\n", s.to_string()); + CHECK("\\n" == s.to_string()); s.complement(); - EXPECT_EQ(".", s.to_string()); + CHECK("." 
== s.to_string()); } -TEST(regex_SymbolSet, range) +TEST_CASE("regex_SymbolSet.range") { SymbolSet r; r.insert(make_pair('a', 'f')); - EXPECT_EQ(6, r.size()); - EXPECT_EQ("a-f", r.to_string()); + CHECK(6 == r.size()); + CHECK("a-f" == r.to_string()); r.insert(make_pair('0', '9')); - EXPECT_EQ(16, r.size()); - EXPECT_EQ("0-9a-f", r.to_string()); + CHECK(16 == r.size()); + CHECK("0-9a-f" == r.to_string()); } -TEST(regex_SymbolSet, fmt_format) +TEST_CASE("regex_SymbolSet.fmt_format") { SymbolSet s; s.insert(make_pair('0', '9')); s.insert(make_pair('a', 'f')); - EXPECT_EQ("0-9a-f", fmt::format("{}", s)); + CHECK("0-9a-f" == fmt::format("{}", s)); } -TEST(regex_SymbolSet, hash_map) +TEST_CASE("regex_SymbolSet.hash_map") { SymbolSet s0; SymbolSet s1 { 'a' }; @@ -97,16 +97,16 @@ TEST(regex_SymbolSet, hash_map) map[s1] = 1; map[s2] = 2; - EXPECT_EQ(0, map[s0]); - EXPECT_EQ(1, map[s1]); - EXPECT_EQ(2, map[s2]); + CHECK(0 == map[s0]); + CHECK(1 == map[s1]); + CHECK(2 == map[s2]); } -TEST(regex_SymbolSet, compare) +TEST_CASE("regex_SymbolSet.compare") { SymbolSet s1 { 'a', 'b' }; SymbolSet s2 { 'a', 'b' }; SymbolSet s3 { 'a', 'c' }; - ASSERT_TRUE(s1 == s2); - ASSERT_TRUE(s1 != s3); + REQUIRE(s1 == s2); + REQUIRE(s1 != s3); } diff --git a/src/regex_dfa/TransitionMap-inl.h b/src/regex_dfa/TransitionMap-inl.h index 36ba30d013..df949ecab7 100644 --- a/src/regex_dfa/TransitionMap-inl.h +++ b/src/regex_dfa/TransitionMap-inl.h @@ -37,10 +37,10 @@ inline std::vector TransitionMap::states() const return v; } -inline std::map TransitionMap::map(StateId s) const +inline std::map TransitionMap::map(StateId inputState) const { std::map m; - if (auto mapping = mapping_.find(s); mapping != mapping_.end()) + if (auto mapping = mapping_.find(inputState); mapping != mapping_.end()) for (const auto& i: mapping->second) m[i.first] = i.second; return m; diff --git a/src/regex_dfa/TransitionMap.h b/src/regex_dfa/TransitionMap.h index d05229f64b..5b0693748b 100644 --- 
a/src/regex_dfa/TransitionMap.h +++ b/src/regex_dfa/TransitionMap.h @@ -45,17 +45,17 @@ class TransitionMap * * @returns the transition from (currentState, charCat) to (nextState) or ErrorState if not defined. */ - StateId apply(StateId currentState, Symbol charCat) const; + [[nodiscard]] StateId apply(StateId currentState, Symbol charCat) const; /** * Retrieves a list of all available states. */ - std::vector states() const; + [[nodiscard]] std::vector states() const; /** * Retrieves a map of all transitions from given state @p inputState. */ - std::map map(StateId inputState) const; + [[nodiscard]] std::map map(StateId inputState) const; private: Container mapping_; diff --git a/src/regex_dfa/regex_dfa_test.cpp b/src/regex_dfa/regex_dfa_test.cpp new file mode 100644 index 0000000000..45742d7742 --- /dev/null +++ b/src/regex_dfa/regex_dfa_test.cpp @@ -0,0 +1,25 @@ +/** + * This file is part of the "libterminal" project + * Copyright (c) 2019-2020 Christian Parpart + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#define CATCH_CONFIG_RUNNER +#include + +int main(int argc, char const* argv[]) +{ + int const result = Catch::Session().run(argc, argv); + + // avoid closing extern console to close on VScode/windows + // system("pause"); + + return result; +} diff --git a/src/regex_dfa/util/AnsiColor.h b/src/regex_dfa/util/AnsiColor.h deleted file mode 100644 index d7b27c99bc..0000000000 --- a/src/regex_dfa/util/AnsiColor.h +++ /dev/null @@ -1,153 +0,0 @@ -// This file is part of the "x0" project, http://github.com/christianparpart/x0> -// (c) 2009-2019 Christian Parpart -// -// Licensed under the MIT License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of -// the License at: http://opensource.org/licenses/MIT -#pragma once - -#include - -namespace AnsiColor { - -enum Code : unsigned { - Clear = 0, - Reset = Clear, - Bold = 0x0001, // 1 - Dark = 0x0002, // 2 - Undef1 = 0x0004, - Underline = 0x0008, // 4 - Blink = 0x0010, // 5 - Undef2 = 0x0020, - Reverse = 0x0040, // 7 - Concealed = 0x0080, // 8 - AllFlags = 0x00FF, - Black = 0x0100, - Red = 0x0200, - Green = 0x0300, - Yellow = 0x0400, - Blue = 0x0500, - Magenta = 0x0600, - Cyan = 0x0700, - White = 0x0800, - AnyFg = 0x0F00, - OnBlack = 0x1000, - OnRed = 0x2000, - OnGreen = 0x3000, - OnYellow = 0x4000, - OnBlue = 0x5000, - OnMagenta = 0x6000, - OnCyan = 0x7000, - OnWhite = 0x8000, - AnyBg = 0xF000 -}; - -/// Combines two ANSI escape sequences into one Code. -constexpr inline Code operator|(Code a, Code b) -{ - return Code{unsigned(a) | unsigned(b)}; -} - -/** - * Counts the number of ANSI escape sequences in @p codes. 
- */ -constexpr unsigned count(Code codes) -{ - if (codes == Clear) - return 1; - - unsigned i = 0; - - if (codes & AllFlags) - for (int k = 0; k < 8; ++k) - if (codes & (1 << k)) - ++i; - - if (codes & AnyFg) - ++i; - - if (codes & AnyBg) - ++i; - - return i; -} - -/** - * Retrieves the number of bytes required to store the ANSI escape sequences of @p codes - * without prefix/suffix notation. - */ -constexpr unsigned capacity(Code codes) -{ - if (codes == Clear) - return 1; - - unsigned i = 0; - - if (codes & AllFlags) - for (int k = 0; k < 8; ++k) - if (codes & (1 << k)) - ++i; - - if (codes & AnyFg) - i += 2; - - if (codes & AnyBg) - i += 2; - - return i + (count(codes) - 1); -} - -/// Constructs a sequence of ANSI codes for the colors in this @p codes. -template -constexpr auto codes() -{ - std::array result{}; - - size_t n = 0; // n'th escape sequence being iterate through - size_t i = 0; // i'th byte in output array - - result[i++] = '\x1B'; - result[i++] = '['; - - if constexpr (value != 0) - { - if (value & AllFlags) - { - for (int k = 0; k < 8; ++k) - { - if (value & (1 << k)) - { - if (n++) - result[i++] = ';'; - result[i++] = k + '1'; - } - } - } - - if (value & AnyFg) - { - if (n++) - result[i++] = ';'; - unsigned const val = ((value >> 8) & 0x0F) + 29; // 36 -> {'3', '6'} - result[i++] = (val / 10) + '0'; - result[i++] = (val % 10) + '0'; - } - - if (value & AnyBg) - { - if (n++) - result[i++] = ';'; - unsigned const val = ((value >> 12) & 0x0F) + 39; - result[i++] = (val / 10) + '0'; - result[i++] = (val % 10) + '0'; - } - } - else - result[i++] = '0'; // reset/clear - - result[i++] = 'm'; - - return result; -} - -} // namespace AnsiColor diff --git a/src/regex_dfa/util/Flags.cpp b/src/regex_dfa/util/Flags.cpp deleted file mode 100644 index 5082fce7df..0000000000 --- a/src/regex_dfa/util/Flags.cpp +++ /dev/null @@ -1,578 +0,0 @@ -// This file is part of the "x0" project, http://github.com/christianparpart/x0> -// (c) 2009-2018 Christian Parpart -// 
-// Licensed under the MIT License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of -// the License at: http://opensource.org/licenses/MIT - -#include "Flags.h" - -#include -#include -#include - -#include "AnsiColor.h" - -using namespace std; - -namespace regex_dfa::util -{ - -auto static constexpr clearColor = AnsiColor::codes(); -auto static constexpr optionColor = AnsiColor::codes(); -auto static constexpr valueColor = AnsiColor::codes(); -auto static constexpr headerColor = AnsiColor::codes(); - -// {{{ Flags::Error -Flags::Error::Error(ErrorCode code, string arg): - runtime_error { FlagsErrorCategory::get().message(static_cast(code)) + ": " + arg }, - code_ { code }, - arg_ { move(arg) } -{ -} -// }}} - -// {{{ Flag -Flags::Flag::Flag(const string& opt, const string& val, FlagStyle fs, FlagType ft): - type_(ft), style_(fs), name_(opt), value_(val) -{ -} -// }}} - -Flags::Flags(): - flagDefs_ {}, - parametersEnabled_ { false }, - parametersPlaceholder_ {}, - parametersHelpText_ {}, - set_ {}, - raw_ {} -{ -} - -void Flags::set(const Flag& flag) -{ - set_[flag.name()] = make_pair(flag.type(), flag.value()); -} - -void Flags::set(const string& opt, const string& val, FlagStyle fs, FlagType ft) -{ - set(Flag { opt, val, fs, ft }); -} - -bool Flags::isSet(const string& flag) const -{ - return set_.find(flag) != set_.end(); -} - -string Flags::asString(const string& flag) const -{ - auto i = set_.find(flag); - if (i == set_.end()) - throw Error { ErrorCode::NotFound, flag }; - - return i->second.second; -} - -string Flags::getString(const string& flag) const -{ - auto i = set_.find(flag); - if (i == set_.end()) - throw Error { ErrorCode::NotFound, flag }; - - if (i->second.first != FlagType::String) - throw Error { ErrorCode::TypeMismatch, flag }; - - return i->second.second; -} - -long int Flags::getNumber(const string& flag) const -{ - auto i = set_.find(flag); - if (i == set_.end()) - throw Error { 
ErrorCode::NotFound, flag }; - - if (i->second.first != FlagType::Number) - throw Error { ErrorCode::TypeMismatch, flag }; - - return stoi(i->second.second); -} - -float Flags::getFloat(const string& flag) const -{ - auto i = set_.find(flag); - if (i == set_.end()) - throw Error { ErrorCode::NotFound, flag }; - - if (i->second.first != FlagType::Float) - throw Error { ErrorCode::TypeMismatch, flag }; - - return stof(i->second.second); -} - -bool Flags::getBool(const string& flag) const -{ - auto i = set_.find(flag); - if (i == set_.end()) - return false; - - return i->second.second == "true"; -} - -const vector& Flags::parameters() const -{ - return raw_; -} - -void Flags::setParameters(const vector& v) -{ - raw_ = v; -} - -string Flags::to_s() const -{ - stringstream sstr; - - int i = 0; - for (const pair& flag: set_) - { - if (i) - sstr << ' '; - - i++; - - switch (flag.second.first) - { - case FlagType::Bool: - if (flag.second.second == "true") - sstr << "--" << flag.first; - else - sstr << "--" << flag.first << "=false"; - break; - case FlagType::String: sstr << "--" << flag.first << "=\"" << flag.second.second << "\""; break; - default: sstr << "--" << flag.first << "=" << flag.second.second; break; - } - } - - return sstr.str(); -} - -Flags& Flags::define(const string& longOpt, - char shortOpt, - bool required, - FlagType type, - const string& valuePlaceholder, - const string& helpText, - const optional& defaultValue, - function callback) -{ - FlagDef fd; - fd.type = type; - fd.longOption = longOpt; - fd.shortOption = shortOpt; - fd.required = required; - fd.valuePlaceholder = valuePlaceholder; - fd.helpText = helpText; - fd.defaultValue = defaultValue; - fd.callback = callback; - - flagDefs_.emplace_back(fd); - - return *this; -} - -Flags& Flags::defineString(const string& longOpt, - char shortOpt, - const string& valuePlaceholder, - const string& helpText, - optional defaultValue, - function callback) -{ - return define( - longOpt, shortOpt, false, 
FlagType::String, valuePlaceholder, helpText, defaultValue, callback); -} - -Flags& Flags::defineNumber(const string& longOpt, - char shortOpt, - const string& valuePlaceholder, - const string& helpText, - optional defaultValue, - function callback) -{ - return define(longOpt, - shortOpt, - false, - FlagType::Number, - valuePlaceholder, - helpText, - defaultValue.has_value() ? make_optional(to_string(*defaultValue)) : nullopt, - [=](const string& value) { - if (callback) - { - callback(stoi(value)); - } - }); -} - -Flags& Flags::defineFloat(const string& longOpt, - char shortOpt, - const string& valuePlaceholder, - const string& helpText, - optional defaultValue, - function callback) -{ - return define(longOpt, - shortOpt, - false, - FlagType::Float, - valuePlaceholder, - helpText, - defaultValue.has_value() ? make_optional(to_string(*defaultValue)) : nullopt, - [=](const string& value) { - if (callback) - { - callback(stof(value)); - } - }); -} - -Flags& Flags::defineBool(const string& longOpt, - char shortOpt, - const string& helpText, - function callback) -{ - return define( - longOpt, shortOpt, false, FlagType::Bool, "", helpText, nullopt, [=](const string& value) { - if (callback) - { - callback(value == "true"); - } - }); -} - -Flags& Flags::enableParameters(const string& valuePlaceholder, const string& helpText) -{ - parametersEnabled_ = true; - parametersPlaceholder_ = valuePlaceholder; - parametersHelpText_ = helpText; - - return *this; -} - -const Flags::FlagDef* Flags::findDef(const string& longOption) const -{ - for (const auto& flag: flagDefs_) - if (flag.longOption == longOption) - return &flag; - - return nullptr; -} - -const Flags::FlagDef* Flags::findDef(char shortOption) const -{ - for (const auto& flag: flagDefs_) - if (flag.shortOption == shortOption) - return &flag; - - return nullptr; -} - -// ----------------------------------------------------------------------------- -void Flags::parse(int argc, const char* argv[]) -{ - vector args; - for 
(int i = 1; i < argc; ++i) - args.push_back(argv[i]); - - parse(args); -} - -error_code Flags::tryParse(const vector& args) -{ - try - { - parse(args); - } - catch (const Error& parseError) - { - return parseError.code(); - } - return error_code(); -} - -void Flags::parse(const vector& args) -{ - auto invokeCallback = [&](const FlagDef* fd, FlagStyle style, const string& value) { - if (fd) - { - set(fd->longOption, value, style, fd->type); - if (fd->callback) - { - fd->callback(value); - } - } - }; - - enum class ParsingState - { - Options, - Parameters, - }; - - vector params; - ParsingState pstate = ParsingState::Options; - size_t i = 0; - - while (i < args.size()) - { - string arg = args[i]; - i++; - if (pstate == ParsingState::Parameters) - params.push_back(arg); - else if (arg == "--") - { - if (parametersEnabled_) - pstate = ParsingState::Parameters; - else - throw Error { ErrorCode::UnknownOption, arg }; - } - else if (arg.size() > 2 && arg[0] == '-' && arg[1] == '-') - { - // longopt - string name = arg.substr(2); - size_t eq = name.find('='); - if (eq != name.npos) - { // --name=value - string value = name.substr(eq + 1); - name = name.substr(0, eq); - const FlagDef* fd = findDef(name); - if (fd == nullptr) - throw Error { ErrorCode::UnknownOption, arg }; - else - invokeCallback(fd, FlagStyle::LongWithValue, value); - } - else - { // --name [VALUE] - const FlagDef* fd = findDef(name); - if (fd == nullptr) - throw Error { ErrorCode::UnknownOption, arg }; - else if (fd->type == FlagType::Bool) - // --name - invokeCallback(fd, FlagStyle::LongSwitch, "true"); - else - { - // --name VALUE - if (i >= args.size()) - throw Error { ErrorCode::MissingOption, arg }; - - string value = args[i]; - i++; - - invokeCallback(fd, FlagStyle::LongWithValue, value); - } - } - } - else if (arg.size() > 1 && arg[0] == '-') - { - // shortopt - arg = arg.substr(1); - while (!arg.empty()) - { - const FlagDef* fd = findDef(arg[0]); - if (fd == nullptr) // option not found - throw 
Error { ErrorCode::UnknownOption, "-" + arg.substr(0, 1) }; - else if (fd->type == FlagType::Bool) - { - invokeCallback(fd, FlagStyle::ShortSwitch, "true"); - arg = arg.substr(1); - } - else if (arg.size() > 1) // -fVALUE - { - string value = arg.substr(1); - invokeCallback(fd, FlagStyle::ShortSwitch, value); - arg.clear(); - } - else - { - // -f VALUE - string name = fd->longOption; - - if (i >= args.size()) - { - char option[3] = { '-', fd->shortOption, '\0' }; - throw Error { ErrorCode::MissingOptionValue, option }; - } - - arg.clear(); - string value = args[i]; - i++; - - if (!value.empty() && value[0] == '-') - { - char option[3] = { '-', fd->shortOption, '\0' }; - throw Error { ErrorCode::MissingOptionValue, option }; - } - - invokeCallback(fd, FlagStyle::ShortSwitch, value); - } - } - } - else if (parametersEnabled_) - params.push_back(arg); - else - throw Error { ErrorCode::UnknownOption, arg }; - } - - setParameters(params); - - // fill any missing default flags - for (const FlagDef& fd: flagDefs_) - { - if (fd.defaultValue.has_value()) - { - if (!isSet(fd.longOption)) - invokeCallback(&fd, FlagStyle::LongWithValue, fd.defaultValue.value()); - } - else if (fd.type == FlagType::Bool) - { - if (!isSet(fd.longOption)) - invokeCallback(&fd, FlagStyle::LongWithValue, "false"); - } - } -} - -// ----------------------------------------------------------------------------- - -string Flags::helpText(string_view const& header, size_t width, size_t helpTextOffset) const -{ - stringstream sstr; - - if (!header.empty()) - sstr << headerColor.data() << header << clearColor.data(); - - if (parametersEnabled_ || !flagDefs_.empty()) - sstr << headerColor.data() << "Options:\n" << clearColor.data(); - - for (const FlagDef& fd: flagDefs_) - sstr << fd.makeHelpText(width, helpTextOffset); - - if (parametersEnabled_) - { - sstr << endl; - - const streampos p = sstr.tellp(); - const size_t column = static_cast(sstr.tellp() - p); - - sstr << " [--] " << valueColor.data() << 
parametersPlaceholder_ << clearColor.data(); - if (column < helpTextOffset) - sstr << setw(helpTextOffset - column) << ' '; - else - sstr << endl << setw(helpTextOffset) << ' '; - - sstr << parametersHelpText_ << endl; - } - - return sstr.str(); -} - -static string wordWrap(const string& text, size_t currentWidth, size_t width, size_t indent) -{ - stringstream sstr; - - size_t i = 0; - while (i < text.size()) - { - if (currentWidth >= width) - { - sstr << endl << setw(indent) << ' '; - currentWidth = 0; - } - - sstr << text[i]; - currentWidth++; - i++; - } - - return sstr.str(); -} - -error_code make_error_code(Flags::ErrorCode errc) -{ - return error_code(static_cast(errc), FlagsErrorCategory::get()); -} - -// {{{ Flags::FlagDef -string Flags::FlagDef::makeHelpText(size_t width, size_t helpTextOffset) const -{ - stringstream sstr; - - sstr << " "; - - // short option - if (shortOption) - sstr << optionColor.data() << "-" << shortOption << clearColor.data() << ", "; - else - sstr << " "; - - // long option - sstr << optionColor.data() << "--" << longOption; - - // value placeholder - if (type != FlagType::Bool) - { - sstr << "=" << valueColor.data(); - if (!valuePlaceholder.empty()) - sstr << valuePlaceholder; - else - sstr << "VALUE"; - } - sstr << clearColor.data(); - - // spacer - size_t column = static_cast(sstr.tellp()); - if (column < helpTextOffset) - sstr << setw(helpTextOffset - sstr.tellp()) << ' '; - else - { - sstr << endl << setw(helpTextOffset) << ' '; - column = helpTextOffset; - } - - // help output with default value hint. 
- if (type != FlagType::Bool && defaultValue.has_value()) - sstr << wordWrap(helpText + " [" + *defaultValue + "]", column, width, helpTextOffset); - else - sstr << wordWrap(helpText, column, width, helpTextOffset); - - sstr << endl; - - return sstr.str(); -} -// }}} - -// {{{ FlagsErrorCategory -FlagsErrorCategory& FlagsErrorCategory::get() -{ - static FlagsErrorCategory cat; - return cat; -} - -const char* FlagsErrorCategory::name() const noexcept -{ - return "Flags"; -} - -string FlagsErrorCategory::message(int ec) const -{ - switch (static_cast(ec)) - { - case Flags::ErrorCode::TypeMismatch: return "Type Mismatch"; - case Flags::ErrorCode::UnknownOption: return "Unknown Option"; - case Flags::ErrorCode::MissingOption: return "Missing Option"; - case Flags::ErrorCode::MissingOptionValue: return "Missing Option Value"; - case Flags::ErrorCode::NotFound: return "Flag Not Found"; - default: return ""; - } -} -// }}} - -} // namespace regex_dfa::util diff --git a/src/regex_dfa/util/Flags.h b/src/regex_dfa/util/Flags.h deleted file mode 100644 index 611eafbf84..0000000000 --- a/src/regex_dfa/util/Flags.h +++ /dev/null @@ -1,171 +0,0 @@ -// This file is part of the "x0" project, // http://github.com/christianparpart/x0> -// (c) 2009-2018 Christian Parpart -// -// Licensed under the MIT License (the "License"); you may not use this -// file except in compliance with the License. 
You may obtain a copy of -// the License at: http://opensource.org/licenses/MIT -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace regex_dfa::util { - -class Flags { - public: - enum class FlagType { - String, - Number, - Float, - Bool, - }; - - // FlagPassingStyle - enum FlagStyle { ShortSwitch, LongSwitch, ShortWithValue, LongWithValue, UnnamedParameter }; - - enum class ErrorCode { - TypeMismatch, - UnknownOption, - MissingOption, - MissingOptionValue, - NotFound, - }; - - class Error : public std::runtime_error { - public: - Error(ErrorCode code, std::string arg); - - ErrorCode code() const noexcept { return code_; } - const std::string& arg() const noexcept { return arg_; } - - private: - ErrorCode code_; - std::string arg_; - }; - - struct FlagDef; - class Flag; - - Flags(); - - std::string getString(const std::string& flag) const; - std::string asString(const std::string& flag) const; - long int getNumber(const std::string& flag) const; - float getFloat(const std::string& flag) const; - bool getBool(const std::string& flag) const; - - const std::vector& parameters() const; - void setParameters(const std::vector& v); - - size_t size() const { return set_.size(); } - - std::string to_s() const; - - void set(const Flag& flag); - void set(const std::string& opt, const std::string& val, FlagStyle fs, FlagType ft); - bool isSet(const std::string& flag) const; - - Flags& defineString(const std::string& longOpt, char shortOpt, const std::string& valuePlaceholder, - const std::string& helpText, std::optional defaultValue = std::nullopt, - std::function callback = nullptr); - - Flags& defineNumber(const std::string& longOpt, char shortOpt, const std::string& valuePlaceholder, - const std::string& helpText, std::optional defaultValue = std::nullopt, - std::function callback = nullptr); - - Flags& defineFloat(const std::string& longOpt, char shortOpt, const std::string& valuePlaceholder, - const 
std::string& helpText, std::optional defaultValue = std::nullopt, - std::function callback = nullptr); - - Flags& defineBool(const std::string& longOpt, char shortOpt, const std::string& helpText, - std::function callback = nullptr); - - Flags& enableParameters(const std::string& valuePlaceholder, const std::string& helpText); - - std::string helpText(std::string_view const& header = "") const { return helpText(header, 78, 30); } - std::string helpText(std::string_view const& header, size_t width, size_t helpTextOffset) const; - - const FlagDef* findDef(const std::string& longOption) const; - const FlagDef* findDef(char shortOption) const; - - void parse(int argc, const char* argv[]); - void parse(const std::vector& args); - - // Attempts to parse given arguments and returns an error code in case of parsing errors instead - // of throwing. - std::error_code tryParse(const std::vector& args); - - private: - Flags& define(const std::string& longOpt, char shortOpt, bool required, FlagType type, - const std::string& helpText, const std::string& valuePlaceholder, - const std::optional& defaultValue, - std::function callback); - - private: - std::list flagDefs_; - bool parametersEnabled_; // non-option parameters enabled? 
- std::string parametersPlaceholder_; - std::string parametersHelpText_; - - typedef std::pair FlagValue; - std::unordered_map set_; - std::vector raw_; -}; - -struct Flags::FlagDef { - FlagType type; - std::string longOption; - char shortOption; - bool required; - std::string valuePlaceholder; - std::string helpText; - std::optional defaultValue; - std::function callback; - - std::string makeHelpText(size_t width, size_t helpTextOffset) const; -}; - -class Flags::Flag { - public: - Flag(const std::string& opt, const std::string& val, FlagStyle fs, FlagType ft); - - explicit Flag(char shortOpt); - Flag(char shortOpt, const std::string& val); - Flag(const std::string& longOpt); - Flag(const std::string& longOpt, const std::string& val); - - FlagType type() const { return type_; } - const std::string& name() const { return name_; } - const std::string& value() const { return value_; } - - private: - FlagType type_; - FlagStyle style_; - std::string name_; - std::string value_; -}; - -class FlagsErrorCategory : public std::error_category { - public: - static FlagsErrorCategory& get(); - - const char* name() const noexcept override; - std::string message(int ec) const override; -}; - -std::error_code make_error_code(Flags::ErrorCode errc); - -} // namespace regex_dfa::util - -namespace std { -template <> -struct is_error_code_enum : public std::true_type { -}; -} // namespace std diff --git a/src/regex_dfa/util/IntVector.h b/src/regex_dfa/util/IntVector.h deleted file mode 100644 index 8c15c407e3..0000000000 --- a/src/regex_dfa/util/IntVector.h +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -/** - * Encapsulates std::vector with speed improvements. 
- * - */ -template -class IntVector { - public: - using value_type = T; - using vector = std::vector; - using iterator = Vector::iterator; - using const_iterator = Vector::const_iterator; - - IntVector() : vector_{}, hash_{2166136261llu} {} - - void clear() { - vector_.clear(); - hash_ = 2166136261llu; - } - - void push_back(T v) { - vector_.push_back(v); - - hash_ ^= v; - hash_ *= 16777619llu; - } - - bool operator==(const IntVector& rhs) const noexcept { - return hash_ == rhs.hash_ && vector_ == rhs.vector_; - } - - bool operator!=(const IntVector& rhs) const noexcept { - return !(*this == rhs); - } - - private: - Vector vector_; - unsigned hash_; -}; diff --git a/src/regex_dfa/util/UnboxedRange.h b/src/regex_dfa/util/UnboxedRange.h deleted file mode 100644 index 69133c3af2..0000000000 --- a/src/regex_dfa/util/UnboxedRange.h +++ /dev/null @@ -1,94 +0,0 @@ -// This file is part of the "x0" project, http://github.com/christianparpart/x0> -// (c) 2009-2018 Christian Parpart -// -// Licensed under the MIT License (the "License"); you may not use this -// file except in compliance with the License. 
You may obtain a copy of -// the License at: http://opensource.org/licenses/MIT -#pragma once - -#include - -namespace regex_dfa::util { - -template -class UnboxedRange { - public: - using BoxedContainer = T; - using BoxedIterator = typename BoxedContainer::iterator; - using element_type = typename BoxedContainer::value_type::element_type; - - class iterator { // {{{ - public: - typedef typename BoxedContainer::iterator::difference_type difference_type; - typedef typename BoxedContainer::iterator::value_type::element_type value_type; - typedef typename BoxedContainer::iterator::value_type::element_type* pointer; - typedef typename BoxedContainer::iterator::value_type::element_type& reference; - typedef typename BoxedContainer::iterator::iterator_category iterator_category; - - explicit iterator(BoxedIterator boxed) : it_(boxed) {} - - const element_type& operator->() const { return **it_; } - element_type& operator->() { return **it_; } - - const element_type* operator*() const { return (*it_).get(); } - element_type* operator*() { return (*it_).get(); } - - iterator& operator++() - { - ++it_; - return *this; - } - iterator& operator++(int) - { - ++it_; - return *this; - } - - bool operator==(const iterator& other) const { return it_ == other.it_; } - bool operator!=(const iterator& other) const { return it_ != other.it_; } - - private: - BoxedIterator it_; - }; // }}} - - UnboxedRange(BoxedIterator begin, BoxedIterator end) : begin_(begin), end_(end) {} - explicit UnboxedRange(BoxedContainer& c) : begin_(c.begin()), end_(c.end()) {} - explicit UnboxedRange(const BoxedContainer& c) : UnboxedRange{const_cast(c)} {} - - iterator begin() const { return begin_; } - iterator end() const { return end_; } - iterator cbegin() const { return begin_; } - iterator cend() const { return end_; } - size_t size() const { return std::distance(begin_, end_); } - - private: - iterator begin_; - iterator end_; -}; - -/** - * Unboxes boxed element types in containers. 
- * - * Good examples are: - * - * \code - * std::vector> numbers; - * // ... - * for (int number: unbox(numbers)) { - * // ... juse use number here, instead of number.get() or *number. - * }; - * \endcode - */ -template -UnboxedRange unbox(BoxedContainer& boxedContainer) -{ - return UnboxedRange(boxedContainer); -} - -template -UnboxedRange unbox(const BoxedContainer& boxedContainer) -{ - return UnboxedRange(boxedContainer); -} - -} // namespace regex_dfa::util diff --git a/src/regex_dfa/util/iterator-detail.h b/src/regex_dfa/util/iterator-detail.h index 6f96cc5bd2..948beabf06 100644 --- a/src/regex_dfa/util/iterator-detail.h +++ b/src/regex_dfa/util/iterator-detail.h @@ -9,161 +9,169 @@ #include #include -namespace regex_dfa::util::detail { +namespace regex_dfa::util::detail +{ template -struct reversed { - const Container container; +struct reversed +{ + const Container container; - auto begin() { return container.crbegin(); } - auto end() { return container.crend(); } + auto begin() { return container.crbegin(); } + auto end() { return container.crend(); } }; template -struct indexed { - Container& container; - - struct iterator { - typename Container::iterator iter; - std::size_t index = 0; - - iterator& operator++() - { - ++iter; - ++index; - return *this; - } - - iterator& operator++(int) - { - ++*this; - return *this; - } - - auto operator*() const { return std::make_pair(index, *iter); } - - bool operator==(const iterator& rhs) const noexcept { return iter == rhs.iter; } - bool operator!=(const iterator& rhs) const noexcept { return iter != rhs.iter; } - }; - - struct const_iterator { - typename Container::const_iterator iter; - std::size_t index = 0; - - const_iterator& operator++() - { - ++iter; - ++index; - return *this; - } - - const_iterator& operator++(int) - { - ++*this; - return *this; - } - - auto operator*() const { return std::make_pair(index, *iter); } - - bool operator==(const const_iterator& rhs) const noexcept { return iter == rhs.iter; } - 
bool operator!=(const const_iterator& rhs) const noexcept { return iter != rhs.iter; } - }; - - auto begin() const - { - if constexpr (std::is_const::value) - return const_iterator{container.cbegin()}; - else - return iterator{container.begin()}; - } - - auto end() const - { - if constexpr (std::is_const::value) - return const_iterator{container.cend()}; - else - return iterator{container.end()}; - } +struct indexed +{ + Container& container; + + struct iterator + { + typename Container::iterator iter; + std::size_t index = 0; + + iterator& operator++() + { + ++iter; + ++index; + return *this; + } + + iterator& operator++(int) + { + ++*this; + return *this; + } + + auto operator*() const { return std::make_pair(index, *iter); } + + bool operator==(const iterator& rhs) const noexcept { return iter == rhs.iter; } + bool operator!=(const iterator& rhs) const noexcept { return iter != rhs.iter; } + }; + + struct const_iterator + { + typename Container::const_iterator iter; + std::size_t index = 0; + + const_iterator& operator++() + { + ++iter; + ++index; + return *this; + } + + const_iterator& operator++(int) + { + ++*this; + return *this; + } + + auto operator*() const { return std::make_pair(index, *iter); } + + bool operator==(const const_iterator& rhs) const noexcept { return iter == rhs.iter; } + bool operator!=(const const_iterator& rhs) const noexcept { return iter != rhs.iter; } + }; + + auto begin() const + { + if constexpr (std::is_const::value) + return const_iterator { container.cbegin() }; + else + return iterator { container.begin() }; + } + + auto end() const + { + if constexpr (std::is_const::value) + return const_iterator { container.cend() }; + else + return iterator { container.end() }; + } }; template -struct filter { - Container& container; - Lambda proc; +struct filter +{ + Container& container; + Lambda proc; - struct iterator { + struct iterator + { using iterator_category = std::forward_iterator_tag; using value_type = typename 
Container::value_type; using difference_type = long; using pointer = value_type*; using reference = value_type&; - typename Container::iterator i; - typename Container::iterator e; - Lambda filter; - - auto operator*() const { return *i; } - - iterator& operator++() - { - ++i; - while (i != e && !filter(*i)) - ++i; - return *this; - } - - iterator& operator++(int) { return ++*this; } - - bool operator==(const iterator& rhs) const noexcept { return i == rhs.i; } - bool operator!=(const iterator& rhs) const noexcept { return !(*this == rhs); } - }; - - struct const_iterator { - typename Container::const_iterator i; - typename Container::const_iterator e; - Lambda filter; - - auto operator*() const { return *i; } - - const_iterator& operator++() - { - ++i; - while (i != e && !filter(*i)) - ++i; - return *this; - } - - const_iterator& operator++(int) { return ++*this; } - - bool operator==(const const_iterator& rhs) const noexcept { return i == rhs.i; } - bool operator!=(const const_iterator& rhs) const noexcept { return !(*this == rhs); } - }; - - auto begin() const - { - if constexpr (std::is_const::value) - { - auto i = const_iterator{std::cbegin(container), std::cend(container), proc}; - while (i != end() && !proc(*i)) - ++i; - return i; - } - else - { - auto i = iterator{std::begin(container), std::end(container), proc}; - while (i != end() && !proc(*i)) - ++i; - return i; - } - } - - auto end() const - { - if constexpr (std::is_const::value) - return const_iterator{std::cend(container), std::cend(container), proc}; - else - return iterator{std::end(container), std::end(container), proc}; - } + typename Container::iterator i; + typename Container::iterator e; + Lambda filter; + + auto operator*() const { return *i; } + + iterator& operator++() + { + ++i; + while (i != e && !filter(*i)) + ++i; + return *this; + } + + iterator& operator++(int) { return ++*this; } + + bool operator==(const iterator& rhs) const noexcept { return i == rhs.i; } + bool operator!=(const 
iterator& rhs) const noexcept { return !(*this == rhs); } + }; + + struct const_iterator + { + typename Container::const_iterator i; + typename Container::const_iterator e; + Lambda filter; + + auto operator*() const { return *i; } + + const_iterator& operator++() + { + ++i; + while (i != e && !filter(*i)) + ++i; + return *this; + } + + const_iterator& operator++(int) { return ++*this; } + + bool operator==(const const_iterator& rhs) const noexcept { return i == rhs.i; } + bool operator!=(const const_iterator& rhs) const noexcept { return !(*this == rhs); } + }; + + auto begin() const + { + if constexpr (std::is_const::value) + { + auto i = const_iterator { std::cbegin(container), std::cend(container), proc }; + while (i != end() && !proc(*i)) + ++i; + return i; + } + else + { + auto i = iterator { std::begin(container), std::end(container), proc }; + while (i != end() && !proc(*i)) + ++i; + return i; + } + } + + auto end() const + { + if constexpr (std::is_const::value) + return const_iterator { std::cend(container), std::cend(container), proc }; + else + return iterator { std::end(container), std::end(container), proc }; + } }; -} // namespace regex_dfa::util::detail +} // namespace regex_dfa::util::detail diff --git a/src/regex_dfa/util/iterator.h b/src/regex_dfa/util/iterator.h index 60240ba12a..81c95838d9 100644 --- a/src/regex_dfa/util/iterator.h +++ b/src/regex_dfa/util/iterator.h @@ -7,7 +7,8 @@ #pragma once -#include +#include + #include #include #include @@ -15,71 +16,73 @@ #include #include -namespace regex_dfa::util { +namespace regex_dfa::util +{ template inline auto reversed(Container&& c) { - if constexpr (std::is_reference::value) - return detail::reversed{std::forward(c)}; - else - return detail::reversed{std::forward(c)}; + if constexpr (std::is_reference::value) + return detail::reversed { std::forward(c) }; + else + return detail::reversed { std::forward(c) }; } template inline auto indexed(const Container& c) { - return typename 
std::add_const>::type{c}; + return typename std::add_const>::type { c }; } template inline auto indexed(Container& c) { - return detail::indexed{c}; + return detail::indexed { c }; } template -inline auto translate(const Container& container, Lambda mapfn) { - using namespace std; - using T = decltype(mapfn(*begin(container))); +inline auto translate(const Container& container, Lambda mapfn) +{ + using namespace std; + using T = decltype(mapfn(*begin(container))); - vector out; - out.reserve(distance(begin(container), end(container))); - transform(begin(container), end(container), back_inserter(out), move(mapfn)); + vector out; + out.reserve(distance(begin(container), end(container))); + transform(begin(container), end(container), back_inserter(out), std::move(mapfn)); - return out; + return out; } template inline std::string join(const Container& container, const std::string& separator = ", ") { - std::stringstream out; + std::stringstream out; - for (const auto&& [i, v] : indexed(container)) - if (i) - out << separator << v; - else - out << v; + for (const auto&& [i, v]: indexed(container)) + if (i) + out << separator << v; + else + out << v; - return out.str(); + return out.str(); } template inline auto filter(std::initializer_list&& c, Lambda proc) { - return typename std::add_const, Lambda>>::type{c, proc}; + return typename std::add_const, Lambda>>::type { c, proc }; } template inline auto filter(const Container& c, Lambda proc) { - return typename std::add_const>::type{c, proc}; + return typename std::add_const>::type { c, proc }; } template inline auto filter(Container& c, Lambda proc) { - return detail::filter{c, proc}; + return detail::filter { c, proc }; } /** @@ -87,20 +90,20 @@ inline auto filter(Container& c, Lambda proc) * * @returns the iterator representing the last item satisfying @p test or @p end if none found. 
*/ -template +template auto find_last(const Container& container, Test test) -> decltype(std::cbegin(container)) { - auto begin = std::cbegin(container); - auto end = std::cend(container); + auto begin = std::cbegin(container); + auto end = std::cend(container); - for (auto i = std::prev(end); i != begin; --i) - if (test(*i)) - return i; + for (auto i = std::prev(end); i != begin; --i) + if (test(*i)) + return i; - if (test(*begin)) - return begin; - else - return end; + if (test(*begin)) + return begin; + else + return end; } -} // namespace regex_dfa::util +} // namespace regex_dfa::util diff --git a/src/regex_dfa/util/iterator_test.cpp b/src/regex_dfa/util/iterator_test.cpp index c64f67e6f0..aa41e5a5dd 100644 --- a/src/regex_dfa/util/iterator_test.cpp +++ b/src/regex_dfa/util/iterator_test.cpp @@ -5,8 +5,11 @@ // file except in compliance with the License. You may obtain a copy of // the License at: http://opensource.org/licenses/MIT -#include -#include +#include + +#include + +#include #include #include @@ -16,107 +19,107 @@ using namespace std; using namespace regex_dfa::util; -TEST(util_iterator_reversed, empty) +TEST_CASE("util_iterator_reversed.empty") { const vector v; auto x = reversed(v); auto i = begin(x); - ASSERT_TRUE(i == end(x)); + REQUIRE(i == end(x)); } -TEST(util_iterator_reversed, one) +TEST_CASE("util_iterator_reversed.one") { const vector v { 1 }; auto x = reversed(v); auto i = begin(x); - ASSERT_EQ(1, *i); + REQUIRE(1 == *i); i++; - ASSERT_TRUE(i == end(x)); + REQUIRE(i == end(x)); } -TEST(util_iterator_reversed, many) +TEST_CASE("util_iterator_reversed.many") { const vector v { 1, 2, 3 }; auto x = reversed(v); auto i = begin(x); - ASSERT_EQ(3, *i); + REQUIRE(3 == *i); i++; - ASSERT_EQ(2, *i); + REQUIRE(2 == *i); i++; - ASSERT_EQ(1, *i); + REQUIRE(1 == *i); i++; - ASSERT_TRUE(i == end(x)); + REQUIRE(i == end(x)); } -TEST(util_iterator_indexed, many_const) +TEST_CASE("util_iterator_indexed.many_const") { const vector v { 10, 20, 30 }; const 
auto x = indexed(v); static_assert(is_const::value); auto i = begin(x); - ASSERT_EQ(0, (*i).first); - ASSERT_EQ(10, (*i).second); + REQUIRE(0 == (*i).first); + REQUIRE(10 == (*i).second); i++; - ASSERT_EQ(1, (*i).first); - ASSERT_EQ(20, (*i).second); + REQUIRE(1 == (*i).first); + REQUIRE(20 == (*i).second); i++; - ASSERT_EQ(2, (*i).first); - ASSERT_EQ(30, (*i).second); + REQUIRE(2 == (*i).first); + REQUIRE(30 == (*i).second); i++; - ASSERT_TRUE(i == end(x)); + REQUIRE(i == end(x)); } -TEST(util_iterator_indexed, many) +TEST_CASE("util_iterator_indexed.many") { vector v { "zero", "one", "two" }; auto x = indexed(v); auto i = begin(x); - ASSERT_EQ(0, (*i).first); - ASSERT_EQ("zero", (*i).second); + REQUIRE(0 == (*i).first); + REQUIRE("zero" == (*i).second); i++; - ASSERT_EQ(1, (*i).first); - ASSERT_EQ("one", (*i).second); + REQUIRE(1 == (*i).first); + REQUIRE("one" == (*i).second); i++; - ASSERT_EQ(2, (*i).first); - ASSERT_EQ("two", (*i).second); + REQUIRE(2 == (*i).first); + REQUIRE("two" == (*i).second); i++; - ASSERT_TRUE(i == end(x)); + REQUIRE(i == end(x)); } -TEST(util_iterator_indexed, range_based_for_loop) +TEST_CASE("util_iterator_indexed.range_based_for_loop") { - log("const:"); + INFO("const:"); const vector v1 { 10, 20, 30 }; for (const auto&& [index, value]: indexed(v1)) - logf("index {}, value {}", index, value); + INFO(fmt::format("index {}, value {}", index, value)); - log("non-const:"); + INFO("non-const:"); vector v2 { 10, 20, 30 }; for (const auto&& [index, value]: indexed(v2)) - logf("index {}, value {}", index, value); + INFO(fmt::format("index {}, value {}", index, value)); } -TEST(util_iterator_filter, for_range) +TEST_CASE("util_iterator_filter.for_range") { const vector nums = { 1, 2, 3, 4 }; vector odds; for (const int i: filter(nums, [](int x) { return x % 2 != 0; })) odds.push_back(i); - ASSERT_EQ(2, odds.size()); - EXPECT_EQ(1, odds[0]); - EXPECT_EQ(3, odds[1]); + REQUIRE(2 == odds.size()); + REQUIRE(1 == odds[0]); + CHECK(3 == odds[1]); 
} -TEST(util_iterator_filter, count_proc_invocations) +TEST_CASE("util_iterator_filter.count_proc_invocations") { static const array numbers = { 1, 2, 3, 4 }; int count = 0; @@ -126,54 +129,54 @@ TEST(util_iterator_filter, count_proc_invocations) }; const auto f = filter(numbers, counter); for_each(begin(f), end(f), [](int) {}); - ASSERT_EQ(4, count); + REQUIRE(4 == count); } -TEST(util_iterator_filter, for_range_initializer_list) +TEST_CASE("util_iterator_filter.for_range_initializer_list") { static const array numbers = { 1, 2, 3, 4 }; vector odds; auto f_odd = [&](int x) { - logf("f_odd: x={0}", x); + INFO(fmt::format("f_odd: x={0}", x)); return x % 2 != 0; }; for (const int i: filter(numbers, f_odd)) odds.push_back(i); - ASSERT_EQ(2, odds.size()); - EXPECT_EQ(1, odds[0]); - EXPECT_EQ(3, odds[1]); + REQUIRE(2 == odds.size()); + CHECK(1 == odds[0]); + CHECK(3 == odds[1]); } -TEST(util_iterator_translate, vector) +TEST_CASE("util_iterator_translate.vector") { const vector in { 1, 2, 3, 4 }; const vector out = translate(in, [](auto i) -> int { return int(i * 2); }); for (const auto&& [i, v]: indexed(out)) - logf("out[{}] = {}", i, v); + INFO(fmt::format("out[{}] = {}", i, v)); - ASSERT_EQ(4, out.size()); + REQUIRE(4 == out.size()); - EXPECT_EQ(2, out[0]); - EXPECT_EQ(4, out[1]); - EXPECT_EQ(6, out[2]); - EXPECT_EQ(8, out[3]); + CHECK(2 == out[0]); + CHECK(4 == out[1]); + CHECK(6 == out[2]); + CHECK(8 == out[3]); } -TEST(util_iterator_translate, chain_translate_join) +TEST_CASE("util_iterator_translate.chain_translate_join") { const vector in { 1, 2, 3, 4 }; const string out { join(translate(in, [](int i) -> string { return to_string(i); }), ", ") }; - ASSERT_EQ("1, 2, 3, 4", out); + REQUIRE("1, 2, 3, 4" == out); } -TEST(util_iterator, find_last) +TEST_CASE("util_iterator.find_last") { const vector v { 1, 2, 3, 4 }; const auto i = find_last(v, [](int i) { return i % 2 != 0; }); // find last odd value -> 3 - ASSERT_TRUE(i != end(v)); - ASSERT_EQ(3, *i); + REQUIRE(i 
!= end(v)); + REQUIRE(3 == *i); } diff --git a/src/regex_dfa/util/literals.h b/src/regex_dfa/util/literals.h index 427822539e..9a1f9bc698 100644 --- a/src/regex_dfa/util/literals.h +++ b/src/regex_dfa/util/literals.h @@ -11,7 +11,8 @@ #include #include -namespace regex_dfa::util::literals { +namespace regex_dfa::util::literals +{ /** * Strips a multiline string's indentation prefix. @@ -27,47 +28,48 @@ namespace regex_dfa::util::literals { * * This prints three lines: @c "line one\nline two\nline three\n" */ -inline std::string operator""_multiline(const char* text, size_t size) +inline std::string operator""_multiline(const char* text, size_t /*size*/) { - if (!*text) - return {}; + if (!*text) + return {}; - enum class State { - LineData, - SkipUntilPrefix, - }; + enum class State + { + LineData, + SkipUntilPrefix, + }; - constexpr char LF = '\n'; - State state = State::LineData; - std::stringstream sstr; - char sep = *text++; + constexpr char LF = '\n'; + State state = State::LineData; + std::stringstream sstr; + char sep = *text++; - while (*text) - { - switch (state) - { - case State::LineData: - if (*text == LF) - { - state = State::SkipUntilPrefix; - sstr << *text++; - } - else - sstr << *text++; - break; - case State::SkipUntilPrefix: - if (*text == sep) - { - state = State::LineData; - text++; - } - else - text++; - break; - } - } + while (*text) + { + switch (state) + { + case State::LineData: + if (*text == LF) + { + state = State::SkipUntilPrefix; + sstr << *text++; + } + else + sstr << *text++; + break; + case State::SkipUntilPrefix: + if (*text == sep) + { + state = State::LineData; + text++; + } + else + text++; + break; + } + } - return sstr.str(); + return sstr.str(); } -} // namespace regex_dfa::util::literals +} // namespace regex_dfa::util::literals diff --git a/src/regex_dfa/util/overloaded.h b/src/regex_dfa/util/overloaded.h deleted file mode 100644 index 733d201c05..0000000000 --- a/src/regex_dfa/util/overloaded.h +++ /dev/null @@ -1,21 +0,0 
@@ -// This file is part of the "klex" project, http://github.com/christianparpart/klex> -// (c) 2018 Christian Parpart -// -// Licensed under the MIT License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of -// the License at: http://opensource.org/licenses/MIT - -#pragma once - -// This is a nice helper for conviniently using std::visit() with an arbitrary list of lambdas as -// overload for pattern matching the variant's input type - -template -struct overloaded : Ts... -{ - using Ts::operator()...; -}; - -template -overloaded(Ts...) -> overloaded; - diff --git a/src/regex_dfa/util/testing.cpp b/src/regex_dfa/util/testing.cpp deleted file mode 100644 index b8ecaecdaa..0000000000 --- a/src/regex_dfa/util/testing.cpp +++ /dev/null @@ -1,610 +0,0 @@ -// This file is part of the "x0" project, http://github.com/christianparpart/x0> -// (c) 2009-2018 Christian Parpart -// -// Licensed under the MIT License (the "License"); you may not use this -// file except in compliance with the License. 
You may obtain a copy of -// the License at: http://opensource.org/licenses/MIT - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -#if defined(_WIN32) || defined(_WIN64) - #include -#else - #include -#endif - -using namespace std; - -namespace regex_dfa::util::testing -{ - -auto static constexpr colorsReset = AnsiColor::codes(); -auto static constexpr colorsTestCaseHeader = AnsiColor::codes(); -auto static constexpr colorsError = AnsiColor::codes(); -auto static constexpr colorsOk = AnsiColor::codes(); -auto static constexpr colorsLog = AnsiColor::codes(); - -int main(int argc, const char* argv[]) -{ - return UnitTest::instance()->main(argc, argv); -} - -bool beginsWith(const string& str, const string_view& prefix) -{ - if (str.length() < prefix.length()) - { - return false; - } - - return string_view(&str[0], prefix.length()) == prefix; -} - -// ############################################################################ - -class BailOutException -{ - public: - BailOutException() {} -}; - -// ############################################################################ - -Environment::~Environment() -{ -} - -void Environment::SetUp() -{ -} - -void Environment::TearDown() -{ -} - -// ############################################################################ - -Test::~Test() -{ -} - -void Test::SetUp() -{ -} - -void Test::TearDown() -{ -} - -void Test::log(const string& message) -{ - UnitTest::instance()->log(message); -} - -void Test::reportUnhandledException(const exception& e) -{ - UnitTest::instance()->reportUnhandledException(e); -} - -// ############################################################################ - -TestInfo::TestInfo(const string& testCaseName, - const string& testName, - bool enabled, - unique_ptr&& testFactory): - testCaseName_(testCaseName), testName_(testName), enabled_(enabled), testFactory_(move(testFactory)) -{ -} - -// 
############################################################################ - -UnitTest::UnitTest(): - environments_(), - testCases_(), - activeTests_(), - repeats_(1), - printProgress_(false), - printSummaryDetails_(true), - currentTestCase_(nullptr), - currentCount_(0), - successCount_(0), - failCount_(0) -{ -} - -UnitTest::~UnitTest() -{ -} - -UnitTest* UnitTest::instance() -{ - static UnitTest unitTest; - return &unitTest; -} - -void UnitTest::randomizeTestOrder() -{ - unsigned int seed = static_cast(chrono::system_clock::now().time_since_epoch().count()); - - shuffle(activeTests_.begin(), activeTests_.end(), default_random_engine(seed)); -} - -void UnitTest::sortTestsAlphabetically() -{ - sort(activeTests_.begin(), activeTests_.end(), [this](size_t a, size_t b) -> bool { - TestInfo* left = testCases_[a].get(); - TestInfo* right = testCases_[b].get(); - - if (left->testCaseName() < right->testCaseName()) - return true; - - if (left->testCaseName() == right->testCaseName()) - return left->testName() < right->testName(); - - return false; - }); -} - -bool initializeTTY() -{ -#if defined(_WIN32) && defined(ENABLE_VIRTUAL_TERMINAL_PROCESSING) - HANDLE output = GetStdHandle(STD_OUTPUT_HANDLE); - if (output == INVALID_HANDLE_VALUE) - return false; - - DWORD mode = 0; - if (!GetConsoleMode(output, &mode)) - return false; - - mode |= ENABLE_VIRTUAL_TERMINAL_PROCESSING; - if (!SetConsoleMode(output, mode)) - return false; -#endif - - return true; -} - -int UnitTest::main(int argc, const char* argv[]) -{ - initializeTTY(); - // TODO: add CLI parameters (preferably gtest compatible) - // - // --no-color | --color explicitely enable/disable color output - // --filter=REGEX filter tests by regular expression - // --exclude=REGEX excludes tests by regular expressions - // --randomize randomize test order - // --repeats=NUMBER repeats tests given number of times - // --list[-tests] Just list the tests and exit. 
- - Flags flags; - flags.defineBool("help", 'h', "Prints this help and terminates.") - .defineBool("verbose", 'v', "Prints to console in debug log level.") - .defineString("filter", 'f', "GLOB", "Filters tests by given glob.", "*") - .defineString("exclude", 'e', "GLOB", "Excludes tests by given glob.", "") - .defineBool("list", 'l', "Prints all tests and exits.") - .defineBool("randomize", 'R', "Randomizes test order.") - .defineBool("sort", 's', "Sorts tests alphabetically ascending.") - .defineBool("no-progress", 0, "Avoids printing progress.") - .defineNumber("repeat", 'r', "COUNT", "Repeat tests given number of times.", 1); - - try - { - flags.parse(argc, argv); - } - catch (const exception& ex) - { - fprintf(stderr, "Failed to parse flags. %s\n", ex.what()); - return EXIT_FAILURE; - } - - if (flags.getBool("help")) - { - printf("%s\n", flags.helpText().c_str()); - return EXIT_SUCCESS; - } - - verbose_ = flags.getBool("verbose"); - - string filter = flags.getString("filter"); - string exclude = flags.getString("exclude"); - repeats_ = flags.getNumber("repeat"); - printProgress_ = !flags.getBool("no-progress"); - - if (flags.getBool("randomize")) - randomizeTestOrder(); - else if (flags.getBool("sort")) - sortTestsAlphabetically(); - - filterTests(filter, exclude); - - if (flags.getBool("list")) - { - printTestList(); - return EXIT_SUCCESS; - } - - run(); - - return failCount_ == 0 ? EXIT_SUCCESS : EXIT_FAILURE; -} - -void UnitTest::filterTests(const string& filter, const string& exclude) -{ - // if (filter != "*") { ... 
} - - vector filtered; - for (size_t i = 0, e = activeTests_.size(); i != e; ++i) - { - TestInfo* testInfo = testCases_[activeTests_[i]].get(); - string matchName = fmt::format("{}.{}", testInfo->testCaseName(), testInfo->testName()); - -#if defined(_WIN32) || defined(_WIN64) - if (!exclude.empty() && PathMatchSpec(matchName.c_str(), exclude.c_str()) == TRUE) - continue; // exclude this one - - if (PathMatchSpec(matchName.c_str(), filter.c_str()) == TRUE) - filtered.push_back(activeTests_[i]); -#else - const int flags = 0; - - if (!exclude.empty() && fnmatch(exclude.c_str(), matchName.c_str(), flags) == 0) - continue; // exclude this one - - if (fnmatch(filter.c_str(), matchName.c_str(), flags) == 0) - { - filtered.push_back(activeTests_[i]); - } -#endif - } - activeTests_ = move(filtered); -} - -void UnitTest::run() -{ - for (auto& env: environments_) - { - env->SetUp(); - } - - for (auto& init: initializers_) - { - init->invoke(); - } - - for (int i = 0; i < repeats_; i++) - { - runAllTestsOnce(); - } - - for (auto& env: environments_) - { - env->TearDown(); - } - - printSummary(); -} - -void UnitTest::printTestList() -{ - for (size_t i = 0, e = activeTests_.size(); i != e; ++i) - { - TestInfo* testCase = testCases_[activeTests_[i]].get(); - printf("%4zu. %s.%s\n", i + 1, testCase->testCaseName().c_str(), testCase->testName().c_str()); - } -} - -void UnitTest::printSummary() -{ - // print summary - fmt::print("{}Finished running {} tests ({} repeats). {} success, {} failed, {} disabled.{}\n", - failCount_ ? 
colorsError.data() : colorsOk.data(), - repeats_ * activeTests_.size(), - repeats_, - successCount_, - failCount_, - disabledCount(), - colorsReset.data()); - - if (printSummaryDetails_ && !failures_.empty()) - { - printf("================================\n"); - printf(" Summary:\n"); - printf("================================\n"); - - for (size_t i = 0, e = failures_.size(); i != e; ++i) - { - const auto& failure = failures_[i]; - fmt::print("{}{}{}\n", colorsError.data(), failure, colorsReset.data()); - } - } -} - -size_t UnitTest::enabledCount() const -{ - size_t count = 0; - - for (size_t i = 0, e = activeTests_.size(); i != e; ++i) - { - if (testCases_[activeTests_[i]]->isEnabled()) - { - count++; - } - } - - return count; -} - -size_t UnitTest::disabledCount() const -{ - size_t count = 0; - - for (size_t i = 0, e = activeTests_.size(); i != e; ++i) - { - if (!testCases_[activeTests_[i]]->isEnabled()) - { - count++; - } - } - - return count; -} - -void UnitTest::runAllTestsOnce() -{ - const size_t totalCount = repeats_ * enabledCount(); - - for (size_t i = 0, e = activeTests_.size(); i != e; ++i) - { - TestInfo* testCase = testCases_[activeTests_[i]].get(); - unique_ptr test = testCase->createTest(); - - if (!testCase->isEnabled()) - continue; - - currentTestCase_ = testCase; - currentCount_++; - size_t percentage = currentCount_ * 100 / totalCount; - - if (printProgress_) - { - fmt::print("{}{:>3} Running test: {}.{}{}\n", - colorsTestCaseHeader.data(), - percentage, - testCase->testCaseName(), - testCase->testName(), - colorsReset.data()); - } - - int failed = 0; - - try - { - test->SetUp(); - } - catch (const BailOutException&) - { - // SHOULD NOT HAPPEND: complain about it - failed++; - } - catch (...) 
- { - // TODO: report failure upon set-up phase, hence skipping actual test - failed++; - } - - if (!failed) - { - try - { - test->TestBody(); - } - catch (const BailOutException&) - { - // no-op - failed++; - } - catch (const exception& ex) - { - reportUnhandledException(ex); - failed++; - } - catch (...) - { - reportMessage("Unhandled exception caught in test.", false); - failed++; - } - - try - { - test->TearDown(); - } - catch (const BailOutException&) - { - // SHOULD NOT HAPPEND: complain about it - failed++; - } - catch (...) - { - // TODO: report failure in tear-down - failed++; - } - - if (!failed) - { - successCount_++; - } - } - } -} - -void UnitTest::reportError( - const char* fileName, int lineNo, bool fatal, const char* actual, const error_code& ec) -{ - string message = fmt::format("{}:{}: Failure\n" - " Value of: {}\n" - " Expected: success\n" - " Actual: ({}) {}\n", - fileName, - lineNo, - actual, - ec.category().name(), - ec.message()); - - reportMessage(message, fatal); -} - -void UnitTest::reportError(const char* fileName, - int lineNo, - bool fatal, - const char* expected, - const error_code& expectedEvaluated, - const char* actual, - const error_code& actualEvaluated) -{ - string message = fmt::format("{}:{}: Failure\n" - " Value of: {}\n" - " Expected: ({}) {}\n" - " Actual: ({}) {}\n", - fileName, - lineNo, - actual, - expectedEvaluated.category().name(), - expectedEvaluated.message(), - actualEvaluated.category().name(), - actualEvaluated.message()); - - reportMessage(message, fatal); -} - -void UnitTest::reportBinary(const char* fileName, - int lineNo, - bool fatal, - const char* expected, - const char* actual, - const string& actualEvaluated, - const char* op) -{ - string message = fmt::format("{}:{}: Failure\n" - " Value of: {}\n" - " Expected: {} {}\n" - " Actual: {}\n", - fileName, - lineNo, - actual, - expected, - op, - actualEvaluated); - - reportMessage(message, fatal); -} - -void UnitTest::reportUnhandledException(const exception& 
e) -{ - string message = fmt::format("Unhandled Exception\n" - " Type: {}\n" - " What: {}\n", - typeid(e).name(), - e.what()); - reportMessage(message, false); -} - -void UnitTest::reportEH(const char* fileName, - int lineNo, - bool fatal, - const char* program, - const char* expected, - const char* actual) -{ - string message = fmt::format("{}:{}: {}\n" - " Value of: {}\n" - " Expected: {}\n" - " Actual: {}\n", - fileName, - lineNo, - actual ? "Unexpected exception caught" : "No exception caught", - program, - expected, - actual); - - reportMessage(message, fatal); -} - -void UnitTest::reportMessage(const char* fileName, int lineNo, bool fatal, const string& msg) -{ - string message = fmt::format("{}:{}: {}\n", fileName, lineNo, msg); - reportMessage(message, fatal); -} - -void UnitTest::reportMessage(const string& message, bool fatal) -{ - fmt::print("{}{}{}\n", colorsError.data(), message, colorsReset.data()); - - failCount_++; - failures_.emplace_back(message); - - if (fatal) - { - throw BailOutException(); - } -} - -void UnitTest::addEnvironment(unique_ptr&& env) -{ - environments_.emplace_back(move(env)); -} - -Callback* UnitTest::addInitializer(unique_ptr&& cb) -{ - initializers_.emplace_back(move(cb)); - return initializers_.back().get(); -} - -TestInfo* UnitTest::addTest(const char* testCaseName, - const char* testName, - unique_ptr&& testFactory) -{ - testCases_.emplace_back( - make_unique(testCaseName, - testName, - !beginsWith(testCaseName, "DISABLED_") && !beginsWith(testName, "DISABLED_"), - move(testFactory))); - - activeTests_.emplace_back(activeTests_.size()); - - return testCases_.back().get(); -} - -void UnitTest::log(const string& message) -{ - if (verbose_) - { - size_t bol = 0; - size_t eol = 0; - do - { - eol = message.find('\n', bol); - string line = message.substr(bol, eol - bol); - if (eol + 1 < message.size() || (!line.empty() && line != "\n")) - { - fmt::print("{}{}.{}:{} {}\n", - colorsLog.data(), - currentTestCase_->testCaseName(), - 
currentTestCase_->testName(), - colorsReset.data(), - line); - } - bol = eol + 1; - } while (eol != string::npos); - } -} - -} // namespace regex_dfa::util::testing diff --git a/src/regex_dfa/util/testing.h b/src/regex_dfa/util/testing.h deleted file mode 100644 index 5e726a5820..0000000000 --- a/src/regex_dfa/util/testing.h +++ /dev/null @@ -1,425 +0,0 @@ -// This file is part of the "x0" project, http://github.com/christianparpart/x0> -// (c) 2009-2018 Christian Parpart -// -// Licensed under the MIT License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of -// the License at: http://opensource.org/licenses/MIT - -#pragma once - -#include - -#include -#include -#include - -namespace regex_dfa::util::testing { - -#define TEST_ENV_SETUP(Name) \ - class _CALLBACK_NAME(Name) : public ::regex_dfa::util::testing::Callback { \ - public: \ - void invoke() override; \ - private: \ - static ::regex_dfa::util::testing::Callback* const ref_ [[maybe_unused]]; \ - }; \ - \ - ::regex_dfa::util::testing::Callback* const \ - _CALLBACK_NAME(Name)::ref_ = \ - ::regex_dfa::util::testing::UnitTest::instance()->addInitializer( \ - std::make_unique<_CALLBACK_NAME>(Name)); \ - \ - void _CALLBACK_NAME(Name)::invoke() - -#define _CALLBACK_NAME(Name) Callback_##Name - -#define TEST_ENV_TEARDOWN(Name) // TODO - -#define TEST_ENV_F(EnvName) \ - ::regex_dfa::util::testing::UnitTest::instance()->addEnvironment( \ - std::unique_ptr<::regex_dfa::util::testing::Environment>(EnvName)); - -// ############################################################################ - -#define TEST(testCase, testName) _CREATE_TEST(testCase, testName, ::regex_dfa::util::testing::Test) -#define TEST_F(testFixture, testName) _CREATE_TEST(testFixture, testName, testFixture) - -#define EXPECT_EQ(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, ==) - -#define EXPECT_NE(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, 
false, expected, actual, !=) - -#define EXPECT_GE(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, >=) - -#define EXPECT_LE(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, <=) - -#define EXPECT_GT(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, >) - -#define EXPECT_LT(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, false, expected, actual, <) - -#define EXPECT_TRUE(actual) \ - _EXPECT_BOOLEAN(__FILE__, __LINE__, false, true, actual) - -#define EXPECT_FALSE(actual) \ - _EXPECT_BOOLEAN(__FILE__, __LINE__, false, false, actual) - -#define EXPECT_NEAR(expected, actual, diff) // TODO - -#define REPORT_ERROR(message) \ - do { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportMessage( \ - __FILE__, __LINE__, false, (message)); \ - } while (0) - -#define EXPECT_ERROR_CODE_SUCCESS(errorCode) \ - if (errorCode) { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ - __FILE__, __LINE__, false, #errorCode, errorCode); \ - } - -#define EXPECT_ERROR_CODE(expected, actual) \ - do { \ - std::error_code actual_ {(actual)}; \ - if (actual_ != (expected)) { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ - __FILE__, __LINE__, false, \ - #expected, (expected), \ - #actual, actual_); \ - } \ - } while (0) - -#define EXPECT_THROW(program, ExceptionType) \ - do { \ - try { \ - program; \ - ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ - __FILE__, __LINE__, false, #program, #ExceptionType, \ - ""); \ - } catch (const ExceptionType&) { \ - break; \ - } catch (...) { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ - __FILE__, __LINE__, false, #program, #ExceptionType, ""); \ - } \ - } while (0) - -#define EXPECT_ANY_THROW(program) \ - do { \ - try { \ - program; \ - ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ - __FILE__, __LINE__, false, #program, "", \ - ""); \ - } catch (...) 
{ \ - } \ - } while (0) - -// ############################################################################ - -#define ASSERT_EQ(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, ==) - -#define ASSERT_NE(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, !=) - -#define ASSERT_GE(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, >=) - -#define ASSERT_LE(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, <=) - -#define ASSERT_GT(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, >) - -#define ASSERT_LT(expected, actual) \ - _EXPECT_BINARY(__FILE__, __LINE__, true, expected, actual, <) - -#define ASSERT_TRUE(actual) \ - _EXPECT_BOOLEAN(__FILE__, __LINE__, true, true, actual) - -#define ASSERT_FALSE(actual) \ - _EXPECT_BOOLEAN(__FILE__, __LINE__, true, false, actual) - -#define ASSERT_NEAR(expected, actual, diff) // TODO - -#define ASSERT_ERROR_CODE_SUCCESS(errorCode) \ - if (errorCode) { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ - __FILE__, __LINE__, true, #errorCode, errorCode); \ - } - -#define ASSERT_ERROR_CODE(expected, actual) \ - do { \ - std::error_code actual_ {(actual)}; \ - if (actual_ != (expected)) { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportError( \ - __FILE__, __LINE__, true, \ - #expected, (expected), \ - #actual, actual_); \ - } \ - } while (0) - -#define ASSERT_THROW(program, ExceptionType) \ - do { \ - try { \ - program; \ - ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ - __FILE__, __LINE__, true, #program, #ExceptionType, \ - ""); \ - } catch (const ExceptionType&) { \ - break; \ - } catch (...) 
{ \ - ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ - __FILE__, __LINE__, true, #program, #ExceptionType, ""); \ - } \ - } while (0) - -#define ASSERT_ANY_THROW(program) \ - do { \ - try { \ - program; \ - ::regex_dfa::util::testing::UnitTest::instance()->reportEH( \ - __FILE__, __LINE__, true, #program, "", \ - ""); \ - } catch (...) { \ - } \ - } while (0) - -// ############################################################################ - -#define _EXPECT_BOOLEAN(fileName, lineNo, fatal, expected, actual) \ - do { \ - bool actualEvaluated = !! (actual); \ - bool failed = (expected && !actualEvaluated) \ - || (!expected && actualEvaluated); \ - if (failed) { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportBinary( \ - __FILE__, __LINE__, fatal, #expected, #actual, \ - ::fmt::format("{}", (actualEvaluated)), ""); \ - } \ - } while (0) - -#define _EXPECT_BINARY(fileName, lineNo, fatal, expected, actual, op) \ - do { \ - auto actual_ = (actual); \ - if (!((expected) op (actual_))) { \ - ::regex_dfa::util::testing::UnitTest::instance()->reportBinary( \ - __FILE__, __LINE__, fatal, #expected, #actual, \ - ::fmt::format("{}", actual_), #op); \ - } \ - } while (0) - -#define _TEST_CLASS_NAME(testCaseName, testName) \ - Test_##testCaseName##testName - -#define _CREATE_TEST(testCaseName, testName, ParentClass) \ -class _TEST_CLASS_NAME(testCaseName, testName) : public ParentClass { \ - public: \ - _TEST_CLASS_NAME(testCaseName, testName)() {} \ - \ - private: \ - virtual void TestBody(); \ - \ - static ::regex_dfa::util::testing::TestInfo* const test_info_; \ -}; \ - \ -::regex_dfa::util::testing::TestInfo* const \ -_TEST_CLASS_NAME(testCaseName, testName)::test_info_ = \ - ::regex_dfa::util::testing::UnitTest::instance()->addTest( \ - #testCaseName, #testName, \ - std::make_unique< \ - ::regex_dfa::util::testing::TestFactoryTemplate< \ - _TEST_CLASS_NAME(testCaseName, testName)>>()); \ - \ -void _TEST_CLASS_NAME(testCaseName, 
testName)::TestBody() - -// ############################################################################ - -int main(int argc, const char* argv[]); - -// ############################################################################ - -class Callback { - public: - virtual ~Callback() {} - - virtual void invoke() = 0; -}; - -/** - * Environment hooks. - */ -class Environment { - public: - virtual ~Environment(); - - virtual void SetUp(); - virtual void TearDown(); -}; - -/** - * interface to a single test. - */ -class Test { - public: - virtual ~Test(); - - virtual void SetUp(); - virtual void TestBody() = 0; - virtual void TearDown(); - - void log(const std::string& message); - - template - void logf(const char* fmt, Args... args); - - void reportUnhandledException(const std::exception& e); -}; - -/** - * API to create one kind of a test. - */ -class TestFactory { - TestFactory(const TestFactory&) = delete; - TestFactory& operator=(const TestFactory&) = delete; - - public: - TestFactory() {} - virtual ~TestFactory() {} - virtual std::unique_ptr createTest() = 0; -}; - -template -class TestFactoryTemplate : public TestFactory { - public: - std::unique_ptr createTest() override { - return std::make_unique(); - } -}; - -/** - * TestInfo describes a single test. 
- */ -class TestInfo { - TestInfo(const TestInfo&) = delete; - TestInfo& operator=(const TestInfo&) = delete; - - public: - TestInfo(const std::string& testCaseName, - const std::string& testName, - bool enabled, - std::unique_ptr&& testFactory); - - const std::string& testCaseName() const { return testCaseName_; } - const std::string& testName() const { return testName_; } - bool isEnabled() const { return enabled_; } - - std::unique_ptr createTest() { return testFactory_->createTest(); } - - private: - std::string testCaseName_; - std::string testName_; - bool enabled_; - std::unique_ptr testFactory_; -}; - -class UnitTest { - public: - UnitTest(); - ~UnitTest(); - - static UnitTest* instance(); - - int main(int argc, const char* argv[]); - - void randomizeTestOrder(); - void sortTestsAlphabetically(); - void printTestList(); - void filterTests(const std::string& filter, const std::string& exclude); - void run(); - - void addEnvironment(std::unique_ptr&& env); - - Callback* addInitializer(std::unique_ptr&& cb); - - TestInfo* addTest(const char* testCaseName, - const char* testName, - std::unique_ptr&& testFactory); - - void reportError(const char* fileName, - int lineNo, - bool fatal, - const char* actual, - const std::error_code& ec); - - void reportError(const char* fileName, - int lineNo, - bool fatal, - const char* expected, - const std::error_code& expectedEvaluated, - const char* actual, - const std::error_code& actualEvaluated); - - void reportBinary(const char* fileName, - int lineNo, - bool fatal, - const char* expected, - const char* actual, - const std::string& actualEvaluated, - const char* op); - - void reportUnhandledException(const std::exception& e); - - void reportEH(const char* fileName, - int lineNo, - bool fatal, - const char* program, - const char* expected, - const char* actual); - - void reportMessage(const std::string& message, bool fatal); - void reportMessage(const char* fileName, int lineNo, bool fatal, const std::string& message); - - 
void log(const std::string& message); - - template - void logf(const char* format, Args... args) { - log(fmt::format(format, args...)); - } - - private: - void runAllTestsOnce(); - void printSummary(); - size_t enabledCount() const; - size_t disabledCount() const; - - private: - std::vector> environments_; - std::vector> initializers_; - std::vector> testCases_; - - //! ordered list of tests as offsets into testCases_ - std::vector activeTests_; - - int repeats_; - bool verbose_; - bool printProgress_; - bool printSummaryDetails_; - - TestInfo* currentTestCase_; - size_t currentCount_; - size_t successCount_; - int failCount_; - std::vector failures_; -}; - -template -inline void Test::logf(const char* fmt, Args... args) { - UnitTest::instance()->logf(fmt, args...); -} - -} // namespace regex_dfa::util::testing From da9ac1b547af048579422570620717bd5257958b Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Tue, 30 May 2023 01:58:12 +0200 Subject: [PATCH 4/5] wip Signed-off-by: Christian Parpart --- src/vtbackend/CMakeLists.txt | 1 + src/vtbackend/TerminalState.cpp | 6 +++--- src/vtbackend/TerminalState.h | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/vtbackend/CMakeLists.txt b/src/vtbackend/CMakeLists.txt index 4ad0ab1cf1..039ef91b79 100644 --- a/src/vtbackend/CMakeLists.txt +++ b/src/vtbackend/CMakeLists.txt @@ -103,6 +103,7 @@ target_link_libraries(vtbackend PUBLIC fmt::fmt-header-only range-v3::range-v3 ${LIBUNICODE_LIBS} + regex_dfa vtparser vtpty ) diff --git a/src/vtbackend/TerminalState.cpp b/src/vtbackend/TerminalState.cpp index 7b1fb76a8d..b850815e5c 100644 --- a/src/vtbackend/TerminalState.cpp +++ b/src/vtbackend/TerminalState.cpp @@ -2,6 +2,8 @@ #include #include +#include + namespace terminal { @@ -16,9 +18,7 @@ TerminalState::TerminalState(Terminal& terminal): te->discardImage(*image); } }, hyperlinks { HyperlinkCache { 1024 } }, - urlPattern { settings.urlPattern, - std::regex_constants::ECMAScript | 
std::regex_constants::optimize - | std::regex_constants::icase }, + urlPattern { regex_dfa::RegExprParser {}.parse(settings.urlPattern) }, sequencer { terminal }, parser { std::ref(sequencer) }, viCommands { terminal }, diff --git a/src/vtbackend/TerminalState.h b/src/vtbackend/TerminalState.h index cc875dbcfa..7940465c25 100644 --- a/src/vtbackend/TerminalState.h +++ b/src/vtbackend/TerminalState.h @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -194,7 +196,7 @@ struct TerminalState // Hyperlink related // HyperlinkStorage hyperlinks {}; - std::regex urlPattern; + regex_dfa::RegExpr urlPattern; std::string windowTitle {}; std::stack savedWindowTitles {}; From 5e4e5fa34f6edf064e12c44d93542dc6204017fe Mon Sep 17 00:00:00 2001 From: Christian Parpart Date: Mon, 21 Aug 2023 21:21:32 +0200 Subject: [PATCH 5/5] Add embedded CTRE Signed-off-by: Christian Parpart --- cmake/ContourThirdParties.cmake | 4 ++++ scripts/install-deps.sh | 11 +++++++++++ src/vtbackend/CMakeLists.txt | 1 + 3 files changed, 16 insertions(+) diff --git a/cmake/ContourThirdParties.cmake b/cmake/ContourThirdParties.cmake index 75f9334f09..ecad1d2e08 100644 --- a/cmake/ContourThirdParties.cmake +++ b/cmake/ContourThirdParties.cmake @@ -130,6 +130,9 @@ endif() ContourThirdParties_Embed_boxed_cpp() set(THIRDPARTY_BUILDIN_boxed_cpp "embedded") +ContourThirdParties_Embed_ctre() +set(THIRDPARTY_BUILDIN_ctre "embedded") + macro(ContourThirdPartiesSummary2) message(STATUS "==============================================================================") message(STATUS " Contour ThirdParties") @@ -144,5 +147,6 @@ macro(ContourThirdPartiesSummary2) message(STATUS "libunicode ${THIRDPARTY_BUILTIN_unicode_core} (${LIBUNICODE_LIBS})") message(STATUS "yaml-cpp ${THIRDPARTY_BUILTIN_yaml_cpp}") message(STATUS "boxed-cpp ${THIRDPARTY_BUILDIN_boxed_cpp}") + message(STATUS "CTRE ${THIRDPARTY_BUILDIN_ctre}") message(STATUS "------------------------------------------------------------------------------") 
endmacro() diff --git a/scripts/install-deps.sh b/scripts/install-deps.sh index 011372964b..787df91f2c 100755 --- a/scripts/install-deps.sh +++ b/scripts/install-deps.sh @@ -108,6 +108,16 @@ fetch_and_unpack_termbenchpro() termbench_pro } +fetch_and_unpack_ctre() +{ + local ctre_git_sha="0fdd96db416188a07833606b16633fb977c0cc11" + fetch_and_unpack \ + compile-time-regular-expressions-$ctre_git_sha \ + ctre-$ctre_git_sha.tar.gz \ + https://github.com/hanickadot/compile-time-regular-expressions/archive/$ctre_git_sha.tar.gz \ + ctre +} + fetch_and_unpack_boxed() { local boxed_cpp_git_sha="daa702e22e71f3da3eef838e4946b6c3df1f16b1" @@ -573,6 +583,7 @@ main() fetch_and_unpack_boxed fetch_and_unpack_termbenchpro + fetch_and_unpack_ctre } main $* diff --git a/src/vtbackend/CMakeLists.txt b/src/vtbackend/CMakeLists.txt index 039ef91b79..fc00e9b5f7 100644 --- a/src/vtbackend/CMakeLists.txt +++ b/src/vtbackend/CMakeLists.txt @@ -104,6 +104,7 @@ target_link_libraries(vtbackend PUBLIC range-v3::range-v3 ${LIBUNICODE_LIBS} regex_dfa + ctre::ctre vtparser vtpty )