diff --git a/lttoolbox/lt-trim.1 b/lttoolbox/lt-trim.1 index 307c58a1..e0e5fc48 100644 --- a/lttoolbox/lt-trim.1 +++ b/lttoolbox/lt-trim.1 @@ -85,6 +85,24 @@ You should not trim a generator unless you have a .Em very simple translator pipeline, since the output of bidix seldom goes unchanged through transfer. +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl s , Fl Fl match-section +A section with this name (id@type) in the analyser will only be +trimmed against a section with the same name in the bidix. (The default +is to trim all sections of the analyser against all sections of the +bidix.) Using this option can sometimes speed up trimming +considerably. For example, if you have some complicated regular +expressions, try putting them in a +<section id="regex" type="standard">
+ +in both .dix files and passing +.Dq regex@standard +to \fI--match-section\fP. +.Pp +This argument may be used multiple times to specify multiple sections +that must match by name. .Sh FILES .Bl -tag -width Ds .It Ar analyser_binary diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index da61ff15..3b1ecf9b 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -21,7 +21,7 @@ #include void -trim(FILE* file_mono, FILE* file_bi, FILE* file_out) +trim(FILE* file_mono, FILE* file_bi, FILE* file_out, std::set match_sections) { Alphabet alph_mono; std::set letters_mono; @@ -41,30 +41,43 @@ trim(FILE* file_mono, FILE* file_bi, FILE* file_out) std::set loopback_symbols; // ints refer to alph_prefix alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right); + UString union_name = u""; // Not a valid section name, used as key for those where we don't care about names matching + std::map moved_bi_transducers; for (auto& it : trans_bi) { - if (union_transducer.isEmpty()) { - union_transducer = it.second; - } else { - union_transducer.unionWith(alph_bi, it.second); + if(match_sections.contains(it.first)) { + moved_bi_transducers[it.first] = it.second.appendDotStar(loopback_symbols).moveLemqsLast(alph_prefix); + } + else { + if (union_transducer.isEmpty()) { + union_transducer = it.second; + } + else { + union_transducer.unionWith(alph_bi, it.second); + } } } union_transducer.minimize(); - Transducer prefix_transducer = union_transducer.appendDotStar(loopback_symbols); - // prefix_transducer should _not_ be minimized (both useless and takes forever) - Transducer moved_transducer = prefix_transducer.moveLemqsLast(alph_prefix); + // prefix/moved transducer should _not_ be minimized (both useless and takes forever) + moved_bi_transducers[union_name] = union_transducer.appendDotStar(loopback_symbols).moveLemqsLast(alph_prefix); std::map trans_trim; + std::set sections_unmatched = match_sections; // just used to warn if user asked for a match 
that never happened for (auto& it : trans_mono) { if (it.second.numberOfTransitions() == 0) { std::cerr << "Warning: section " << it.first << " is empty! Skipping it..." << std::endl; continue; } - // TODO: parallelise this loop (as in lt_compose.cc) + if (moved_bi_transducers.contains(it.first)) { + sections_unmatched.erase(it.first); + } + Transducer& moved_transducer = moved_bi_transducers.contains(it.first) + ? moved_bi_transducers[it.first] + : moved_bi_transducers[union_name]; Transducer trimmed = it.second.trim(moved_transducer, - alph_mono, - alph_prefix); + alph_mono, + alph_prefix); if (trimmed.hasNoFinals()) { std::cerr << "Warning: section " << it.first << " had no final state after trimming! Skipping it..." << std::endl; continue; @@ -72,6 +85,9 @@ trim(FILE* file_mono, FILE* file_bi, FILE* file_out) trimmed.minimize(); trans_trim[it.first] = trimmed; } + for (const auto &name : sections_unmatched) { + std::cerr << "Warning: section " << name << " was not found in both transducers! Skipping if in just one..." << std::endl; + } if (trans_trim.empty()) { std::cerr << "Error: Trimming gave empty transducer!" << std::endl; @@ -91,13 +107,21 @@ int main(int argc, char *argv[]) cli.add_file_arg("analyser_bin_file", false); cli.add_file_arg("bidix_bin_file"); cli.add_file_arg("trimmed_bin_file"); + cli.add_str_arg('s', "match-section", "A section with this name (id@type) will only be trimmed against a section with the same name. 
This argument may be used multiple times.", "section_name"); cli.parse_args(argc, argv); + auto strs = cli.get_strs(); + std::set match_sections; + if (strs.find("match-section") != strs.end()) { + for (auto &it : strs["match-section"]) { + match_sections.insert(to_ustring(it.c_str())); + } + } FILE* analyser = openInBinFile(cli.get_files()[0]); FILE* bidix = openInBinFile(cli.get_files()[1]); FILE* output = openOutBinFile(cli.get_files()[2]); - trim(analyser, bidix, output); + trim(analyser, bidix, output, match_sections); fclose(analyser); fclose(bidix);