From d30a1f038acebcf392030f4e7d3a0229169fdd5f Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 20 Mar 2021 20:56:45 -0500 Subject: [PATCH 1/2] make lt-comp go a bit faster --- lttoolbox/compiler.cc | 15 +++++++++++++-- lttoolbox/compiler.h | 6 ++++++ lttoolbox/fst_processor.cc | 3 ++- lttoolbox/transducer.cc | 14 +++++++------- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 0089f288..e4dbce89 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -58,6 +58,7 @@ wstring const Compiler::COMPILER_GROUP_ELEM = L"g"; wstring const Compiler::COMPILER_LEMMA_ATTR = L"lm"; wstring const Compiler::COMPILER_IGNORE_ATTR = L"i"; wstring const Compiler::COMPILER_IGNORE_YES_VAL = L"yes"; +wstring const Compiler::COMPILER_REGEX_ATTR = L"regex"; wstring const Compiler::COMPILER_ALT_ATTR = L"alt"; wstring const Compiler::COMPILER_V_ATTR = L"v"; wstring const Compiler::COMPILER_VL_ATTR = L"vl"; @@ -127,6 +128,9 @@ Compiler::parse(string const &file, wstring const &dir) // Minimize transducers for(auto& it : sections) { + if (it.first.size() > 6 && it.first.substr(it.first.size()-6) == L"@regex") { + continue; + } it.second.minimize(); } @@ -208,13 +212,20 @@ Compiler::procParDef() if(type != XML_READER_TYPE_END_ELEMENT) { current_paradigm = attrib(COMPILER_N_ATTR); + current_minimise = true; + if (attrib(COMPILER_REGEX_ATTR) == L"yes") { + current_minimise = false; + } } else { if(!paradigms[current_paradigm].isEmpty()) { - paradigms[current_paradigm].minimize(); - paradigms[current_paradigm].joinFinals(); + if (current_minimise) { + paradigms[current_paradigm].minimize(); + } else { + paradigms[current_paradigm].joinFinals(); + } current_paradigm = L""; } } diff --git a/lttoolbox/compiler.h b/lttoolbox/compiler.h index 1d70c235..b7f62aad 100644 --- a/lttoolbox/compiler.h +++ b/lttoolbox/compiler.h @@ -81,6 +81,11 @@ class Compiler */ wstring current_section; + /** + * Whether the current section 
or paradigm should be minimised + */ + bool current_minimise; + /** * The direction of the compilation, 'lr' (left-to-right) or 'rl' * (right-to-left) @@ -325,6 +330,7 @@ class Compiler LTTOOLBOX_IMPORTS static wstring const COMPILER_LEMMA_ATTR; LTTOOLBOX_IMPORTS static wstring const COMPILER_IGNORE_ATTR; LTTOOLBOX_IMPORTS static wstring const COMPILER_IGNORE_YES_VAL; + LTTOOLBOX_IMPORTS static wstring const COMPILER_REGEX_ATTR; LTTOOLBOX_IMPORTS static wstring const COMPILER_ALT_ATTR; LTTOOLBOX_IMPORTS static wstring const COMPILER_V_ATTR; LTTOOLBOX_IMPORTS static wstring const COMPILER_VL_ATTR; diff --git a/lttoolbox/fst_processor.cc b/lttoolbox/fst_processor.cc index 6a4fb96e..12526186 100644 --- a/lttoolbox/fst_processor.cc +++ b/lttoolbox/fst_processor.cc @@ -881,7 +881,8 @@ FSTProcessor::classifyFinals() inconditional.insert(it->second.getFinals().begin(), it->second.getFinals().end()); } - else if(endsWith(it->first, L"@standard")) + else if(endsWith(it->first, L"@standard") || + endsWith(it->first, L"@regex")) { standard.insert(it->second.getFinals().begin(), it->second.getFinals().end()); diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index 5aa8d749..fc6fe8c4 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -319,6 +319,11 @@ Transducer::determinize(int const epsilon_tag) int t = 0; + set finals_state; + for(auto& it2 : finals) { + finals_state.insert(it2.first); + } + while(size_Q_prime != Q_prime.size()) { size_Q_prime = Q_prime.size(); @@ -326,11 +331,6 @@ Transducer::determinize(int const epsilon_tag) for(auto& it : R[t]) { - set finals_state; - for(auto& it2 : finals) - { - finals_state.insert(it2.first); - } if(!isEmptyIntersection(Q_prime[it], finals_state)) { double w = default_weight; @@ -378,8 +378,8 @@ Transducer::determinize(int const epsilon_tag) t = (t+1)%2; } - transitions = transitions_prime; - finals = finals_prime; + transitions.swap(transitions_prime); + finals.swap(finals_prime); initial = 
initial_prime; } From c34b64cccd897a6b62f71eb2c683a4c0c7f8cedc Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Sat, 20 Mar 2021 21:04:55 -0500 Subject: [PATCH 2/2] update DTD and whatnot --- lttoolbox/dix.dtd | 3 ++- lttoolbox/dix.rnc | 4 ++-- lttoolbox/dix.rng | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lttoolbox/dix.dtd b/lttoolbox/dix.dtd index 3eb94236..ff56034e 100644 --- a/lttoolbox/dix.dtd +++ b/lttoolbox/dix.dtd @@ -44,6 +44,7 @@ diff --git a/lttoolbox/dix.rnc b/lttoolbox/dix.rnc index b60ecb7d..a5984dfc 100644 --- a/lttoolbox/dix.rnc +++ b/lttoolbox/dix.rnc @@ -38,7 +38,7 @@ attlist.pardefs &= empty # paradigm definition section pardef = element pardef { attlist.pardef, e+ } # paradigm definition -attlist.pardef &= attribute n { text } +attlist.pardef &= attribute n { text }, attribute regex { "no" | "yes" }? # n: paradigm name attlist.pardef &= attribute c { text }? # c: comment about paradigm @@ -47,7 +47,7 @@ section = element section { attlist.section, e+ } attlist.section &= attribute id { xsd:ID }, attribute type { - "standard" | "inconditional" | "postblank" | "preblank" + "standard" | "inconditional" | "postblank" | "preblank" | "regex" } # id: dictionary section identifier diff --git a/lttoolbox/dix.rng b/lttoolbox/dix.rng index e651ca61..6deca105 100644 --- a/lttoolbox/dix.rng +++ b/lttoolbox/dix.rng @@ -103,6 +103,14 @@ + + + + no + yes + + + @@ -130,6 +138,7 @@ inconditional postblank preblank + regex