From 27b8b6a55ced7d7a46ba5b23b3064d47fdf37cec Mon Sep 17 00:00:00 2001 From: Daniel Swanson Date: Mon, 18 Oct 2021 08:40:59 -0400 Subject: [PATCH] Space Manipulation (#44) * error wrapper because it makes things so much nicer * Space manipulation * version bump --- configure.ac | 2 +- src/lsx_compiler.cc | 101 ++++++++++++++++------------------ src/lsx_compiler.h | 16 ++++++ src/lsx_processor.cc | 79 +++++++++++++++----------- src/lsx_processor.h | 2 + tests/data/capitalization.dix | 6 +- tests/data/short-example.dix | 12 ++-- tests/data/spaces.lsx | 26 +++++++++ tests/lsx_proc/__init__.py | 26 +++++++++ 9 files changed, 175 insertions(+), 95 deletions(-) create mode 100644 tests/data/spaces.lsx diff --git a/configure.ac b/configure.ac index 04e42ff..3b10791 100644 --- a/configure.ac +++ b/configure.ac @@ -3,7 +3,7 @@ AC_PREREQ(2.61) m4_define([required_libxml_version], [2.6.17]) m4_define([required_lttoolbox_version], [3.6.0]) -AC_INIT([apertium-separable], [0.4.0], [apertium-stuff@lists.sourceforge.net]) +AC_INIT([apertium-separable], [0.5.0], [apertium-stuff@lists.sourceforge.net]) AM_INIT_AUTOMAKE AC_CONFIG_MACRO_DIR([m4]) diff --git a/src/lsx_compiler.cc b/src/lsx_compiler.cc index 73dd340..b0cc5b4 100644 --- a/src/lsx_compiler.cc +++ b/src/lsx_compiler.cc @@ -31,7 +31,14 @@ using namespace std; UString const Compiler::COMPILER_ANYTAG_ELEM = "t"_u; UString const Compiler::COMPILER_ANYCHAR_ELEM = "w"_u; -UString const Compiler::COMPILER_WB_ELEM = "j"_u; +UString const Compiler::COMPILER_WB_ELEM = "d"_u; +UString const Compiler::COMPILER_SPACE_ATTR = "space"_u; +UString const Compiler::COMPILER_SPACE_YES_VAL = "yes"_u; +UString const Compiler::COMPILER_SPACE_NO_VAL = "no"_u; + +// TODO: these should be in lttoolbox so lt-trim can use them +UString const Compiler::SYMBOL_WB_SPACE = "<$_>"_u; +UString const Compiler::SYMBOL_WB_NO_SPACE = "<$->"_u; void Compiler::parse(string const &fichero, UString const &dir) @@ -47,9 +54,13 @@ Compiler::parse(string const &fichero, UString const &dir) alphabet.includeSymbol(Transducer::ANY_TAG_SYMBOL); alphabet.includeSymbol(Transducer::ANY_CHAR_SYMBOL); alphabet.includeSymbol(Transducer::LSX_BOUNDARY_SYMBOL); - any_tag = alphabet(Transducer::ANY_TAG_SYMBOL); - any_char = alphabet(Transducer::ANY_CHAR_SYMBOL); - word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL); + alphabet.includeSymbol(Compiler::SYMBOL_WB_SPACE); + alphabet.includeSymbol(Compiler::SYMBOL_WB_NO_SPACE); + any_tag = alphabet(Transducer::ANY_TAG_SYMBOL); + any_char = alphabet(Transducer::ANY_CHAR_SYMBOL); + word_boundary = alphabet(Transducer::LSX_BOUNDARY_SYMBOL); + word_boundary_s = alphabet(Compiler::SYMBOL_WB_SPACE); + word_boundary_ns = alphabet(Compiler::SYMBOL_WB_NO_SPACE); int ret = xmlTextReaderRead(reader); while(ret == 1) @@ -111,9 +122,7 @@ Compiler::procAlphabet() } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Missing alphabet symbols." << endl; - exit(EXIT_FAILURE); + error("Missing alphabet symbols."); } } } @@ -234,9 +243,7 @@ Compiler::requireEmptyError(UString const &name) { if(!xmlTextReaderIsEmptyElement(reader)) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Non-empty element '<" << name << ">' should be empty." << endl; - exit(EXIT_FAILURE); + error("Non-empty element '<%S>' should be empty.", name.c_str()); } } @@ -288,12 +295,15 @@ Compiler::readString(vector &result, UString const &name) if(!alphabet.isSymbolDefined(symbol)) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Undefined symbol '" << symbol << "'." << endl; - exit(EXIT_FAILURE); + error("Undefined symbol '%S'.", symbol.c_str()); } result.push_back(alphabet(symbol)); } + else if(name == COMPILER_JOIN_ELEM) + { + requireEmptyError(name); + result.push_back(static_cast('+')); + } else if(name == COMPILER_ANYTAG_ELEM) { result.push_back(any_tag); @@ -305,15 +315,19 @@ Compiler::readString(vector &result, UString const &name) else if(name == COMPILER_WB_ELEM) { requireEmptyError(name); - result.push_back(word_boundary); + UString mode = attrib(COMPILER_SPACE_ATTR); + if (mode == COMPILER_SPACE_YES_VAL) { + result.push_back(word_boundary_s); + } else if (mode == COMPILER_SPACE_NO_VAL) { + result.push_back(word_boundary_ns); + } else { + result.push_back(word_boundary); + } } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid specification of element '<" << name; - cerr << ">' in this context." << endl; - exit(EXIT_FAILURE); + error("Invalid specification of element '<%S>' in this context.", name.c_str()); } } @@ -326,9 +340,7 @@ Compiler::skipBlanks(UString &name) { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; - exit(EXIT_FAILURE); + error("Invalid construction."); } } @@ -361,9 +373,7 @@ Compiler::skip(UString &name, UString const &elem, bool open) { if(!allBlanks()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid construction." << endl; - exit(EXIT_FAILURE); + error("Invalid construction."); } } xmlTextReaderRead(reader); @@ -372,9 +382,7 @@ Compiler::skip(UString &name, UString const &elem, bool open) if(name != elem) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Expected '<" << slash << elem << ">'." << endl; - exit(EXIT_FAILURE); + error("Expected '<%S%S>'.", slash.c_str(), elem.c_str()); } } @@ -480,16 +488,12 @@ Compiler::procPar() if(!current_paradigm.empty() && nomparadigma == current_paradigm) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Paradigm refers to itself '" << nomparadigma << "'." < const &elements) } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid entry token." << endl; - exit(EXIT_FAILURE); + error("Invalid entry token."); } } t.setFinal(e); @@ -597,12 +599,9 @@ Compiler::requireAttribute(UString const &value, UString const &attrname, UString const &elemname) { if(value.empty()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): '<" << elemname; - cerr << "' element must specify non-void '"; - cerr << attrname << "' attribute." << endl; - exit(EXIT_FAILURE); - } + error("Element '<%S>' must specify a non-void value for attribute '%S'.", + elemname.c_str(), attrname.c_str()); + } } @@ -665,9 +664,7 @@ Compiler::procEntry() int ret = xmlTextReaderRead(reader); if(ret != 1) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Parse error." << endl; - exit(EXIT_FAILURE); + error("Parse error."); } UString name = XMLParseUtil::readName(reader); skipBlanks(name); @@ -700,9 +697,7 @@ Compiler::procEntry() if(paradigms.find(p) == paradigms.end()) { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Undefined paradigm '" << p << "'." <' into '<" << COMPILER_ENTRY_ELEM; - cerr << ">'." << endl; - exit(EXIT_FAILURE); + error("Invalid inclusion of '<%S>' in '<%S>'.", name.c_str(), + COMPILER_ENTRY_ELEM.c_str()); } } @@ -790,9 +783,7 @@ Compiler::procNode() } else { - cerr << "Error (" << xmlTextReaderGetParserLineNumber(reader); - cerr << "): Invalid node '<" << nombre << ">'." << endl; - exit(EXIT_FAILURE); + error("Invalid node '<%S>'.", nombre.c_str()); } } diff --git a/src/lsx_compiler.h b/src/lsx_compiler.h index 3956f16..4075897 100644 --- a/src/lsx_compiler.h +++ b/src/lsx_compiler.h @@ -103,6 +103,8 @@ class Compiler int32_t any_tag = 0; int32_t any_char = 0; int32_t word_boundary = 0; + int32_t word_boundary_s = 0; + int32_t word_boundary_ns = 0; /** * List of named transducers-paradigms @@ -129,6 +131,15 @@ class Compiler */ map> postsuffix_paradigms; + template + void error(const char* fmt, T... args) { + UFILE* out = u_finit(stderr, NULL, NULL); + u_fprintf(out, "Error on line %d: ", + xmlTextReaderGetParserLineNumber(reader)); + u_fprintf(out, fmt, args...); + u_fprintf(out, "\n"); + exit(EXIT_FAILURE); + } /* static string range(char const a, char const b); @@ -306,6 +317,11 @@ class Compiler static UString const COMPILER_ANYTAG_ELEM; static UString const COMPILER_ANYCHAR_ELEM; static UString const COMPILER_WB_ELEM; + static UString const COMPILER_SPACE_ATTR; + static UString const COMPILER_SPACE_YES_VAL; + static UString const COMPILER_SPACE_NO_VAL; + static UString const SYMBOL_WB_SPACE; + static UString const SYMBOL_WB_NO_SPACE; /** diff --git a/src/lsx_processor.cc b/src/lsx_processor.cc index ce1eaf4..7badb81 100644 --- a/src/lsx_processor.cc +++ b/src/lsx_processor.cc @@ -54,6 +54,8 @@ LSXProcessor::load(FILE *input) // symbols alphabet.read(input); word_boundary = alphabet("<$>"_u); + word_boundary_s = alphabet("<$_>"_u); + word_boundary_ns = alphabet("<$->"_u); any_char = alphabet(""_u); any_tag = alphabet(""_u); @@ -247,7 +249,7 @@ LSXProcessor::processWord(InputFile& input, UFILE* output) s.step_override(lu[i], towlower(lu[i]), any_char, lu[i]); } } - s.step(word_boundary); + s.step(word_boundary, word_boundary_s, word_boundary_ns); if(s.isFinal(all_finals)) { last_final = idx+1; @@ -270,23 +272,7 @@ LSXProcessor::processWord(InputFile& input, UFILE* output) lu_queue.pop_front(); return; } - vector out_lus; - size_t pos = 0; - while(pos != UString::npos && pos != last_final_out.size()) - { - size_t start = pos; - pos = last_final_out.find("<$>"_u, start); - if(pos == UString::npos) - { - out_lus.push_back(last_final_out.substr(start)); - } - else - { - out_lus.push_back(last_final_out.substr(start, pos-start)); - pos += 3; - } - } - + UString wblank; for(size_t i = 0; i < last_final; i++) { @@ -308,30 +294,61 @@ LSXProcessor::processWord(InputFile& input, UFILE* output) { wblank += "]]"_u; } - - size_t i = 0; - for(; i < out_lus.size(); i++) + + size_t output_count = 0; + size_t pos = 0; + bool pop_queue = true; + bool replace_empty = false; + while(pos != UString::npos && pos != last_final_out.size()) { - if(i < last_final) + if (pop_queue) { + if (output_count < last_final) { + write(blank_queue[output_count], output); + if (replace_empty && blank_queue[output_count].empty()) { + u_fputc(' ', output); + } + output_count++; + } else { + u_fputc(' ', output); + } + } + write(wblank, output); + u_fputc('^', output); + size_t start = pos; + pos = last_final_out.find("<$"_u, start); + if(pos == UString::npos) { - write(blank_queue[i], output); + write(last_final_out.substr(start), output); + u_fputc('$', output); + break; } else { - u_fputc(' ', output); + write(last_final_out.substr(start, pos-start), output); + u_fputc('$', output); + pos += 2; + if (last_final_out[pos] == '-') { + pop_queue = false; + pos++; + } else if (last_final_out[pos] == '_') { + pop_queue = true; + replace_empty = true; + pos++; + } else { + pop_queue = true; + replace_empty = false; + } + pos++; } - write(wblank, output); - u_fputc('^', output); - write(out_lus[i], output); - u_fputc('$', output); } - for(; i < last_final; i++) + for(; output_count < last_final; output_count++) { - if(blank_queue[i] != " "_u) + if(blank_queue[output_count] != " "_u) { - write(blank_queue[i], output); + write(blank_queue[output_count], output); } } + blank_queue.erase(blank_queue.begin(), blank_queue.begin()+last_final); bound_blank_queue.erase(bound_blank_queue.begin(), bound_blank_queue.begin()+last_final); lu_queue.erase(lu_queue.begin(), lu_queue.begin()+last_final); diff --git a/src/lsx_processor.h b/src/lsx_processor.h index 264a5dd..e81d091 100644 --- a/src/lsx_processor.h +++ b/src/lsx_processor.h @@ -31,6 +31,8 @@ class LSXProcessor void processWord(InputFile& input, UFILE* output); int word_boundary; + int word_boundary_s; + int word_boundary_ns; int any_char; int any_tag; public: diff --git a/tests/data/capitalization.dix b/tests/data/capitalization.dix index 3f8211d..34d3ee5 100644 --- a/tests/data/capitalization.dix +++ b/tests/data/capitalization.dix @@ -9,14 +9,14 @@

- JunAjpu + JunAjpu JunAjpu

- harglebargle - harglebargle + harglebargle + harglebargle

diff --git a/tests/data/short-example.dix b/tests/data/short-example.dix index 530ce03..79aa054 100644 --- a/tests/data/short-example.dix +++ b/tests/data/short-example.dix @@ -23,9 +23,9 @@
-

taketakeout

- -

out

+

taketakeout

+ +

out