fix(sub-word): the word is all its sub-elements, not just word.text

Sub-words were not working properly everywhere because, although the g2p'ing process collected all text from sub-elements, other parts of the code just assumed word_el.text would give you the word's text. From now on, use readalongs.text.util.get_word_text() to get the word's actual text from its etree.ElementTree structure.
ReadAlongs · Jul 7, 2022 · 1dc8527 · 1dc8527
1 parent 9751382
commit 1dc8527
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 17 deletions.
diff --git a/readalongs/align.py b/readalongs/align.py
@@ -43,7 +43,13 @@
 from readalongs.text.make_package import create_web_component_html
 from readalongs.text.make_smil import make_smil
 from readalongs.text.tokenize_xml import tokenize_xml
-from readalongs.text.util import parse_time, save_minimal_index_html, save_txt, save_xml
+from readalongs.text.util import (
+    get_word_text,
+    parse_time,
+    save_minimal_index_html,
+    save_txt,
+    save_xml,
+)
 
 MODEL_DIR = os.path.join(os.path.dirname(__file__), "static", "model")
 DEFAULT_ACOUSTIC_MODEL = "cmusphinx-en-us-5.2"
@@ -927,17 +933,9 @@ def save_readalong(
         save_images(config=config, output_dir=output_dir)
 
 
-def get_word_from_id(xml: etree, el_id: str) -> str:
-    """Given an XML document, get the innertext at id
-
-    Args:
-        xml (etree): XML document
-        el_id (str): ID
-
-    Returns:
-        str: Innertext of element with id==el_id in xml
-    """
-    return xml.xpath('//*[@id="%s"]/text()' % el_id)[0]
+def get_word_element(xml: etree.ElementTree, el_id: str) -> etree.ElementTree:
+    """Get the xml etree for a given word by its id"""
+    return xml.xpath(f'//w[@id="{el_id}"]')[0]
 
 
 def get_words_and_sentences(results) -> Tuple[List[dict], List[List[dict]]]:
@@ -975,7 +973,7 @@ def get_words_and_sentences(results) -> Tuple[List[dict], List[List[dict]]]:
             words = []
             current_sent = sent_i
         word = {
-            "text": get_word_from_id(xml, el["id"]),
+            "text": get_word_text(get_word_element(xml, el["id"])),
             "start": el["start"],
             "end": el["end"],
         }

diff --git a/readalongs/text/convert_xml.py b/readalongs/text/convert_xml.py
@@ -46,6 +46,7 @@
 from readalongs.text.util import (
     get_attrib_recursive,
     get_lang_attrib,
+    get_word_text,
     iterate_over_text,
     load_xml,
     save_xml,
@@ -161,7 +162,7 @@ def convert_word(word: str, lang: str):
             arpabet = word.attrib["ARPABET"]
             if not is_arpabet(arpabet):
                 LOGGER.warning(
-                    f'Pre-g2p\'d text "{word.text}" has invalid ARPABET conversion "{arpabet}"'
+                    f'Pre-g2p\'d text "{get_word_text(word)}" has invalid ARPABET conversion "{arpabet}"'
                 )
                 all_g2p_valid = False
             continue

diff --git a/readalongs/text/make_fsg.py b/readalongs/text/make_fsg.py
@@ -14,6 +14,7 @@
 from slugify import slugify
 
 from readalongs.log import LOGGER
+from readalongs.text.util import get_word_text
 
 FSG_TEMPLATE = """FSG_BEGIN {{name}}
 NUM_STATES {{num_states}}
@@ -37,7 +38,7 @@ def get_ids(word_elements: list):
     for e in word_elements:
         if "id" not in e.attrib:  # don't put in elements with no id
             continue
-        if not e.text or not e.text.strip():
+        if not get_word_text(e):
             LOGGER.warning("No text in node %s", e.attrib["id"])
             continue
         yield e.attrib["id"]

diff --git a/readalongs/text/util.py b/readalongs/text/util.py
@@ -69,7 +69,7 @@ def get_attrib_recursive(element, *attribs):
         return None
 
 
-def iterate_over_text(element):
+def iterate_over_text(element: etree.ElementTree):
     """Iterate over all actual text contained with element and its sub-elements
 
     Yields:
@@ -84,7 +84,12 @@ def iterate_over_text(element):
             yield (lang, child.tail)
 
 
-def get_lang_attrib(element):
+def get_word_text(word_element: etree.ElementTree) -> str:
+    """Given a word element, extract all its text"""
+    return "".join(text for _, text in iterate_over_text(word_element))
+
+
+def get_lang_attrib(element: etree.ElementTree):
     """Return the xml:lang (in priority) or lang (fallback) attribute from element
     or its closest ancestor that has either, or None when neither is found.
     """