Skip to content

Commit

Permalink
fix(sub-word): the word is all its sub-elements, not just word.text
Browse files Browse the repository at this point in the history
Sub-words were not working properly everywhere because although the
g2p'ing process collected all text from sub-elements, other parts of the
code just assumed word_el.text would give you the word's text. From now
on, use readalongs.text.util.get_word_text() to get the word's actual
text from its etree.ElementTree structure.
  • Loading branch information
joanise committed Jul 7, 2022
1 parent 9751382 commit 1dc8527
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 17 deletions.
24 changes: 11 additions & 13 deletions readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,13 @@
from readalongs.text.make_package import create_web_component_html
from readalongs.text.make_smil import make_smil
from readalongs.text.tokenize_xml import tokenize_xml
from readalongs.text.util import parse_time, save_minimal_index_html, save_txt, save_xml
from readalongs.text.util import (
get_word_text,
parse_time,
save_minimal_index_html,
save_txt,
save_xml,
)

MODEL_DIR = os.path.join(os.path.dirname(__file__), "static", "model")
DEFAULT_ACOUSTIC_MODEL = "cmusphinx-en-us-5.2"
Expand Down Expand Up @@ -927,17 +933,9 @@ def save_readalong(
save_images(config=config, output_dir=output_dir)


def get_word_from_id(xml: etree, el_id: str) -> str:
    """Look up an element by its id attribute and return its first text node.

    Only text nodes that are direct children of the matching element are
    selected by the query; text held inside nested sub-elements is not.

    Args:
        xml (etree): XML document to search
        el_id (str): value of the ``id`` attribute to look for

    Returns:
        str: the first text node found under the element with id == el_id

    Raises:
        IndexError: if the query yields no result (presumably when no element
            has that id, or the element has no direct text)
    """
    query = '//*[@id="%s"]/text()' % el_id
    text_nodes = xml.xpath(query)
    return text_nodes[0]
def get_word_element(xml: etree.ElementTree, el_id: str) -> etree.ElementTree:
    """Get the xml etree for a given word by its id

    Args:
        xml (etree.ElementTree): XML document to search
        el_id (str): value of the ``id`` attribute of the wanted ``<w>`` element

    Returns:
        etree.ElementTree: the first ``<w>`` element whose id equals el_id

    Raises:
        IndexError: if no ``<w>`` element with that id is found
    """
    # Pass the id as an XPath variable rather than interpolating it into the
    # expression, so ids containing quote characters cannot break the query.
    matches = xml.xpath("//w[@id=$el_id]", el_id=el_id)
    return matches[0]


def get_words_and_sentences(results) -> Tuple[List[dict], List[List[dict]]]:
Expand Down Expand Up @@ -975,7 +973,7 @@ def get_words_and_sentences(results) -> Tuple[List[dict], List[List[dict]]]:
words = []
current_sent = sent_i
word = {
"text": get_word_from_id(xml, el["id"]),
"text": get_word_text(get_word_element(xml, el["id"])),
"start": el["start"],
"end": el["end"],
}
Expand Down
3 changes: 2 additions & 1 deletion readalongs/text/convert_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from readalongs.text.util import (
get_attrib_recursive,
get_lang_attrib,
get_word_text,
iterate_over_text,
load_xml,
save_xml,
Expand Down Expand Up @@ -161,7 +162,7 @@ def convert_word(word: str, lang: str):
arpabet = word.attrib["ARPABET"]
if not is_arpabet(arpabet):
LOGGER.warning(
f'Pre-g2p\'d text "{word.text}" has invalid ARPABET conversion "{arpabet}"'
f'Pre-g2p\'d text "{get_word_text(word)}" has invalid ARPABET conversion "{arpabet}"'
)
all_g2p_valid = False
continue
Expand Down
3 changes: 2 additions & 1 deletion readalongs/text/make_fsg.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from slugify import slugify

from readalongs.log import LOGGER
from readalongs.text.util import get_word_text

FSG_TEMPLATE = """FSG_BEGIN {{name}}
NUM_STATES {{num_states}}
Expand All @@ -37,7 +38,7 @@ def get_ids(word_elements: list):
for e in word_elements:
if "id" not in e.attrib: # don't put in elements with no id
continue
if not e.text or not e.text.strip():
if not get_word_text(e):
LOGGER.warning("No text in node %s", e.attrib["id"])
continue
yield e.attrib["id"]
Expand Down
9 changes: 7 additions & 2 deletions readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def get_attrib_recursive(element, *attribs):
return None


def iterate_over_text(element):
def iterate_over_text(element: etree.ElementTree):
"""Iterate over all actual text contained with element and its sub-elements
Yields:
Expand All @@ -84,7 +84,12 @@ def iterate_over_text(element):
yield (lang, child.tail)


def get_lang_attrib(element):
def get_word_text(word_element: etree.ElementTree) -> str:
    """Given a word element, extract all its text

    The text is assembled from every fragment yielded by iterate_over_text()
    for the element, concatenated in the order they are yielded.

    Args:
        word_element (etree.ElementTree): the word element to read

    Returns:
        str: the concatenation of all text fragments (empty if there are none)
    """
    fragments = []
    # iterate_over_text() yields (lang, text) pairs; only the text is needed.
    for _lang, fragment in iterate_over_text(word_element):
        fragments.append(fragment)
    return "".join(fragments)


def get_lang_attrib(element: etree.ElementTree):
"""Return the xml:lang (in priority) or lang (fallback) attribute from element
or its closest ancestor that has either, or None when neither is found.
"""
Expand Down

0 comments on commit 1dc8527

Please sign in to comment.