Skip to content

Commit

Permalink
feat: re-introduce the sub-word functionality in Studio
Browse files Browse the repository at this point in the history
Words can now have sub-word structure. The text to g2p is collected from
all text in the <w> element and its sub-elements.

Sub-elements can have their own lang="lang_code" attribute. Like before,
we collect all text with the same language code and g2p it together,
then concatenate the results from g2p'ing the text in different
languages.

Note, however, that the fallback-langs attribute is only allowed on
<w> elements and their parents, not on sub-word elements.
  • Loading branch information
joanise committed Jun 7, 2022
1 parent e70927f commit 6349814
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 47 deletions.
109 changes: 69 additions & 40 deletions readalongs/text/convert_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,33 @@
from readalongs.text.util import (
get_attrib_recursive,
get_lang_attrib,
iterate_over_text,
load_xml,
save_xml,
)
from readalongs.util import get_langs


def get_same_language_units(element):
    """Collect the text found in element as runs sharing one language code

    Pieces yielded by iterate_over_text() are taken in order; consecutive
    pieces whose (whitespace-stripped) language code is equal are concatenated
    into a single unit. A missing language code is recorded as "".

    Returns: list of (lang, text) pairs
    """
    units = []
    run_lang = run_text = None
    for piece_lang, piece_text in iterate_over_text(element):
        piece_lang = piece_lang.strip() if piece_lang else ""
        if run_text and piece_lang == run_lang:
            # Same language as the run in progress: just extend it.
            run_text += piece_text
            continue
        # Language changed (or nothing accumulated yet): close the previous
        # run, if there is one, and start a new run with this piece.
        if run_text:
            units.append((run_lang, run_text))
        run_lang, run_text = piece_lang, piece_text
    # Do not forget the run still open when the iteration ends.
    if run_text:
        units.append((run_lang, run_text))
    return units


def convert_words( # noqa: C901
xml, word_unit="w", output_orthography="eng-arpabet", verbose_warnings=False
):
Expand Down Expand Up @@ -127,7 +148,7 @@ def convert_word(word: str, lang: str):
f'\nRun "readalongs langs" to list languages supported by ReadAlongs Studio.'
) from e
tg = converter(word)
text = tg.output_string.strip()
text = tg.output_string
valid = converter.check(tg, shallow=True)
if not valid and verbose_warnings:
converter.check(tg, shallow=False, display_warnings=verbose_warnings)
Expand All @@ -145,47 +166,55 @@ def convert_word(word: str, lang: str):
all_g2p_valid = False
continue
# only convert text within words
if not word.text:
same_language_units = get_same_language_units(word)
if not same_language_units:
continue
g2p_lang = get_lang_attrib(word) or "und" # default: Undetermined
g2p_fallbacks = get_attrib_recursive(word, "fallback-langs")
text_to_g2p = word.text
try:
g2p_text, valid = convert_word(text_to_g2p, g2p_lang.strip())
if not valid:
# This is where we apply the g2p cascade
for lang in re.split(r"[,:]", g2p_fallbacks) if g2p_fallbacks else []:
LOGGER.warning(
f'Could not g2p "{text_to_g2p}" as {g2p_lang}. '
f"Trying fallback: {lang}."
)
g2p_lang = lang.strip()
g2p_text, valid = convert_word(text_to_g2p, g2p_lang)
if valid:
word.attrib["effective-g2p-lang"] = g2p_lang
break
else:
all_g2p_valid = False
LOGGER.warning(
f'No valid g2p conversion found for "{text_to_g2p}". '
f"Check its orthography and language code, "
f"or pick suitable g2p fallback languages."
)

# Save the g2p_text from the last conversion attemps, even when
# it's not valid, so it's in the g2p output if the user wants to
# inspect it manually.
word.attrib["ARPABET"] = g2p_text

except ValueError as e:
LOGGER.warning(
f'Could not g2p "{text_to_g2p}" due to an incorrect '
f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}'
)
all_g2p_valid = False
all_arpabet = ""
for lang, text in same_language_units:
g2p_lang = lang or "und" # default: Undetermined
g2p_fallbacks = get_attrib_recursive(word, "fallback-langs")
text_to_g2p = text.strip()
try:
g2p_text, valid = convert_word(text_to_g2p, g2p_lang)
if not valid:
# This is where we apply the g2p cascade
for lang in (
re.split(r"[,:]", g2p_fallbacks) if g2p_fallbacks else []
):
LOGGER.warning(
f'Could not g2p "{text_to_g2p}" as {g2p_lang}. '
f"Trying fallback: {lang}."
)
g2p_lang = lang.strip()
g2p_text, valid = convert_word(text_to_g2p, g2p_lang)
if valid:
word.attrib["effective-g2p-lang"] = g2p_lang
break
else:
all_g2p_valid = False
LOGGER.warning(
f'No valid g2p conversion found for "{text_to_g2p}". '
f"Check its orthography and language code, "
f"or pick suitable g2p fallback languages."
)

# Save the g2p_text from the last conversion attemps, even when
# it's not valid, so it's in the g2p output if the user wants to
# inspect it manually.

all_arpabet = all_arpabet + " " + g2p_text.strip()

except ValueError as e:
LOGGER.warning(
f'Could not g2p "{text_to_g2p}" due to an incorrect '
f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}'
)
all_g2p_valid = False

if not verbose_warnings:
break

if not verbose_warnings:
break
word.attrib["ARPABET"] = all_arpabet.strip()

return xml, all_g2p_valid

Expand Down
15 changes: 15 additions & 0 deletions readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,21 @@ def get_attrib_recursive(element, *attribs):
return None


def iterate_over_text(element):
    """Walk element and its sub-elements, yielding every non-empty piece of text

    The text of an element is reported with that element's own language
    (as returned by get_lang_attrib). The tail text following a child is
    reported with the language of the element being iterated, not the child's.

    Yields:
        (language_code, text) pairs
    """
    own_lang = get_lang_attrib(element)
    leading_text = element.text
    if leading_text:
        yield (own_lang, leading_text)
    for sub_element in element:
        # Everything inside the child comes first, in document order...
        yield from iterate_over_text(sub_element)
        # ...then the text sitting between this child and the next node.
        # NOTE(review): any child node exposing .text is visited; if the parser
        # keeps comments as children their text would be yielded too - confirm.
        trailing_text = sub_element.tail
        if trailing_text:
            yield (own_lang, trailing_text)


def get_lang_attrib(element):
"""Return the xml:lang (in priority) or lang (fallback) attribute from element
or its closest ancestor that has either, or None when neither is found.
Expand Down
1 change: 1 addition & 0 deletions test/data/patrickxtlan.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
<p>
<s><w xml:lang="eng">Patrick</w><w xml:lang="kwk-umista">xtła̱n</w></s>
<s><w xml:lang="und">Patrickxtła̱n</w></s>
<s><w>foo<syl xml:lang="eng">Patrick</syl>bar<syl xml:lang="kwk-umista">xtła̱n</syl>baz</w></s>
</p>
</TEI>
94 changes: 87 additions & 7 deletions test/test_g2p_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""Test suite for the readalongs g2p CLI command"""

import os
import re
from unittest import main

from basic_test_case import BasicTestCase
Expand All @@ -15,6 +16,16 @@
from readalongs.text.convert_xml import convert_xml


def run_convert_xml(input_string):
    """Unit-testing helper: parse input_string, run convert_xml() on it, and
    return the first item of its result serialized as a unicode string"""
    parsed_input = etree.fromstring(input_string)
    converted_tree = convert_xml(parsed_input)[0]
    return etree.tounicode(converted_tree)


def two_xml_elements(xml_text):
    """Extract the opening part of the leading two XML elements in xml_text

    Returns the prefix of xml_text up to and including the second ">", i.e.,
    the opening tags of the first two elements with their attributes, e.g.,
    '<s><w ARPABET="...">' for '<s><w ARPABET="...">word</w></s>'.

    When xml_text contains fewer than two ">" characters, it is returned
    unchanged, so that comparisons built on this helper never silently
    degenerate into comparing two empty strings.

    This is a plain string scan, not an XML parse: a literal ">" inside an
    attribute value would be mistaken for the end of a tag.
    """
    end = -1
    for _ in range(2):
        # Look for the next ">" strictly after the previous one.
        end = xml_text.find(">", end + 1)
        if end == -1:
            return xml_text
    return xml_text[: end + 1]


class TestG2pCli(BasicTestCase):
"""Test suite for the readalongs g2p CLI command"""

Expand Down Expand Up @@ -304,34 +315,103 @@ def test_align_with_preg2p(self):
self.assertIn("HH EH Y", dict_file) # "Hej" in dan
self.assertIn("D G IY T UW P IY D", dict_file) # pre-g2p'd OOV

def run_convert_xml(self, input_string):
"""wrap convert_xml to make unit testing easier"""
return etree.tounicode(convert_xml(etree.fromstring(input_string))[0])

def test_convert_xml(self):
"""unit testing for readalongs.text.convert_xml.convert_xml()
convert_xml() is the inner method in readalongs that calls g2p.
It's not very well named, but it still needs unit testing. :)
"""
self.assertEqual(
self.run_convert_xml("<t><w>word</w><w></w><n>not word</n></t>"),
run_convert_xml("<t><w>word</w><w></w><n>not word</n></t>"),
'<t><w ARPABET="W OW D D">word</w><w/><n>not word</n></t>',
)

self.assertEqual(
self.run_convert_xml(
run_convert_xml(
'<s><w xml:lang="eng">Patrick</w><w xml:lang="kwk-umista">xtła̱n</w></s>'
),
'<s><w xml:lang="eng" ARPABET="P AE T R IH K">Patrick</w>'
'<w xml:lang="kwk-umista" ARPABET="K Y T S AH N">xtła̱n</w></s>',
)

self.assertEqual(
self.run_convert_xml('<s><w xml:lang="und">Patrickxtła̱n</w></s>'),
run_convert_xml('<s><w xml:lang="und">Patrickxtła̱n</w></s>'),
'<s><w xml:lang="und" ARPABET="P AA T D IY CH K K T L AA N">Patrickxtła̱n</w></s>',
)

def test_convert_xml_with_newlines(self):
    """A word laid out over several lines must convert like its one-line form"""

    def compact_arpabet(xml_string: str) -> str:
        # ARPABET attribute of the root's first child, whitespace runs collapsed
        first_child = etree.fromstring(xml_string)[0]
        return re.sub(r"\s+", " ", first_child.attrib["ARPABET"])

    # Same two parts in both inputs; only the whitespace around them differs.
    multi_line_input = """<s><w>
<part>first part of the word</part>
<part>second part of the word</part>
</w></s>"""
    single_line_input = "<s><w><part>first part of the word</part><part>second part of the word</part></w></s>"

    multi_line_output = run_convert_xml(multi_line_input)
    single_line_output = run_convert_xml(single_line_input)
    self.assertEqual(
        compact_arpabet(multi_line_output), compact_arpabet(single_line_output)
    )

def test_convert_xml_subwords(self):
    """Check convert_xml() on <w> elements that contain sub-word elements"""
    # (input, expected output) pairs where the sub-word elements carry their
    # own xml:lang: the ARPABET on <w> must cover all the text, in order.
    exact_cases = [
        (
            '<s><w><part xml:lang="eng">Patrick</part><part xml:lang="kwk-umista">xtła̱n</part></w></s>',
            '<s><w ARPABET="P AE T R IH K K Y T S AH N"><part xml:lang="eng">Patrick</part>'
            '<part xml:lang="kwk-umista">xtła̱n</part></w></s>',
        ),
        (
            '<s><w>foo<syl xml:lang="eng">Patrick</syl>bar<syl xml:lang="kwk-umista">xtła̱n</syl>baz</w></s>',
            '<s><w ARPABET="F OW OW P AE T R IH K B AA D K Y T S AH N B AA Z">'
            'foo<syl xml:lang="eng">Patrick</syl>bar<syl xml:lang="kwk-umista">xtła̱n</syl>baz</w></s>',
        ),
    ]
    for xml_input, expected_output in exact_cases:
        self.assertEqual(run_convert_xml(xml_input), expected_output)

    # Splitting a word into same-language syllables should give the same
    # leading elements as converting the unsplit word.
    syllabified_output = run_convert_xml(
        '<s xml:lang="und"><w xml:lang="und"><syl>abc</syl><syl>def</syl><syl>ghi</syl></w></s>'.replace(
            '<s xml:lang="und">', "<s>"
        )
    )
    unsplit_output = run_convert_xml('<s xml:lang="und"><w xml:lang="und">abcdefghi</w></s>'.replace(
        '<s xml:lang="und"><w xml:lang="und">', '<s><w xml:lang="und">'
    ))
    self.assertEqual(
        two_xml_elements(syllabified_output),
        two_xml_elements(unsplit_output),
    )

    # Same comparison with highlighting spans inside a word, language set on <s>.
    moh_example_input_with_highlights = "<s xml:lang='moh'><w><span class='pronoun'>tati</span><span class='root'>atkèn:se</span><span class='aspect'>hkwe'</span></w></s>"
    moh_example_input_merged = "<s xml:lang='moh'><w>tatiatkèn:sehkwe'</w></s>"
    highlighted_prefix = two_xml_elements(
        run_convert_xml(moh_example_input_with_highlights)
    )
    merged_prefix = two_xml_elements(run_convert_xml(moh_example_input_merged))
    self.assertEqual(highlighted_prefix, merged_prefix)

    # NOTE: this fuller document is defined but not converted or checked here.
    moh_example_input_full = """
<document xml:lang='moh'>
<s>
<w><span class='pronoun'>tati</span><span class='root'>atkèn:se</span><span class='aspect'>hkwe'</span></w>
</s>
</document>"""

    # A sub-word part that cannot be converted: its text is expected verbatim
    # in the output and a warning about it is expected in the log.
    example_with_fallback_lang = """
<document xml:lang="fra" fallback-langs="eng"><s>
<w><part lang="fra">ceci</part><part lang="iku">not_really_iku</part></w>
</s></document>"""
    with self.assertLogs(LOGGER, level="WARNING") as captured_logs:
        converted = run_convert_xml(example_with_fallback_lang)
    self.assertIn("S AH S IY not_really_iku", converted)
    warnings_text = "\n".join(captured_logs.output)
    self.assertIn(
        'No valid g2p conversion found for "not_really_iku"', warnings_text
    )

def test_convert_xml_invalid(self):
"""test readalongs.text.convert_xml.convert_xml() with invalid input"""
xml = etree.fromstring('<s><w ARPABET="V AA L IY D">valid</w></s>')
Expand Down

0 comments on commit 6349814

Please sign in to comment.