feat: re-introduce the sub-word functionality in Studio

Words can now have sub-word structure. The text to g2p is collected from all text in the <w> element and its sub-elements. Sub-elements can have their own lang="lang_code" attribute. Like before, we collect all text with the same language code and g2p it together, then concatenate the results from g2p'ing the text in different languages. Note, however, that the fallback-langs attribute is only allowed on the <w> element and its parents, not on sub-word elements.
ReadAlongs · Jun 7, 2022 · 6349814 · 6349814
1 parent e70927f
commit 6349814
Show file tree

Hide file tree

Showing 4 changed files with 172 additions and 47 deletions.
diff --git a/readalongs/text/convert_xml.py b/readalongs/text/convert_xml.py
@@ -46,12 +46,33 @@
 from readalongs.text.util import (
     get_attrib_recursive,
     get_lang_attrib,
+    iterate_over_text,
     load_xml,
     save_xml,
 )
 from readalongs.util import get_langs
 
 
+def get_same_language_units(element):
+    """Collect the text under element as runs that share one language code
+
+    Walks the (lang, text) pairs produced by iterate_over_text(element) and
+    concatenates consecutive pieces whose language code, once stripped of
+    surrounding whitespace, is the same. A missing language code is
+    normalized to "".
+
+    Returns: list of (lang, text) pairs, in the order the text was found
+    """
+    units = []
+    pending_lang = pending_text = None
+    for raw_lang, piece in iterate_over_text(element):
+        piece_lang = raw_lang.strip() if raw_lang else ""
+        if pending_text and piece_lang == pending_lang:
+            # Still in the same language: extend the run being built.
+            pending_text += piece
+            continue
+        # Language changed (or nothing is pending yet): close the previous
+        # run, if any, and start a new one with this piece.
+        if pending_text:
+            units.append((pending_lang, pending_text))
+        pending_lang, pending_text = piece_lang, piece
+    # Flush the last run, which no language change has closed.
+    if pending_text:
+        units.append((pending_lang, pending_text))
+    return units
+
+
 def convert_words(  # noqa: C901
     xml, word_unit="w", output_orthography="eng-arpabet", verbose_warnings=False
 ):
@@ -127,7 +148,7 @@ def convert_word(word: str, lang: str):
                     f'\nRun "readalongs langs" to list languages supported by ReadAlongs Studio.'
                 ) from e
             tg = converter(word)
-            text = tg.output_string.strip()
+            text = tg.output_string
             valid = converter.check(tg, shallow=True)
             if not valid and verbose_warnings:
                 converter.check(tg, shallow=False, display_warnings=verbose_warnings)
@@ -145,47 +166,55 @@ def convert_word(word: str, lang: str):
                 all_g2p_valid = False
             continue
         # only convert text within words
-        if not word.text:
+        same_language_units = get_same_language_units(word)
+        if not same_language_units:
             continue
-        g2p_lang = get_lang_attrib(word) or "und"  # default: Undetermined
-        g2p_fallbacks = get_attrib_recursive(word, "fallback-langs")
-        text_to_g2p = word.text
-        try:
-            g2p_text, valid = convert_word(text_to_g2p, g2p_lang.strip())
-            if not valid:
-                # This is where we apply the g2p cascade
-                for lang in re.split(r"[,:]", g2p_fallbacks) if g2p_fallbacks else []:
-                    LOGGER.warning(
-                        f'Could not g2p "{text_to_g2p}" as {g2p_lang}. '
-                        f"Trying fallback: {lang}."
-                    )
-                    g2p_lang = lang.strip()
-                    g2p_text, valid = convert_word(text_to_g2p, g2p_lang)
-                    if valid:
-                        word.attrib["effective-g2p-lang"] = g2p_lang
-                        break
-                else:
-                    all_g2p_valid = False
-                    LOGGER.warning(
-                        f'No valid g2p conversion found for "{text_to_g2p}". '
-                        f"Check its orthography and language code, "
-                        f"or pick suitable g2p fallback languages."
-                    )
-
-            # Save the g2p_text from the last conversion attemps, even when
-            # it's not valid, so it's in the g2p output if the user wants to
-            # inspect it manually.
-            word.attrib["ARPABET"] = g2p_text
-
-        except ValueError as e:
-            LOGGER.warning(
-                f'Could not g2p "{text_to_g2p}" due to an incorrect '
-                f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}'
-            )
-            all_g2p_valid = False
+        all_arpabet = ""
+        for lang, text in same_language_units:
+            g2p_lang = lang or "und"  # default: Undetermined
+            g2p_fallbacks = get_attrib_recursive(word, "fallback-langs")
+            text_to_g2p = text.strip()
+            try:
+                g2p_text, valid = convert_word(text_to_g2p, g2p_lang)
+                if not valid:
+                    # This is where we apply the g2p cascade
+                    for lang in (
+                        re.split(r"[,:]", g2p_fallbacks) if g2p_fallbacks else []
+                    ):
+                        LOGGER.warning(
+                            f'Could not g2p "{text_to_g2p}" as {g2p_lang}. '
+                            f"Trying fallback: {lang}."
+                        )
+                        g2p_lang = lang.strip()
+                        g2p_text, valid = convert_word(text_to_g2p, g2p_lang)
+                        if valid:
+                            word.attrib["effective-g2p-lang"] = g2p_lang
+                            break
+                    else:
+                        all_g2p_valid = False
+                        LOGGER.warning(
+                            f'No valid g2p conversion found for "{text_to_g2p}". '
+                            f"Check its orthography and language code, "
+                            f"or pick suitable g2p fallback languages."
+                        )
+
+                # Save the g2p_text from the last conversion attempts, even when
+                # it's not valid, so it's in the g2p output if the user wants to
+                # inspect it manually.
+
+                all_arpabet = all_arpabet + " " + g2p_text.strip()
+
+            except ValueError as e:
+                LOGGER.warning(
+                    f'Could not g2p "{text_to_g2p}" due to an incorrect '
+                    f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}'
+                )
+                all_g2p_valid = False
+
+                if not verbose_warnings:
+                    break
 
-            if not verbose_warnings:
-                break
+        word.attrib["ARPABET"] = all_arpabet.strip()
 
     return xml, all_g2p_valid
 

diff --git a/readalongs/text/util.py b/readalongs/text/util.py
@@ -69,6 +69,21 @@ def get_attrib_recursive(element, *attribs):
         return None
 
 
+def iterate_over_text(element):
+    """Walk element depth-first and yield every non-empty piece of text in it
+
+    Text is reported in document order: the element's own leading text, then,
+    for each child, everything inside that child followed by the child's tail.
+
+    Yields:
+        (language_code, text) pairs, where language_code is what
+        get_lang_attrib() returns for the element the text belongs to
+    """
+    own_lang = get_lang_attrib(element)
+    leading_text = element.text
+    if leading_text:
+        yield own_lang, leading_text
+    for sub_element in element:
+        yield from iterate_over_text(sub_element)
+        # A child's tail sits after its closing tag, i.e., directly inside
+        # element, so it is reported with element's language, not the child's.
+        trailing_text = sub_element.tail
+        if trailing_text:
+            yield own_lang, trailing_text
+
+
 def get_lang_attrib(element):
     """Return the xml:lang (in priority) or lang (fallback) attribute from element
     or its closest ancestor that has either, or None when neither is found.

diff --git a/test/data/patrickxtlan.xml b/test/data/patrickxtlan.xml
@@ -3,5 +3,6 @@
     <p>
         <s><w xml:lang="eng">Patrick</w><w xml:lang="kwk-umista">xtła̱n</w></s>
         <s><w xml:lang="und">Patrickxtła̱n</w></s>
+	<s><w>foo<syl xml:lang="eng">Patrick</syl>bar<syl xml:lang="kwk-umista">xtła̱n</syl>baz</w></s>
     </p>
 </TEI>
diff --git a/test/test_g2p_cli.py b/test/test_g2p_cli.py
@@ -3,6 +3,7 @@
 """Test suite for the readalongs g2p CLI command"""
 
 import os
+import re
 from unittest import main
 
 from basic_test_case import BasicTestCase
@@ -15,6 +16,16 @@
 from readalongs.text.convert_xml import convert_xml
 
 
+def run_convert_xml(input_string):
+    """Parse input_string, run convert_xml() on it and serialize the result
+
+    Only the first item returned by convert_xml() is kept and turned back
+    into a string; the rest of its return value is ignored.
+    """
+    parsed_input = etree.fromstring(input_string)
+    converted = convert_xml(parsed_input)[0]
+    return etree.tounicode(converted)
+
+
+def two_xml_elements(xml_text):
+    """Extract the opening part of the leading two XML elements in xml_text
+
+    Returns everything up to and including the second ">" in xml_text, e.g.,
+    '<s><w ARPABET="A B">abc</w></s>' yields '<s><w ARPABET="A B">'.
+
+    When xml_text has fewer than two ">" characters, it is returned unchanged,
+    so that comparing two results can never succeed just because both got
+    truncated to nothing.
+    """
+    # Slicing up to the first "<" is not usable here: serialized XML starts
+    # with "<", so that slice is always empty and any comparison of two such
+    # results passes vacuously. Cut after the second closing ">" instead.
+    first_close = xml_text.find(">")
+    if first_close == -1:
+        return xml_text
+    second_close = xml_text.find(">", first_close + 1)
+    if second_close == -1:
+        return xml_text
+    return xml_text[: second_close + 1]
+
+
 class TestG2pCli(BasicTestCase):
     """Test suite for the readalongs g2p CLI command"""
 
@@ -304,34 +315,103 @@ def test_align_with_preg2p(self):
             self.assertIn("HH EH Y", dict_file)  # "Hej" in dan
             self.assertIn("D G IY T UW P IY D", dict_file)  # pre-g2p'd OOV
 
-    def run_convert_xml(self, input_string):
-        """wrap convert_xml to make unit testing easier"""
-        return etree.tounicode(convert_xml(etree.fromstring(input_string))[0])
-
     def test_convert_xml(self):
         """unit testing for readalongs.text.convert_xml.convert_xml()
 
         convert_xml() is the inner method in readalongs that calls g2p.
         It's not very well named, but it still needs unit testing. :)
         """
         self.assertEqual(
-            self.run_convert_xml("<t><w>word</w><w></w><n>not word</n></t>"),
+            run_convert_xml("<t><w>word</w><w></w><n>not word</n></t>"),
             '<t><w ARPABET="W OW D D">word</w><w/><n>not word</n></t>',
         )
 
         self.assertEqual(
-            self.run_convert_xml(
+            run_convert_xml(
                 '<s><w xml:lang="eng">Patrick</w><w xml:lang="kwk-umista">xtła̱n</w></s>'
             ),
             '<s><w xml:lang="eng" ARPABET="P AE T R IH K">Patrick</w>'
             '<w xml:lang="kwk-umista" ARPABET="K Y T S AH N">xtła̱n</w></s>',
         )
 
         self.assertEqual(
-            self.run_convert_xml('<s><w xml:lang="und">Patrickxtła̱n</w></s>'),
+            run_convert_xml('<s><w xml:lang="und">Patrickxtła̱n</w></s>'),
             '<s><w xml:lang="und" ARPABET="P AA T D IY CH K K T L AA N">Patrickxtła̱n</w></s>',
         )
 
+    def test_convert_xml_with_newlines(self):
+        """Whitespace between the parts of a word must not break the conversion
+
+        The same word is converted twice, once with newlines and indentation
+        around its <part> elements and once without, and the resulting ARPABET
+        attributes are compared after collapsing runs of whitespace.
+        """
+
+        def first_child_arpabet(xml_string: str) -> str:
+            """ARPABET of the root's first child, whitespace runs collapsed"""
+            first_child = etree.fromstring(xml_string)[0]
+            return re.sub(r"\s+", " ", first_child.attrib["ARPABET"])
+
+        multi_line_input = """<s><w>
+               <part>first part of the word</part>
+               <part>second part of the word</part>
+               </w></s>"""
+        single_line_input = (
+            "<s><w><part>first part of the word</part>"
+            "<part>second part of the word</part></w></s>"
+        )
+        from_multi_line = run_convert_xml(multi_line_input)
+        from_single_line = run_convert_xml(single_line_input)
+        self.assertEqual(
+            first_child_arpabet(from_multi_line),
+            first_child_arpabet(from_single_line),
+        )
+
+    def test_convert_xml_subwords(self):
+        """Unit testing for reintroducing subword units
+
+        Covers sub-word elements that each carry their own language, text
+        interleaved with sub-word elements, sub-word markup compared against
+        the same word written without markup, and a sub-word unit for which no
+        valid g2p conversion exists even with fallback-langs in effect.
+        """
+        # Each part has its own language; the results are concatenated.
+        self.assertEqual(
+            run_convert_xml(
+                '<s><w><part xml:lang="eng">Patrick</part><part xml:lang="kwk-umista">xtła̱n</part></w></s>'
+            ),
+            '<s><w ARPABET="P AE T R IH K K Y T S AH N"><part xml:lang="eng">Patrick</part>'
+            '<part xml:lang="kwk-umista">xtła̱n</part></w></s>',
+        )
+
+        # Text sitting directly in <w> is interleaved with sub-word elements.
+        self.assertEqual(
+            run_convert_xml(
+                '<s><w>foo<syl xml:lang="eng">Patrick</syl>bar<syl xml:lang="kwk-umista">xtła̱n</syl>baz</w></s>'
+            ),
+            '<s><w ARPABET="F OW OW P AE T R IH K B AA D K Y T S AH N B AA Z">'
+            'foo<syl xml:lang="eng">Patrick</syl>bar<syl xml:lang="kwk-umista">xtła̱n</syl>baz</w></s>',
+        )
+
+        # Splitting a word into same-language syllables should give the same
+        # opening <s><w ...> tags as the unsplit word.
+        converted_by_syllable = run_convert_xml(
+            '<s><w xml:lang="und"><syl>abc</syl><syl>def</syl><syl>ghi</syl></w></s>'
+        )
+        converted_as_a_whole = run_convert_xml('<s><w xml:lang="und">abcdefghi</w></s>')
+        self.assertEqual(
+            two_xml_elements(converted_by_syllable),
+            two_xml_elements(converted_as_a_whole),
+        )
+
+        # Same comparison with highlighting spans, the language coming from <s>.
+        moh_example_input_with_highlights = "<s xml:lang='moh'><w><span class='pronoun'>tati</span><span class='root'>atkèn:se</span><span class='aspect'>hkwe'</span></w></s>"
+        moh_example_input_merged = "<s xml:lang='moh'><w>tatiatkèn:sehkwe'</w></s>"
+        self.assertEqual(
+            two_xml_elements(run_convert_xml(moh_example_input_with_highlights)),
+            two_xml_elements(run_convert_xml(moh_example_input_merged)),
+        )
+
+        # The same word inside a full document, with the language declared on
+        # the root and whitespace between elements. Run it through the
+        # converter and check the word gets an ARPABET attribute, rather than
+        # leaving this fixture unused behind a commented-out debug print.
+        moh_example_input_full = """
+            <document xml:lang='moh'>
+              <s>
+                <w><span class='pronoun'>tati</span><span class='root'>atkèn:se</span><span class='aspect'>hkwe'</span></w>
+              </s>
+            </document>"""
+        self.assertIn("ARPABET=", run_convert_xml(moh_example_input_full))
+
+        # The second part cannot be converted, even with the fallback language:
+        # its text is expected as-is in the output and a warning must be logged.
+        example_with_fallback_lang = """
+            <document xml:lang="fra" fallback-langs="eng"><s>
+              <w><part lang="fra">ceci</part><part lang="iku">not_really_iku</part></w>
+            </s></document>"""
+        with self.assertLogs(LOGGER, level="WARNING") as cm:
+            result = run_convert_xml(example_with_fallback_lang)
+        self.assertIn("S AH S IY not_really_iku", result)
+        logger_output = "\n".join(cm.output)
+        self.assertIn(
+            'No valid g2p conversion found for "not_really_iku"', logger_output
+        )
+
     def test_convert_xml_invalid(self):
         """test readalongs.text.convert_xml.convert_xml() with invalid input"""
         xml = etree.fromstring('<s><w ARPABET="V AA L IY D">valid</w></s>')