Skip to content

Commit

Permalink
fix(g2p): better error messages on invalid language codes
Browse files Browse the repository at this point in the history
  • Loading branch information
joanise committed Nov 16, 2021
1 parent 78ab484 commit 9e71372
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 28 deletions.
2 changes: 1 addition & 1 deletion readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def align_audio( # noqa: C901
if not valid:
raise RuntimeError(
"Some words could not be g2p'd correctly. Aborting. "
"Run with --g2p-verbose for detailed g2p error logs."
"Run with --g2p-verbose for more detailed g2p error logs."
)

# Prepare the SoundsSwallower (formerly PocketSphinx) configuration
Expand Down
80 changes: 53 additions & 27 deletions readalongs/text/convert_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
import copy
import os

from g2p import make_g2p
from g2p import NetworkXNoPath, make_g2p
from g2p.mappings.langs.utils import is_arpabet
from g2p.transducer import CompositeTransductionGraph, TransductionGraph

Expand All @@ -55,6 +55,9 @@
load_xml,
save_xml,
)
from readalongs.util import getLangs

LANGS, LANG_NAMES = getLangs()


def convert_word(word: str, lang: str, output_orthography: str, verbose_warnings: bool):
Expand All @@ -79,7 +82,18 @@ def convert_word(word: str, lang: str, output_orthography: str, verbose_warnings
eng_valid = False
return eng_converter, eng_tg, eng_text, eng_indices, eng_valid
else:
converter = make_g2p(lang, output_orthography)
try:
converter = make_g2p(lang, output_orthography)
except FileNotFoundError as e:
raise ValueError(
f'Could not g2p "{word}" as "{lang}": invalid language code. '
f"Use one of {LANGS}"
) from e
except NetworkXNoPath as e:
raise ValueError(
f'Could not g2p "{word}" as "{lang}": no path to "{output_orthography}". '
f"Use one of {LANGS}"
) from e
tg = converter(word)
text = tg.output_string.strip()
indices = tg.edges
Expand Down Expand Up @@ -109,31 +123,43 @@ def convert_words(
g2p_lang = get_lang_attrib(word) or "und" # default: Undetermined
g2p_fallbacks = get_attrib_recursive(word, "fallback-langs")
text_to_g2p = word.text
converter, tg, g2p_text, indices, valid = convert_word(
text_to_g2p, g2p_lang, output_orthography, verbose_warnings
)
if not valid:
# This is where we apply the g2p cascade
for lang in g2p_fallbacks.split(":") if g2p_fallbacks else []:
LOGGER.warning(
f'Could not g2p "{text_to_g2p}" as {g2p_lang}. Trying fallback: {lang}.'
)
g2p_lang = lang
converter, tg, g2p_text, indices, valid = convert_word(
text_to_g2p, g2p_lang, output_orthography, verbose_warnings
)
if valid:
word.attrib["effective_g2p_lang"] = g2p_lang
break
else:
all_g2p_valid = False
LOGGER.warning(
f'No valid g2p conversion found for "{text_to_g2p}". '
f"Check its orthography and language code, "
f"or pick suitable g2p fallback languages."
)

word.attrib["ARPABET"] = g2p_text
try:
converter, tg, g2p_text, indices, valid = convert_word(
text_to_g2p, g2p_lang, output_orthography, verbose_warnings
)
if not valid:
# This is where we apply the g2p cascade
for lang in g2p_fallbacks.split(":") if g2p_fallbacks else []:
LOGGER.warning(
f'Could not g2p "{text_to_g2p}" as {g2p_lang}. '
f"Trying fallback: {lang}."
)
g2p_lang = lang
converter, tg, g2p_text, indices, valid = convert_word(
text_to_g2p, g2p_lang, output_orthography, verbose_warnings
)
if valid:
word.attrib["effective-g2p-lang"] = g2p_lang
break
else:
all_g2p_valid = False
LOGGER.warning(
f'No valid g2p conversion found for "{text_to_g2p}". '
f"Check its orthography and language code, "
f"or pick suitable g2p fallback languages."
)

# Save the g2p_text from the last conversion attempt, even when
# it's not valid, so it's in the g2p output if the user wants to
# inspect it manually.
word.attrib["ARPABET"] = g2p_text

except ValueError as e:
LOGGER.warning(
f'Could not g2p "{text_to_g2p}" due to an incorrect '
f'"xml:lang", "lang" or "fallback-langs" attribute in the XML: {e}'
)
all_g2p_valid = False

return xml, all_g2p_valid

Expand Down
17 changes: 17 additions & 0 deletions test/test_g2p_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from readalongs.align import align_audio
from readalongs.cli import align, g2p, prepare, tokenize
from readalongs.log import LOGGER
from readalongs.text.convert_xml import convert_xml


Expand Down Expand Up @@ -322,6 +323,22 @@ def test_convert_xml_invalid(self):
)
self.assertFalse(valid, "convert_xml with invalid pre-g2p'd text")

def test_invalid_langs_in_xml(self):
    """Check that bad language attributes in the XML yield warnings.

    The input has two words: one with lang "eng" and fallback-langs "foo",
    and one with lang "crx-syl". convert_xml() is expected to report the
    document as not valid and to log, at WARNING level or above, one
    message naming "foo" as an invalid language code and one saying there
    is no path from "crx-syl" to "eng-arpabet".
    """
    doc = etree.fromstring(
        """
<s>
<w lang="eng" fallback-langs="foo">français falls back to invalid foo</w>
<w lang="crx-syl">no path to arpabet</w>
</s>
"""
    )

    # Capture everything logged on LOGGER while converting; the converted
    # tree itself is not inspected by this test.
    with self.assertLogs(LOGGER, level="WARNING") as captured:
        _, is_valid = convert_xml(doc)

    self.assertFalse(is_valid)

    # Join the captured records so each expected fragment can be searched
    # for regardless of which record it landed in.
    warnings_text = "\n".join(captured.output)
    expected_fragments = (
        '"foo": invalid language code',
        '"crx-syl": no path to "eng-arpabet"',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, warnings_text)


if __name__ == "__main__":
main()

0 comments on commit 9e71372

Please sign in to comment.