From 5465d22e7d72de6e09beb60e0e4244b8a0b29a11 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 31 Jan 2023 11:39:47 -0500 Subject: [PATCH] feat: basically s/tei/readalong/gi --- readalongs/align.py | 22 +++++++++++----------- readalongs/cli.py | 8 ++++---- readalongs/text/add_ids_to_xml.py | 2 +- readalongs/text/convert_xml.py | 2 +- readalongs/text/end_to_end.py | 2 +- readalongs/web_api.py | 13 +++++++------ 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/readalongs/align.py b/readalongs/align.py index 5f41962f..96048304 100644 --- a/readalongs/align.py +++ b/readalongs/align.py @@ -176,7 +176,7 @@ def parse_and_make_xml( """Parse XML input and run tokenization and G2P. Args: - xml_path (str): Path to XML input file in TEI-like format + xml_path (str): Path to XML input file in RAS format config (dict): Optional; ReadAlong-Studio configuration to use save_temps (str): Optional; Save temporary files, by default None verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings @@ -541,7 +541,7 @@ def align_audio( """Align an XML input file to an audio file. Args: - xml_path (str): Path to XML input file in TEI-like format + xml_path (str): Path to XML input file in RAS format audio_path (str): Path to audio input. Must be in a format supported by ffmpeg unit (str): Optional; Element to create alignments for, by default 'w' bare (boolean): Optional; @@ -1120,8 +1120,8 @@ def convert_to_xhtml(tokenized_xml, title="Book"): head.append(link_element) -TEI_TEMPLATE = """ - +RAS_TEMPLATE = """ + {{#pages}} @@ -1137,12 +1137,12 @@ def convert_to_xhtml(tokenized_xml, title="Book"): {{/pages}} - + """ -def create_tei_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str: - """Create input xml in TEI standard. +def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str: + """Create input xml in RAS format. Uses the line sequence to infer paragraph and sentence structure from plain text: Assumes a double blank line marks a page break, and a single blank line marks a paragraph break. @@ -1184,11 +1184,11 @@ def create_tei_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> paragraphs.append({"sentences": sentences}) if paragraphs: pages.append({"paragraphs": paragraphs}) - return chevron.render(TEI_TEMPLATE, {**kwargs, **{"pages": pages}}) + return chevron.render(RAS_TEMPLATE, {**kwargs, **{"pages": pages}}) -def create_input_tei(**kwargs): - """Create input xml in TEI standard. +def create_input_ras(**kwargs): + """Create input xml in RAS format. Uses readlines to infer paragraph and sentence structure from plain text. Assumes a double blank line marks a page break, and a single blank line marks a paragraph break. @@ -1241,7 +1241,7 @@ def create_input_tei(**kwargs): prefix="readalongs_xml_", suffix=".xml", delete=True ) filename = outfile.name - xml = create_tei_from_text(text, text_langs) + xml = create_ras_from_text(text, text_langs) outfile.write(xml.encode("utf-8")) outfile.flush() outfile.close() diff --git a/readalongs/cli.py b/readalongs/cli.py index cd9dfd1d..42aff681 100644 --- a/readalongs/cli.py +++ b/readalongs/cli.py @@ -20,7 +20,7 @@ from lxml import etree from readalongs._version import __version__ -from readalongs.align import align_audio, create_input_tei, save_readalong +from readalongs.align import align_audio, create_input_ras, save_readalong from readalongs.log import LOGGER from readalongs.text.add_ids_to_xml import add_ids from readalongs.text.convert_xml import convert_xml @@ -369,7 +369,7 @@ def align(**kwargs): # noqa: C901 # some versions of flake8 need this here ins languages.append("und") plain_textfile = kwargs["textfile"] try: - _, xml_textfile = create_input_tei( + _, xml_textfile = create_input_ras( input_file_name=plain_textfile, text_languages=languages, save_temps=temp_base, @@ -532,7 +532,7 @@ def make_xml(**kwargs): try: if out_file == "-": - _, filename = create_input_tei( + _, filename = create_input_ras( input_file_handle=input_file, text_languages=languages ) with io.open(filename, encoding="utf-8-sig") as f: @@ -545,7 +545,7 @@ def make_xml(**kwargs): "Output file %s exists already, use -f to overwrite." % out_file ) - _, filename = create_input_tei( + _, filename = create_input_ras( input_file_handle=input_file, text_languages=languages, output_file=out_file, diff --git a/readalongs/text/add_ids_to_xml.py b/readalongs/text/add_ids_to_xml.py index ce0148f2..f75546f0 100644 --- a/readalongs/text/add_ids_to_xml.py +++ b/readalongs/text/add_ids_to_xml.py @@ -9,7 +9,7 @@ # # The auto-generated IDs have formats like "s0w2m1" meaning # "sentence 0, word 2, morpheme 1". But it's flexible if some elements -# already have ids, or if the markup uses different tags than a TEI document. +# already have ids, or if the markup uses different tags than a RAS document. # ################################################### diff --git a/readalongs/text/convert_xml.py b/readalongs/text/convert_xml.py index c34c4179..ccfe1b13 100644 --- a/readalongs/text/convert_xml.py +++ b/readalongs/text/convert_xml.py @@ -21,7 +21,7 @@ # second part to the Kwak'wala pipeline. # # The only assumption made by this module about the structure of the XML -# is that it has word tags (using , the convention used by TEI formats.) +# is that it has word tags (using , the convention used by RAS and TEI formats.) # The reason for this is that the word is the domain over which phonological # rules apply, and we particularly need to know it to be able to perform # phonological rules at word boundaries. We also only convert text that diff --git a/readalongs/text/end_to_end.py b/readalongs/text/end_to_end.py index 5229eef2..31e674dd 100644 --- a/readalongs/text/end_to_end.py +++ b/readalongs/text/end_to_end.py @@ -5,7 +5,7 @@ # # end_to_end.py # -# Takes an XML file (preferrably using TEI conventions) and +# Takes an XML file (preferrably using TEI conventions or RAS format) and # makes: # # 1. An XML file with added IDs for elements (if the elements didn't diff --git a/readalongs/web_api.py b/readalongs/web_api.py index e1fd4eb3..a192ce7c 100644 --- a/readalongs/web_api.py +++ b/readalongs/web_api.py @@ -35,7 +35,7 @@ from pydantic import BaseModel, Field from starlette.background import BackgroundTask -from readalongs.align import create_tei_from_text, save_label_files, save_subtitles +from readalongs.align import create_ras_from_text, save_label_files, save_subtitles from readalongs.log import LOGGER from readalongs.text.add_ids_to_xml import add_ids from readalongs.text.convert_xml import convert_xml @@ -132,7 +132,7 @@ async def assemble( "xml": { "summary": "A basic example with xml input", "value": { - "xml": "

hej verden

", + "xml": "

hej verden

", "text_languages": ["dan", "und"], "debug": False, }, @@ -140,7 +140,7 @@ async def assemble( } ) ): - """Create an input TEI from the given text (as plain text or XML). + """Create an input RAS from the given text (as plain text or XML). Also creates the required grammar, pronunciation dictionary, and text needed by the decoder. @@ -174,7 +174,7 @@ async def assemble( parsed = io.StringIO(request.text).readlines() parsed = etree.fromstring( bytes( - create_tei_from_text(parsed, text_languages=request.text_languages), + create_ras_from_text(parsed, text_languages=request.text_languages), encoding="utf-8", ), parser=etree.XMLParser(resolve_entities=False), @@ -240,7 +240,8 @@ class ConvertRequest(BaseModel): example=dedent( """\ - + +
@@ -250,7 +251,7 @@ class ConvertRequest(BaseModel):
-
""" + """ ), )