From 5465d22e7d72de6e09beb60e0e4244b8a0b29a11 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhdaines@gmail.com>
Date: Tue, 31 Jan 2023 11:39:47 -0500
Subject: [PATCH] feat: basically s/tei/readalong/gi

---
 readalongs/align.py               | 22 +++++++++++-----------
 readalongs/cli.py                 |  8 ++++----
 readalongs/text/add_ids_to_xml.py |  2 +-
 readalongs/text/convert_xml.py    |  2 +-
 readalongs/text/end_to_end.py     |  2 +-
 readalongs/web_api.py             | 13 +++++++------
 6 files changed, 25 insertions(+), 24 deletions(-)
diff --git a/readalongs/align.py b/readalongs/align.py
index 5f41962f..96048304 100644
--- a/readalongs/align.py
+++ b/readalongs/align.py
@@ -176,7 +176,7 @@ def parse_and_make_xml(
     """Parse XML input and run tokenization and G2P.
 
     Args:
-        xml_path (str): Path to XML input file in TEI-like format
+        xml_path (str): Path to XML input file in RAS format
         config (dict): Optional; ReadAlong-Studio configuration to use
         save_temps (str): Optional; Save temporary files, by default None
         verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
@@ -541,7 +541,7 @@ def align_audio(
     """Align an XML input file to an audio file.
 
     Args:
-        xml_path (str): Path to XML input file in TEI-like format
+        xml_path (str): Path to XML input file in RAS format
         audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
         unit (str): Optional; Element to create alignments for, by default 'w'
         bare (boolean): Optional;
@@ -1120,8 +1120,8 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
     head.append(link_element)
 
 
-TEI_TEMPLATE = """<?xml version='1.0' encoding='utf-8'?>
-<TEI>
+RAS_TEMPLATE = """<?xml version='1.0' encoding='utf-8'?>
+<readalong>
     <text xml:lang="{{main_lang}}" fallback-langs="{{fallback_langs}}">
         <body>
         {{#pages}}
@@ -1137,12 +1137,12 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
         {{/pages}}
         </body>
     </text>
-</TEI>
+</readalong>
 """
 
 
-def create_tei_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
-    """Create input xml in TEI standard.
+def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
+    """Create input xml in RAS format.
         Uses the line sequence to infer paragraph and sentence structure from plain text:
         Assumes a double blank line marks a page break, and a single blank line
         marks a paragraph break.
@@ -1184,11 +1184,11 @@ def create_tei_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->
         paragraphs.append({"sentences": sentences})
     if paragraphs:
         pages.append({"paragraphs": paragraphs})
-    return chevron.render(TEI_TEMPLATE, {**kwargs, **{"pages": pages}})
+    return chevron.render(RAS_TEMPLATE, {**kwargs, **{"pages": pages}})
 
 
-def create_input_tei(**kwargs):
-    """Create input xml in TEI standard.
+def create_input_ras(**kwargs):
+    """Create input xml in RAS format.
         Uses readlines to infer paragraph and sentence structure from plain text.
         Assumes a double blank line marks a page break, and a single blank line
         marks a paragraph break.
@@ -1241,7 +1241,7 @@ def create_input_tei(**kwargs):
             prefix="readalongs_xml_", suffix=".xml", delete=True
         )
         filename = outfile.name
-    xml = create_tei_from_text(text, text_langs)
+    xml = create_ras_from_text(text, text_langs)
     outfile.write(xml.encode("utf-8"))
     outfile.flush()
     outfile.close()
diff --git a/readalongs/cli.py b/readalongs/cli.py
index cd9dfd1d..42aff681 100644
--- a/readalongs/cli.py
+++ b/readalongs/cli.py
@@ -20,7 +20,7 @@
 from lxml import etree
 
 from readalongs._version import __version__
-from readalongs.align import align_audio, create_input_tei, save_readalong
+from readalongs.align import align_audio, create_input_ras, save_readalong
 from readalongs.log import LOGGER
 from readalongs.text.add_ids_to_xml import add_ids
 from readalongs.text.convert_xml import convert_xml
@@ -369,7 +369,7 @@ def align(**kwargs):  # noqa: C901  # some versions of flake8 need this here ins
             languages.append("und")
         plain_textfile = kwargs["textfile"]
         try:
-            _, xml_textfile = create_input_tei(
+            _, xml_textfile = create_input_ras(
                 input_file_name=plain_textfile,
                 text_languages=languages,
                 save_temps=temp_base,
@@ -532,7 +532,7 @@ def make_xml(**kwargs):
 
     try:
         if out_file == "-":
-            _, filename = create_input_tei(
+            _, filename = create_input_ras(
                 input_file_handle=input_file, text_languages=languages
             )
             with io.open(filename, encoding="utf-8-sig") as f:
@@ -545,7 +545,7 @@ def make_xml(**kwargs):
                     "Output file %s exists already, use -f to overwrite." % out_file
                 )
 
-            _, filename = create_input_tei(
+            _, filename = create_input_ras(
                 input_file_handle=input_file,
                 text_languages=languages,
                 output_file=out_file,
diff --git a/readalongs/text/add_ids_to_xml.py b/readalongs/text/add_ids_to_xml.py
index ce0148f2..f75546f0 100644
--- a/readalongs/text/add_ids_to_xml.py
+++ b/readalongs/text/add_ids_to_xml.py
@@ -9,7 +9,7 @@
 #
 # The auto-generated IDs have formats like "s0w2m1" meaning
 # "sentence 0, word 2, morpheme 1".  But it's flexible if some elements
-# already have ids, or if the markup uses different tags than a TEI document.
+# already have ids, or if the markup uses different tags than a RAS document.
 #
 ###################################################
 
diff --git a/readalongs/text/convert_xml.py b/readalongs/text/convert_xml.py
index c34c4179..ccfe1b13 100644
--- a/readalongs/text/convert_xml.py
+++ b/readalongs/text/convert_xml.py
@@ -21,7 +21,7 @@
 # second part to the Kwak'wala pipeline.
 #
 # The only assumption made by this module about the structure of the XML
-# is that it has word tags (using <w>, the convention used by TEI formats.)
+# is that it has word tags (using <w>, the convention used by RAS and TEI formats.)
 # The reason for this is that the word is the domain over which phonological
 # rules apply, and we particularly need to know it to be able to perform
 # phonological rules at word boundaries.  We also only convert text that
diff --git a/readalongs/text/end_to_end.py b/readalongs/text/end_to_end.py
index 5229eef2..31e674dd 100644
--- a/readalongs/text/end_to_end.py
+++ b/readalongs/text/end_to_end.py
@@ -5,7 +5,7 @@
 #
 # end_to_end.py
 #
-# Takes an XML file (preferrably using TEI conventions) and
+# Takes an XML file (preferably using TEI conventions or RAS format) and
 # makes:
 #
 # 1. An XML file with added IDs for elements (if the elements didn't
diff --git a/readalongs/web_api.py b/readalongs/web_api.py
index e1fd4eb3..a192ce7c 100644
--- a/readalongs/web_api.py
+++ b/readalongs/web_api.py
@@ -35,7 +35,7 @@
 from pydantic import BaseModel, Field
 from starlette.background import BackgroundTask
 
-from readalongs.align import create_tei_from_text, save_label_files, save_subtitles
+from readalongs.align import create_ras_from_text, save_label_files, save_subtitles
 from readalongs.log import LOGGER
 from readalongs.text.add_ids_to_xml import add_ids
 from readalongs.text.convert_xml import convert_xml
@@ -132,7 +132,7 @@ async def assemble(
             "xml": {
                 "summary": "A basic example with xml input",
                 "value": {
-                    "xml": "<?xml version='1.0' encoding='utf-8'?><TEI><text><p><s>hej verden</s></p></text></TEI>",
+                    "xml": "<?xml version='1.0' encoding='utf-8'?><readalong><text><p><s>hej verden</s></p></text></readalong>",
                     "text_languages": ["dan", "und"],
                     "debug": False,
                 },
@@ -140,7 +140,7 @@ async def assemble(
         }
     )
 ):
-    """Create an input TEI from the given text (as plain text or XML).
+    """Create an input RAS from the given text (as plain text or XML).
     Also creates the required grammar, pronunciation dictionary,
     and text needed by the decoder.
 
@@ -174,7 +174,7 @@ async def assemble(
         parsed = io.StringIO(request.text).readlines()
         parsed = etree.fromstring(
             bytes(
-                create_tei_from_text(parsed, text_languages=request.text_languages),
+                create_ras_from_text(parsed, text_languages=request.text_languages),
                 encoding="utf-8",
             ),
             parser=etree.XMLParser(resolve_entities=False),
@@ -240,7 +240,8 @@ class ConvertRequest(BaseModel):
         example=dedent(
             """\
             <?xml version='1.0' encoding='utf-8'?>
-            <TEI>
+            <!DOCTYPE readalong SYSTEM "readalong.dtd">
+            <readalong>
                 <text xml:lang="dan" fallback-langs="und" id="t0">
                     <body id="t0b0">
                         <div type="page" id="t0b0d0">
@@ -250,7 +251,7 @@ class ConvertRequest(BaseModel):
                         </div>
                     </body>
                 </text>
-            </TEI>"""
+            </readalong>"""
         ),
     )