Skip to content

Commit

Permalink
feat: basically s/tei/readalong/gi
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Jan 31, 2023
1 parent f94174b commit 5465d22
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 24 deletions.
22 changes: 11 additions & 11 deletions readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def parse_and_make_xml(
"""Parse XML input and run tokenization and G2P.
Args:
xml_path (str): Path to XML input file in TEI-like format
xml_path (str): Path to XML input file in RAS format
config (dict): Optional; ReadAlong-Studio configuration to use
save_temps (str): Optional; Save temporary files, by default None
verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
Expand Down Expand Up @@ -541,7 +541,7 @@ def align_audio(
"""Align an XML input file to an audio file.
Args:
xml_path (str): Path to XML input file in TEI-like format
xml_path (str): Path to XML input file in RAS format
audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
unit (str): Optional; Element to create alignments for, by default 'w'
bare (boolean): Optional;
Expand Down Expand Up @@ -1120,8 +1120,8 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
head.append(link_element)


TEI_TEMPLATE = """<?xml version='1.0' encoding='utf-8'?>
<TEI>
RAS_TEMPLATE = """<?xml version='1.0' encoding='utf-8'?>
<readalong>
<text xml:lang="{{main_lang}}" fallback-langs="{{fallback_langs}}">
<body>
{{#pages}}
Expand All @@ -1137,12 +1137,12 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
{{/pages}}
</body>
</text>
</TEI>
</readalong>
"""


def create_tei_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
"""Create input xml in TEI standard.
def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
"""Create input xml in RAS format.
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down Expand Up @@ -1184,11 +1184,11 @@ def create_tei_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->
paragraphs.append({"sentences": sentences})
if paragraphs:
pages.append({"paragraphs": paragraphs})
return chevron.render(TEI_TEMPLATE, {**kwargs, **{"pages": pages}})
return chevron.render(RAS_TEMPLATE, {**kwargs, **{"pages": pages}})


def create_input_tei(**kwargs):
"""Create input xml in TEI standard.
def create_input_ras(**kwargs):
"""Create input xml in RAS format.
Uses readlines to infer paragraph and sentence structure from plain text.
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down Expand Up @@ -1241,7 +1241,7 @@ def create_input_tei(**kwargs):
prefix="readalongs_xml_", suffix=".xml", delete=True
)
filename = outfile.name
xml = create_tei_from_text(text, text_langs)
xml = create_ras_from_text(text, text_langs)
outfile.write(xml.encode("utf-8"))
outfile.flush()
outfile.close()
Expand Down
8 changes: 4 additions & 4 deletions readalongs/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from lxml import etree

from readalongs._version import __version__
from readalongs.align import align_audio, create_input_tei, save_readalong
from readalongs.align import align_audio, create_input_ras, save_readalong
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.convert_xml import convert_xml
Expand Down Expand Up @@ -369,7 +369,7 @@ def align(**kwargs): # noqa: C901 # some versions of flake8 need this here ins
languages.append("und")
plain_textfile = kwargs["textfile"]
try:
_, xml_textfile = create_input_tei(
_, xml_textfile = create_input_ras(
input_file_name=plain_textfile,
text_languages=languages,
save_temps=temp_base,
Expand Down Expand Up @@ -532,7 +532,7 @@ def make_xml(**kwargs):

try:
if out_file == "-":
_, filename = create_input_tei(
_, filename = create_input_ras(
input_file_handle=input_file, text_languages=languages
)
with io.open(filename, encoding="utf-8-sig") as f:
Expand All @@ -545,7 +545,7 @@ def make_xml(**kwargs):
"Output file %s exists already, use -f to overwrite." % out_file
)

_, filename = create_input_tei(
_, filename = create_input_ras(
input_file_handle=input_file,
text_languages=languages,
output_file=out_file,
Expand Down
2 changes: 1 addition & 1 deletion readalongs/text/add_ids_to_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#
# The auto-generated IDs have formats like "s0w2m1" meaning
# "sentence 0, word 2, morpheme 1". But it's flexible if some elements
# already have ids, or if the markup uses different tags than a TEI document.
# already have ids, or if the markup uses different tags than a RAS document.
#
###################################################

Expand Down
2 changes: 1 addition & 1 deletion readalongs/text/convert_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# second part to the Kwak'wala pipeline.
#
# The only assumption made by this module about the structure of the XML
# is that it has word tags (using <w>, the convention used by TEI formats.)
# is that it has word tags (using <w>, the convention used by RAS and TEI formats.)
# The reason for this is that the word is the domain over which phonological
# rules apply, and we particularly need to know it to be able to perform
# phonological rules at word boundaries. We also only convert text that
Expand Down
2 changes: 1 addition & 1 deletion readalongs/text/end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#
# end_to_end.py
#
# Takes an XML file (preferrably using TEI conventions) and
# Takes an XML file (preferably using TEI conventions or RAS format) and
# makes:
#
# 1. An XML file with added IDs for elements (if the elements didn't
Expand Down
13 changes: 7 additions & 6 deletions readalongs/web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from pydantic import BaseModel, Field
from starlette.background import BackgroundTask

from readalongs.align import create_tei_from_text, save_label_files, save_subtitles
from readalongs.align import create_ras_from_text, save_label_files, save_subtitles
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.convert_xml import convert_xml
Expand Down Expand Up @@ -132,15 +132,15 @@ async def assemble(
"xml": {
"summary": "A basic example with xml input",
"value": {
"xml": "<?xml version='1.0' encoding='utf-8'?><TEI><text><p><s>hej verden</s></p></text></TEI>",
"xml": "<?xml version='1.0' encoding='utf-8'?><readalong><text><p><s>hej verden</s></p></text></readalong>",
"text_languages": ["dan", "und"],
"debug": False,
},
},
}
)
):
"""Create an input TEI from the given text (as plain text or XML).
"""Create an input RAS from the given text (as plain text or XML).
Also creates the required grammar, pronunciation dictionary,
and text needed by the decoder.
Expand Down Expand Up @@ -174,7 +174,7 @@ async def assemble(
parsed = io.StringIO(request.text).readlines()
parsed = etree.fromstring(
bytes(
create_tei_from_text(parsed, text_languages=request.text_languages),
create_ras_from_text(parsed, text_languages=request.text_languages),
encoding="utf-8",
),
parser=etree.XMLParser(resolve_entities=False),
Expand Down Expand Up @@ -240,7 +240,8 @@ class ConvertRequest(BaseModel):
example=dedent(
"""\
<?xml version='1.0' encoding='utf-8'?>
<TEI>
<!DOCTYPE readalong SYSTEM "readalong.dtd">
<readalong>
<text xml:lang="dan" fallback-langs="und" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0">
Expand All @@ -250,7 +251,7 @@ class ConvertRequest(BaseModel):
</div>
</body>
</text>
</TEI>"""
</readalong>"""
),
)

Expand Down

0 comments on commit 5465d22

Please sign in to comment.