Skip to content

Commit

Permalink
fix: remove unsupported encoding argument from web_api
Browse files Browse the repository at this point in the history
The UTF-8 assumption is baked into various subtle places, so just document
that it is required and remove the parameter.

Did some refactoring in test_web_api.py at the same time.
  • Loading branch information
joanise committed Sep 28, 2022
1 parent 4520ad8 commit 41d5912
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 94 deletions.
34 changes: 5 additions & 29 deletions readalongs/web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ class RequestBase(BaseModel):
"""Base request for assemble"""

text_languages: List[str]
encoding: str = "utf-8"
debug: bool = False


Expand All @@ -92,18 +91,6 @@ class AssembleResponse(BaseModel):
g2ped: Optional[str]


def process_xml(func):
# Wrapper for processing XML, reads the XML with proper encoding,
# then applies the given function to it,
# then converts the result back to utf-8 XML and returns it
def wrapper(xml, **kwargs):
parsed = etree.fromstring(bytes(xml.xml, encoding=xml.encoding))
processed = func(parsed, **kwargs)
return etree.tostring(processed, encoding="utf-8", xml_declaration=True)

return wrapper


@v1.get("/langs", response_model=Dict[str, str])
async def langs() -> Dict[str, str]:
"""Return the list of supported languages and their names as a dict.
Expand Down Expand Up @@ -131,7 +118,6 @@ async def assemble(
"value": {
"text": "hej verden",
"text_languages": ["dan", "und"],
"encoding": "utf-8",
"debug": False,
},
},
Expand All @@ -140,7 +126,6 @@ async def assemble(
"value": {
"xml": "<?xml version='1.0' encoding='utf-8'?><TEI><text><p><s>hej verden</s></p></text></TEI>",
"text_languages": ["dan", "und"],
"encoding": "utf-8",
"debug": False,
},
},
Expand All @@ -151,9 +136,10 @@ async def assemble(
Also creates the required grammar, pronunciation dictionary,
and text needed by the decoder.
Encoding: all input and output is in UTF-8.
Args (as dict items in the request body):
- text_languages: the list of languages for g2p processing
- encoding: encoding (default: "utf-8")
- debug: set to true for debugging (default: False)
- either text or xml:
- text: the input text as plain text
Expand All @@ -168,7 +154,7 @@ async def assemble(

if isinstance(input, XMLRequest):
try:
parsed = etree.fromstring(bytes(input.xml, encoding=input.encoding))
parsed = etree.fromstring(bytes(input.xml, encoding="utf-8"))
except etree.XMLSyntaxError as e:
raise HTTPException(
status_code=422, detail="XML provided is not valid"
Expand Down Expand Up @@ -227,11 +213,6 @@ class ConvertRequest(BaseModel):
title="The length of the audio used to create the alignment, in seconds.",
)

encoding: str = Field(
example="utf-8",
title="Only utf-8 is supported now, but contact us if you might need support for a different enciding.",
)

output_format: str = Field(
example="TextGrid",
regex="^(?i)(eaf|TextGrid|srt|vtt)$",
Expand Down Expand Up @@ -307,9 +288,10 @@ def slurp_file(filename):
async def convert_alignment(input: ConvertRequest) -> ConvertResponse:
"""Convert an alignment to a different format.
Encoding: all input and output is in UTF-8.
Args (as dict items in the request body):
- audio_length: duration in seconds of the audio file used to create the alignment
- encoding: use utf-8, other encodings are not supported (yet)
- output_format: one of TextGrid, eaf, srt, vtt
- xml: the XML file produced by /assemble
- smil: the SMIL file produced by SoundSwallower(.js)
Expand All @@ -332,12 +314,6 @@ async def convert_alignment(input: ConvertRequest) -> ConvertResponse:
- other_file_name: a suggested name for the second file
- other_file_contents: the contents of the second file
"""
if input.encoding not in ["utf-8", "utf8", "UTF-8", "UTF8", ""]:
raise HTTPException(
status_code=422,
detail="Please use utf-8 as your encoding, or contact us with a description of how and why you would like to use a different encoding",
)

try:
parsed_xml = etree.fromstring(bytes(input.xml, encoding="utf-8"))
except etree.XMLSyntaxError as e:
Expand Down
108 changes: 43 additions & 65 deletions test/test_web_api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python3

import os
from copy import deepcopy
from textwrap import dedent
from unittest import main

Expand All @@ -13,23 +12,23 @@
from readalongs.text.convert_xml import convert_xml
from readalongs.text.tokenize_xml import tokenize_xml
from readalongs.util import get_langs
from readalongs.web_api import XMLRequest, create_grammar, process_xml, web_api_app
from readalongs.web_api import create_grammar, web_api_app

API_CLIENT = TestClient(web_api_app)


class TestWebApi(BasicTestCase):
def setUp(self):
super().setUp()
self.basicRequest = {"encoding": "utf-8", "debug": False}
def slurp_data_file(self, filename: str) -> str:
"""Convenience function to slurp a whole file in self.data_dir"""
with open(os.path.join(self.data_dir, filename), encoding="utf8") as f:
return f.read().strip()

def test_assemble_from_plain_text(self):
# Test the assemble endpoint with plain text
with open(os.path.join(self.data_dir, "ej-fra.txt"), encoding="utf8") as f:
data = f.read().strip()
request = deepcopy(self.basicRequest)
request["text"] = data
request["text_languages"] = ["fra"]
request = {
"text": self.slurp_data_file("ej-fra.txt"),
"text_languages": ["fra"],
}
response = API_CLIENT.post("/api/v1/assemble", json=request)
self.assertEqual(response.status_code, 200)

Expand All @@ -45,37 +44,28 @@ def test_bad_method(self):

def test_assemble_from_xml(self):
# Test the assemble endpoint with XML
with open(os.path.join(self.data_dir, "ej-fra.xml"), encoding="utf8") as f:
data = f.read().strip()
request = deepcopy(self.basicRequest)
request["xml"] = data
request["text_languages"] = ["fra"]
request = {
"encoding": "utf-8", # for bwd compat, make sure the encoding is allowed but ignored
"xml": self.slurp_data_file("ej-fra.xml"),
"text_languages": ["fra"],
}
response = API_CLIENT.post("/api/v1/assemble", json=request)
self.assertEqual(response.status_code, 200)

def test_wrapper(self):
# Test the xml processing wrapper
with open(os.path.join(self.data_dir, "ej-fra.xml"), encoding="utf8") as f:
data = f.read().strip()
xml_request = XMLRequest(xml=data, text_languages=["test"])
self.assertAlmostEqual(
data, process_xml(lambda x: x)(xml_request).decode("utf-8")
)

def test_bad_xml(self):
# Test the assemble endpoint with invalid XML
data = "this is not xml"
request = deepcopy(self.basicRequest)
request["xml"] = data
request["text_languages"] = ["fra"]
request = {
"xml": "this is not xml",
"text_languages": ["fra"],
}
response = API_CLIENT.post("/api/v1/assemble", json=request)
self.assertEqual(response.status_code, 422)

def test_create_grammar(self):
# Test the create grammar function
with open(os.path.join(self.data_dir, "ej-fra.xml"), encoding="utf8") as f:
data = f.read().strip()
parsed = etree.fromstring(bytes(data, encoding="utf8"))
parsed = etree.fromstring(
bytes(self.slurp_data_file("ej-fra.xml"), encoding="utf8")
)
tokenized = tokenize_xml(parsed)
ids_added = add_ids(tokenized)
g2ped, valid = convert_xml(ids_added)
Expand All @@ -87,11 +77,10 @@ def test_create_grammar(self):

def test_bad_g2p(self):
# Test the assemble endpoint with invalid g2p languages
with open(os.path.join(self.data_dir, "ej-fra.txt"), encoding="utf8") as f:
data = f.read().strip()
request = deepcopy(self.basicRequest)
request["text"] = data
request["text_languages"] = ["test"]
request = {
"text": "blah blah",
"text_languages": ["test"],
}
response = API_CLIENT.post("/api/v1/assemble", json=request)
self.assertEqual(response.status_code, 422)

Expand All @@ -103,19 +92,30 @@ def test_langs(self):

def test_debug(self):
# Test the assemble endpoint with debug mode on
with open(os.path.join(self.data_dir, "ej-fra.txt"), encoding="utf8") as f:
data = f.read().strip()
request = deepcopy(self.basicRequest)
request["text"] = data
request["debug"] = True
request["text_languages"] = ["fra"]
request = {
"text": self.slurp_data_file("ej-fra.txt"),
"debug": True,
"text_languages": ["fra"],
}
response = API_CLIENT.post("/api/v1/assemble", json=request)
content = response.json()
self.assertEqual(content["input"], request)
self.assertGreater(len(content["tokenized"]), 10)
self.assertGreater(len(content["parsed"]), 10)
self.assertGreater(len(content["g2ped"]), 10)

# Test that debug mode is off by default
request = {
"text": "Ceci est un test.",
"text_languages": ["fra"],
}
response = API_CLIENT.post("/api/v1/assemble", json=request)
content = response.json()
self.assertIsNone(content["input"])
self.assertIsNone(content["tokenized"])
self.assertIsNone(content["parsed"])
self.assertIsNone(content["g2ped"])

hej_verden_xml = dedent(
"""\
<?xml version='1.0' encoding='utf-8'?>
Expand Down Expand Up @@ -152,7 +152,6 @@ def test_debug(self):

def test_convert_to_TextGrid_errors(self):
request = {
"encoding": "utf-8",
"audio_length": 83.1,
"output_format": "TextGrid",
"xml": "this is not XML",
Expand All @@ -162,7 +161,6 @@ def test_convert_to_TextGrid_errors(self):
self.assertEqual(response.status_code, 422, "Invalid XML should fail.")

request = {
"encoding": "utf-8",
"audio_length": 83.1,
"output_format": "TextGrid",
"xml": self.hej_verden_xml,
Expand All @@ -172,7 +170,6 @@ def test_convert_to_TextGrid_errors(self):
self.assertEqual(response.status_code, 422, "Invalid SMIL should fail.")

request = {
"encoding": "utf-8",
"audio_length": -10.0,
"output_format": "TextGrid",
"xml": self.hej_verden_xml,
Expand All @@ -181,23 +178,8 @@ def test_convert_to_TextGrid_errors(self):
response = API_CLIENT.post("/api/v1/convert_alignment", json=request)
self.assertEqual(response.status_code, 422, "Negative duration should fail.")

request = {
"encoding": "latin-1",
"audio_length": 83.1,
"output_format": "TextGrid",
"xml": self.hej_verden_xml,
"smil": self.hej_verden_smil,
}
response = API_CLIENT.post("/api/v1/convert_alignment", json=request)
# Figure out how to exercise this case, but for now only utf-8 is supported...
# print("latin-1", response.status_code, response.json())
self.assertEqual(response.status_code, 422, "only utf-8 is supported for now")
# Or, once we do support latin-1:
# self.assertEqual(response.status_code, 400)

def test_convert_to_TextGrid(self):
request = {
"encoding": "utf-8",
"audio_length": 83.1,
"output_format": "TextGrid",
"xml": self.hej_verden_xml,
Expand Down Expand Up @@ -264,7 +246,6 @@ class = "IntervalTier"

def test_convert_to_eaf(self):
request = {
"encoding": "utf-8",
"audio_length": 83.1,
"output_format": "eaf",
"xml": self.hej_verden_xml,
Expand All @@ -277,7 +258,6 @@ def test_convert_to_eaf(self):

def test_convert_to_srt(self):
request = {
"encoding": "utf-8",
"audio_length": 83.1,
"output_format": "srt",
"xml": self.hej_verden_xml,
Expand Down Expand Up @@ -316,7 +296,7 @@ def test_convert_to_srt(self):

def test_convert_to_vtt(self):
request = {
"encoding": "utf-8",
"encoding": "utf-8", # for bwd compat, make sure the encoding is allowed but ignored
"audio_length": 83.1,
"output_format": "vtt",
"xml": self.hej_verden_xml,
Expand Down Expand Up @@ -354,7 +334,6 @@ def test_convert_to_vtt(self):

def test_convert_to_bad_format(self):
request = {
"encoding": "utf-8",
"audio_length": 83.1,
"output_format": "not_a_known_format",
"xml": self.hej_verden_xml,
Expand All @@ -364,7 +343,6 @@ def test_convert_to_bad_format(self):
self.assertEqual(response.status_code, 422)

request = {
"encoding": "utf-8",
"audio_length": 83.1,
# "output_format" just missing
"xml": self.hej_verden_xml,
Expand Down

0 comments on commit 41d5912

Please sign in to comment.