fix: remove unsupported encoding argument from web_api

The utf-8 assumption is baked into various subtle places, so just document that it is required and remove the parameter. Also did some refactoring in test_web_api.py at the same time.
ReadAlongs · Sep 28, 2022 · 41d5912 · 41d5912
1 parent 4520ad8
commit 41d5912
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 94 deletions.
diff --git a/readalongs/web_api.py b/readalongs/web_api.py
@@ -65,7 +65,6 @@ class RequestBase(BaseModel):
     """Base request for assemble"""
 
     text_languages: List[str]
-    encoding: str = "utf-8"
     debug: bool = False
 
 
@@ -92,18 +91,6 @@ class AssembleResponse(BaseModel):
     g2ped: Optional[str]
 
 
-def process_xml(func):
-    # Wrapper for processing XML, reads the XML with proper encoding,
-    # then applies the given function to it,
-    # then converts the result back to utf-8 XML and returns it
-    def wrapper(xml, **kwargs):
-        parsed = etree.fromstring(bytes(xml.xml, encoding=xml.encoding))
-        processed = func(parsed, **kwargs)
-        return etree.tostring(processed, encoding="utf-8", xml_declaration=True)
-
-    return wrapper
-
-
 @v1.get("/langs", response_model=Dict[str, str])
 async def langs() -> Dict[str, str]:
     """Return the list of supported languages and their names as a dict.
@@ -131,7 +118,6 @@ async def assemble(
                 "value": {
                     "text": "hej verden",
                     "text_languages": ["dan", "und"],
-                    "encoding": "utf-8",
                     "debug": False,
                 },
             },
@@ -140,7 +126,6 @@ async def assemble(
                 "value": {
                     "xml": "<?xml version='1.0' encoding='utf-8'?><TEI><text><p><s>hej verden</s></p></text></TEI>",
                     "text_languages": ["dan", "und"],
-                    "encoding": "utf-8",
                     "debug": False,
                 },
             },
@@ -151,9 +136,10 @@ async def assemble(
     Also creates the required grammar, pronunciation dictionary,
     and text needed by the decoder.
 
+    Encoding: all input and output is in UTF-8.
+
     Args (as dict items in the request body):
      - text_languages: the list of languages for g2p processing
-     - encoding: encoding (default: "utf-8")
      - debug: set to true for debugging (default: False)
      - either text or xml:
         - text: the input text as plain text
@@ -168,7 +154,7 @@ async def assemble(
 
     if isinstance(input, XMLRequest):
         try:
-            parsed = etree.fromstring(bytes(input.xml, encoding=input.encoding))
+            parsed = etree.fromstring(bytes(input.xml, encoding="utf-8"))
         except etree.XMLSyntaxError as e:
             raise HTTPException(
                 status_code=422, detail="XML provided is not valid"
@@ -227,11 +213,6 @@ class ConvertRequest(BaseModel):
         title="The length of the audio used to create the alignment, in seconds.",
     )
 
-    encoding: str = Field(
-        example="utf-8",
-        title="Only utf-8 is supported now, but contact us if you might need support for a different enciding.",
-    )
-
     output_format: str = Field(
         example="TextGrid",
         regex="^(?i)(eaf|TextGrid|srt|vtt)$",
@@ -307,9 +288,10 @@ def slurp_file(filename):
 async def convert_alignment(input: ConvertRequest) -> ConvertResponse:
     """Convert an alignment to a different format.
 
+    Encoding: all input and output is in UTF-8.
+
     Args (as dict items in the request body):
      - audio_length: duration in seconds of the audio file used to create the alignment
-     - encoding: use utf-8, other encodings are not supported (yet)
      - output_format: one of TextGrid, eaf, srt, vtt
      - xml: the XML file produced by /assemble
      - smil: the SMIL file produced by SoundSwallower(.js)
@@ -332,12 +314,6 @@ async def convert_alignment(input: ConvertRequest) -> ConvertResponse:
      - other_file_name: a suggested name for the second file
      - other_file_contents: the contents of the second file
     """
-    if input.encoding not in ["utf-8", "utf8", "UTF-8", "UTF8", ""]:
-        raise HTTPException(
-            status_code=422,
-            detail="Please use utf-8 as your encoding, or contact us with a description of how and why you would like to use a different encoding",
-        )
-
     try:
         parsed_xml = etree.fromstring(bytes(input.xml, encoding="utf-8"))
     except etree.XMLSyntaxError as e:

diff --git a/test/test_web_api.py b/test/test_web_api.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 
 import os
-from copy import deepcopy
 from textwrap import dedent
 from unittest import main
 
@@ -13,23 +12,23 @@
 from readalongs.text.convert_xml import convert_xml
 from readalongs.text.tokenize_xml import tokenize_xml
 from readalongs.util import get_langs
-from readalongs.web_api import XMLRequest, create_grammar, process_xml, web_api_app
+from readalongs.web_api import create_grammar, web_api_app
 
 API_CLIENT = TestClient(web_api_app)
 
 
 class TestWebApi(BasicTestCase):
-    def setUp(self):
-        super().setUp()
-        self.basicRequest = {"encoding": "utf-8", "debug": False}
+    def slurp_data_file(self, filename: str) -> str:
+        """Convenience function to slurp a whole file in self.data_dir"""
+        with open(os.path.join(self.data_dir, filename), encoding="utf8") as f:
+            return f.read().strip()
 
     def test_assemble_from_plain_text(self):
         # Test the assemble endpoint with plain text
-        with open(os.path.join(self.data_dir, "ej-fra.txt"), encoding="utf8") as f:
-            data = f.read().strip()
-        request = deepcopy(self.basicRequest)
-        request["text"] = data
-        request["text_languages"] = ["fra"]
+        request = {
+            "text": self.slurp_data_file("ej-fra.txt"),
+            "text_languages": ["fra"],
+        }
         response = API_CLIENT.post("/api/v1/assemble", json=request)
         self.assertEqual(response.status_code, 200)
 
@@ -45,37 +44,28 @@ def test_bad_method(self):
 
     def test_assemble_from_xml(self):
         # Test the assemble endpoint with XML
-        with open(os.path.join(self.data_dir, "ej-fra.xml"), encoding="utf8") as f:
-            data = f.read().strip()
-        request = deepcopy(self.basicRequest)
-        request["xml"] = data
-        request["text_languages"] = ["fra"]
+        request = {
+            "encoding": "utf-8",  # for bwd compat, make sure the encoding is allowed but ignored
+            "xml": self.slurp_data_file("ej-fra.xml"),
+            "text_languages": ["fra"],
+        }
         response = API_CLIENT.post("/api/v1/assemble", json=request)
         self.assertEqual(response.status_code, 200)
 
-    def test_wrapper(self):
-        # Test the xml processing wrapper
-        with open(os.path.join(self.data_dir, "ej-fra.xml"), encoding="utf8") as f:
-            data = f.read().strip()
-        xml_request = XMLRequest(xml=data, text_languages=["test"])
-        self.assertAlmostEqual(
-            data, process_xml(lambda x: x)(xml_request).decode("utf-8")
-        )
-
     def test_bad_xml(self):
         # Test the assemble endpoint with invalid XML
-        data = "this is not xml"
-        request = deepcopy(self.basicRequest)
-        request["xml"] = data
-        request["text_languages"] = ["fra"]
+        request = {
+            "xml": "this is not xml",
+            "text_languages": ["fra"],
+        }
         response = API_CLIENT.post("/api/v1/assemble", json=request)
         self.assertEqual(response.status_code, 422)
 
     def test_create_grammar(self):
         # Test the create grammar function
-        with open(os.path.join(self.data_dir, "ej-fra.xml"), encoding="utf8") as f:
-            data = f.read().strip()
-        parsed = etree.fromstring(bytes(data, encoding="utf8"))
+        parsed = etree.fromstring(
+            bytes(self.slurp_data_file("ej-fra.xml"), encoding="utf8")
+        )
         tokenized = tokenize_xml(parsed)
         ids_added = add_ids(tokenized)
         g2ped, valid = convert_xml(ids_added)
@@ -87,11 +77,10 @@ def test_create_grammar(self):
 
     def test_bad_g2p(self):
         # Test the assemble endpoint with invalid g2p languages
-        with open(os.path.join(self.data_dir, "ej-fra.txt"), encoding="utf8") as f:
-            data = f.read().strip()
-        request = deepcopy(self.basicRequest)
-        request["text"] = data
-        request["text_languages"] = ["test"]
+        request = {
+            "text": "blah blah",
+            "text_languages": ["test"],
+        }
         response = API_CLIENT.post("/api/v1/assemble", json=request)
         self.assertEqual(response.status_code, 422)
 
@@ -103,19 +92,30 @@ def test_langs(self):
 
     def test_debug(self):
         # Test the assemble endpoint with debug mode on
-        with open(os.path.join(self.data_dir, "ej-fra.txt"), encoding="utf8") as f:
-            data = f.read().strip()
-        request = deepcopy(self.basicRequest)
-        request["text"] = data
-        request["debug"] = True
-        request["text_languages"] = ["fra"]
+        request = {
+            "text": self.slurp_data_file("ej-fra.txt"),
+            "debug": True,
+            "text_languages": ["fra"],
+        }
         response = API_CLIENT.post("/api/v1/assemble", json=request)
         content = response.json()
         self.assertEqual(content["input"], request)
         self.assertGreater(len(content["tokenized"]), 10)
         self.assertGreater(len(content["parsed"]), 10)
         self.assertGreater(len(content["g2ped"]), 10)
 
+        # Test that debug mode is off by default
+        request = {
+            "text": "Ceci est un test.",
+            "text_languages": ["fra"],
+        }
+        response = API_CLIENT.post("/api/v1/assemble", json=request)
+        content = response.json()
+        self.assertIsNone(content["input"])
+        self.assertIsNone(content["tokenized"])
+        self.assertIsNone(content["parsed"])
+        self.assertIsNone(content["g2ped"])
+
     hej_verden_xml = dedent(
         """\
         <?xml version='1.0' encoding='utf-8'?>
@@ -152,7 +152,6 @@ def test_debug(self):
 
     def test_convert_to_TextGrid_errors(self):
         request = {
-            "encoding": "utf-8",
             "audio_length": 83.1,
             "output_format": "TextGrid",
             "xml": "this is not XML",
@@ -162,7 +161,6 @@ def test_convert_to_TextGrid_errors(self):
         self.assertEqual(response.status_code, 422, "Invalid XML should fail.")
 
         request = {
-            "encoding": "utf-8",
             "audio_length": 83.1,
             "output_format": "TextGrid",
             "xml": self.hej_verden_xml,
@@ -172,7 +170,6 @@ def test_convert_to_TextGrid_errors(self):
         self.assertEqual(response.status_code, 422, "Invalid SMIL should fail.")
 
         request = {
-            "encoding": "utf-8",
             "audio_length": -10.0,
             "output_format": "TextGrid",
             "xml": self.hej_verden_xml,
@@ -181,23 +178,8 @@ def test_convert_to_TextGrid_errors(self):
         response = API_CLIENT.post("/api/v1/convert_alignment", json=request)
         self.assertEqual(response.status_code, 422, "Negative duration should fail.")
 
-        request = {
-            "encoding": "latin-1",
-            "audio_length": 83.1,
-            "output_format": "TextGrid",
-            "xml": self.hej_verden_xml,
-            "smil": self.hej_verden_smil,
-        }
-        response = API_CLIENT.post("/api/v1/convert_alignment", json=request)
-        # Figure out how to exercise this case, but for now only utf-8 is supported...
-        # print("latin-1", response.status_code, response.json())
-        self.assertEqual(response.status_code, 422, "only utf-8 is supported for now")
-        # Or, once we do support latin-1:
-        # self.assertEqual(response.status_code, 400)
-
     def test_convert_to_TextGrid(self):
         request = {
-            "encoding": "utf-8",
             "audio_length": 83.1,
             "output_format": "TextGrid",
             "xml": self.hej_verden_xml,
@@ -264,7 +246,6 @@ class = "IntervalTier"
 
     def test_convert_to_eaf(self):
         request = {
-            "encoding": "utf-8",
             "audio_length": 83.1,
             "output_format": "eaf",
             "xml": self.hej_verden_xml,
@@ -277,7 +258,6 @@ def test_convert_to_eaf(self):
 
     def test_convert_to_srt(self):
         request = {
-            "encoding": "utf-8",
             "audio_length": 83.1,
             "output_format": "srt",
             "xml": self.hej_verden_xml,
@@ -316,7 +296,7 @@ def test_convert_to_srt(self):
 
     def test_convert_to_vtt(self):
         request = {
-            "encoding": "utf-8",
+            "encoding": "utf-8",  # for bwd compat, make sure the encoding is allowed but ignored
             "audio_length": 83.1,
             "output_format": "vtt",
             "xml": self.hej_verden_xml,
@@ -354,7 +334,6 @@ def test_convert_to_vtt(self):
 
     def test_convert_to_bad_format(self):
         request = {
-            "encoding": "utf-8",
             "audio_length": 83.1,
             "output_format": "not_a_known_format",
             "xml": self.hej_verden_xml,
@@ -364,7 +343,6 @@ def test_convert_to_bad_format(self):
         self.assertEqual(response.status_code, 422)
 
         request = {
-            "encoding": "utf-8",
             "audio_length": 83.1,
             # "output_format" just missing
             "xml": self.hej_verden_xml,