Skip to content

Commit

Permalink
test: add test of RAS XML validation
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Feb 10, 2023
1 parent 903ddd5 commit 7dd3072
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 2 deletions.
2 changes: 1 addition & 1 deletion test/data/ej-fra-dna.ras
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<p>
<s>Bonjour.</s>
<s>Je m&#x27;appelle Éric Joanis.</s>
<s>Je suis programmeur au sein de l&#x27;équipe des <foo do-not-align="true">some text to exclude mid-sentence.</foo> technologies pour les langues autochtones au CNRC.</s>
<s>Je suis programmeur au sein de l&#x27;équipe des <span do-not-align="true">some text to exclude mid-sentence.</span> technologies pour les langues autochtones au CNRC.</s>
</p>
</div>
<div type="page">
Expand Down
27 changes: 27 additions & 0 deletions test/data/ej-fra-subword.ras
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along>
<!-- To exclude any element from alignment, add the do-not-align="true" attribute to
it, e.g., <p do-not-align="true">...</p>, or
<s>Some text <span do-not-align="true">do not align this</span> more text</s> -->
<text xml:lang="fra" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0">
<p id="t0b0d0p0">
<s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165"><span id="t0b0d0p0s0w0span0">Bon</span><span id="t0b0d0p0s0w0span1">jour</span></w>.</s>
<s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760" dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="0.220">Éric</w> <w id="t0b0d0p0s1w4" time="2.220" dur="0.370">Joanis</w>.</s>
<s id="t0b0d0p0s2"><w id="t0b0d0p0s2w0" time="2.590" dur="0.030">Je</w> <w id="t0b0d0p0s2w1" time="2.620" dur="0.330">suis</w> <w id="t0b0d0p0s2w2" time="2.950" dur="0.870">programmeur</w> <w id="t0b0d0p0s2w3" time="3.820" dur="0.100">au</w> <w id="t0b0d0p0s2w4" time="3.920" dur="0.230">sein</w> <w id="t0b0d0p0s2w5" time="4.150" dur="0.040">de</w> <w id="t0b0d0p0s2w6" time="4.190" dur="0.030">l</w>'<w id="t0b0d0p0s2w7" time="4.220" dur="0.360">équipe</w> <w id="t0b0d0p0s2w8" time="4.580" dur="0.110">des</w> <w id="t0b0d0p0s2w9" time="4.690" dur="0.560">technologies</w> <w id="t0b0d0p0s2w10" time="5.250" dur="0.310">pour</w> <w id="t0b0d0p0s2w11" time="5.560" dur="0.030">les</w> <w id="t0b0d0p0s2w12" time="5.590" dur="0.280">langues</w> <w id="t0b0d0p0s2w13" time="5.870" dur="0.460">autochtones</w> <w id="t0b0d0p0s2w14" time="6.330" dur="0.080">au</w> <w id="t0b0d0p0s2w15" time="6.410" dur="1.145">CNRC</w>.</s>
</p>
</div>
<div type="page" id="t0b0d1">
<p id="t0b0d1p0">
<s id="t0b0d1p0s0"><w id="t0b0d1p0s0w0" time="7.555" dur="0.485">J</w>'<w id="t0b0d1p0s0w1" time="8.040" dur="0.050">ai</w> <w id="t0b0d1p0s0w2" time="8.090" dur="0.190">fait</w> <w id="t0b0d1p0s0w3" time="8.280" dur="0.060">une</w> <w id="t0b0d1p0s0w4" time="8.340" dur="0.170">bonne</w> <w id="t0b0d1p0s0w5" time="8.510" dur="0.270">partie</w> <w id="t0b0d1p0s0w6" time="8.780" dur="0.030">de</w> <w id="t0b0d1p0s0w7" time="8.810" dur="0.120">ma</w> <w id="t0b0d1p0s0w8" time="8.930" dur="0.350">carrière</w> <w id="t0b0d1p0s0w9" time="9.280" dur="0.110">en</w> <w id="t0b0d1p0s0w10" time="9.390" dur="0.530">traduction</w> <w id="t0b0d1p0s0w11" time="9.920" dur="0.470">automatique</w> <w id="t0b0d1p0s0w12" time="10.390" dur="0.905">statistique</w>, <w id="t0b0d1p0s0w13" time="11.295" dur="0.255">mais</w> <w id="t0b0d1p0s0w14" time="11.550" dur="0.300">maintenant</w> <w id="t0b0d1p0s0w15" time="11.850" dur="0.220">cette</w> <w id="t0b0d1p0s0w16" time="12.070" dur="0.290">approche</w> <w id="t0b0d1p0s0w17" time="12.360" dur="0.050">est</w> <w id="t0b0d1p0s0w18" time="12.410" dur="0.500">déclassée</w> <w id="t0b0d1p0s0w19" time="12.910" dur="0.170">par</w> <w id="t0b0d1p0s0w20" time="13.080" dur="0.030">l</w>'<w id="t0b0d1p0s0w21" time="13.110" dur="0.600">apprentissage</w> <w id="t0b0d1p0s0w22" time="13.710" dur="0.755">profond</w>.</s>
<s id="t0b0d1p0s1"><w id="t0b0d1p0s1w0" time="14.465" dur="0.485">En</w> <w id="t0b0d1p0s1w1" time="14.950" dur="0.050">ce</w> <w id="t0b0d1p0s1w2" time="15.000" dur="0.300">moment</w> <w id="t0b0d1p0s1w3" time="15.300" dur="0.050">je</w> <w id="t0b0d1p0s1w4" time="15.350" dur="0.290">travaille</w> <w id="t0b0d1p0s1w5" time="15.640" dur="0.030">à</w> <w id="t0b0d1p0s1w6" time="15.670" dur="0.030">l</w>'<w id="t0b0d1p0s1w7" time="15.700" dur="0.470">alignement</w> <w id="t0b0d1p0s1w8" time="16.170" dur="0.110">du</w> <w id="t0b0d1p0s1w9" time="16.280" dur="0.540">hansard</w> <w id="t0b0d1p0s1w10" time="16.820" dur="0.070">du</w> <w id="t0b0d1p0s1w11" time="16.890" dur="0.480">Nunavut</w> <w id="t0b0d1p0s1w12" time="17.370" dur="0.130">pour</w> <w id="t0b0d1p0s1w13" time="17.500" dur="0.580">produire</w> <w id="t0b0d1p0s1w14" time="18.080" dur="0.100">un</w> <w id="t0b0d1p0s1w15" time="18.180" dur="0.280">corpus</w> <w id="t0b0d1p0s1w16" time="18.460" dur="0.500">bilingue</w> <w id="t0b0d1p0s1w17" time="18.960" dur="0.360">anglais</w>-<w id="t0b0d1p0s1w18" time="19.320" dur="1.330">inuktitut</w>.</s>
<s id="t0b0d1p0s2"><w id="t0b0d1p0s2w0" time="20.650" dur="0.490">Ce</w> <w id="t0b0d1p0s2w1" time="21.140" dur="0.690">corpus</w> <w id="t0b0d1p0s2w2" time="21.830" dur="0.710">permettra</w> <w id="t0b0d1p0s2w3" time="22.540" dur="0.080">d</w>'<w id="t0b0d1p0s2w4" time="22.620" dur="0.470">entraîner</w> <w id="t0b0d1p0s2w5" time="23.090" dur="0.090">la</w> <w id="t0b0d1p0s2w6" time="23.180" dur="0.360">TA</w>, <w id="t0b0d1p0s2w7" time="23.540" dur="0.600">neuronale</w> <w id="t0b0d1p0s2w8" time="24.140" dur="0.030">ou</w> <w id="t0b0d1p0s2w9" time="24.170" dur="0.920">statistique</w>, <w id="t0b0d1p0s2w10" time="25.090" dur="0.420">ainsi</w> <w id="t0b0d1p0s2w11" time="25.510" dur="0.060">que</w> <w id="t0b0d1p0s2w12" time="25.570" dur="0.100">d</w>'<w id="t0b0d1p0s2w13" time="25.670" dur="0.410">autres</w> <w id="t0b0d1p0s2w14" time="26.080" dur="0.680">applications</w> <w id="t0b0d1p0s2w15" time="26.760" dur="0.030">de</w> <w id="t0b0d1p0s2w16" time="26.790" dur="0.260">traitement</w> <w id="t0b0d1p0s2w17" time="27.050" dur="0.160">du</w> <w id="t0b0d1p0s2w18" time="27.210" dur="0.790">langage</w> <w id="t0b0d1p0s2w19" time="28.000" dur="0.710">naturel</w>.</s>
</p>
<p id="t0b0d1p1">
<s id="t0b0d1p1s0"><w id="t0b0d1p1s0w0" time="28.710" dur="0.510">En</w> <w id="t0b0d1p1s0w1" time="29.220" dur="0.680">parallèle</w>, <w id="t0b0d1p1s0w2" time="29.900" dur="0.130">j</w>'<w id="t0b0d1p1s0w3" time="30.030" dur="0.180">aide</w> <w id="t0b0d1p1s0w4" time="30.210" dur="0.030">à</w> <w id="t0b0d1p1s0w5" time="30.240" dur="0.530">écrire</w> <w id="t0b0d1p1s0w6" time="30.770" dur="0.040">des</w> <w id="t0b0d1p1s0w7" time="30.810" dur="0.310">tests</w> <w id="t0b0d1p1s0w8" time="31.120" dur="0.170">pour</w> <w id="t0b0d1p1s0w9" time="31.290" dur="0.310">rendre</w> <w id="t0b0d1p1s0w10" time="31.600" dur="0.030">le</w> <w id="t0b0d1p1s0w11" time="31.630" dur="0.510">ReadAlong</w>-<w id="t0b0d1p1s0w12" time="32.140" dur="0.520">Studio</w> <w id="t0b0d1p1s0w13" time="32.660" dur="0.110">plus</w> <w id="t0b0d1p1s0w14" time="32.770" dur="0.610">robuste</w>.</s>
</p>
</div>
</body>
</text>
</read-along>
20 changes: 20 additions & 0 deletions test/data/ej-fra-translated.ras
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along>
<!-- To exclude any element from alignment, add the do-not-align="true" attribute to
it, e.g., <p do-not-align="true">...</p>, or
<s>Some text <span do-not-align="true">do not align this</span> more text</s> -->
<text xml:lang="fra" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0" class="two-column-layout-page">
<graphic url="avatar.png" id="t0b0d0graphic0"/>
<p id="t0b0d0p0">
<s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165">Bonjour</w>.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s0" class="sentence__translation">Hello.</s>
<s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760" dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="1.705">Éric</w> <w id="t0b0d0p0s1w4" time="3.705" dur="1.905">Joanis</w>.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s1" class="sentence__translation">My name is Éric Joanis.</s>
</p>
</div>

</body>
</text>
</read-along>
4 changes: 3 additions & 1 deletion test/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,18 @@
from test_config import TestConfig
from test_dna_text import TestDNAText
from test_dna_utils import TestDNAUtils
from test_dtd import TestDTD
from test_force_align import TestForceAlignment, TestXHTML
from test_g2p_cli import TestG2pCli
from test_make_xml_cli import TestMakeXMLCli
from test_misc import TestMisc
from test_package_urls import TestPackageURLs
from test_silence import TestSilence
from test_smil import TestSmilUtilities
from test_temp_file import TestTempFile
from test_tokenize_cli import TestTokenizeCli
from test_tokenize_xml import TestTokenizer
from test_web_api import TestWebApi
from test_smil import TestSmilUtilities

from readalongs.log import LOGGER

Expand Down Expand Up @@ -69,6 +70,7 @@
TestSmilUtilities,
TestPackageURLs,
TestWebApi,
TestDTD,
]
]

Expand Down
63 changes: 63 additions & 0 deletions test/test_dtd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python

"""Test our XML DTD to make sure all valid examples validate and invalid ones don't"""

import os
from os.path import dirname
from unittest import TestCase, main

from lxml import etree

# Path to the DTD under test, resolved relative to this test module.
DTDPATH = os.path.join(
    dirname(__file__),
    "..",
    "readalongs",
    "static",
    "read-along-0.2.dtd",
)

# Files in the data directory that are expected to validate against the DTD.
VALID_RAS = [
    "ej-fra-anchors2.ras",
    "ej-fra-anchors.ras",
    "ej-fra-converted.ras",
    "ej-fra-dna.ras",
    "ej-fra-package.ras",
    "ej-fra.ras",
    "ej-fra-silence.ras",
    "ej-fra-subword.ras",
    "ej-fra-translated.ras",
    "fra-prepared.ras",
    "fra-tokenized.ras",
    "mixed-langs.g2p.ras",
    "mixed-langs.ras",
    "mixed-langs.tokenized.ras",
    "patrickxtlan.ras",
]

# Files in the data directory that are expected to be rejected by the DTD.
INVALID_RAS = [
    "ej-fra-invalid.ras",
]


class TestDTD(TestCase):
    """Test the XML DTD.

    Every file named in VALID_RAS must validate against the DTD loaded from
    DTDPATH, and every file named in INVALID_RAS must be rejected by it.
    """

    def setUp(self):
        """Load the DTD from DTDPATH into ``self.dtd`` before each test."""
        # Explicit encoding so reading the DTD does not depend on the
        # platform's default locale encoding.
        with open(DTDPATH, "rt", encoding="utf-8") as infh:
            self.dtd = etree.DTD(infh)

    def _validates(self, name):
        """Parse ``name`` from the data directory and validate it.

        Args:
            name: file name relative to the ``data`` directory that sits
                beside this test module.

        Returns:
            The result of ``self.dtd.validate`` on the parsed document.
        """
        path = os.path.join(dirname(__file__), "data", name)
        # DTD is text, XML is binary... okay
        with open(path, "rb") as infh:
            parsed = etree.parse(infh)
        return self.dtd.validate(parsed)

    def test_valid_inputs(self):
        """Every file in VALID_RAS validates against the DTD."""
        for name in VALID_RAS:
            # subTest keeps going after a failure, so one bad file does not
            # hide problems in the files listed after it.
            with self.subTest(name=name):
                is_valid = self._validates(name)
                # The error log is read after validate() so that it describes
                # this file's failure.
                self.assertTrue(
                    is_valid,
                    f"{name} does not validate: {self.dtd.error_log}",
                )

    def test_invalid_inputs(self):
        """Every file in INVALID_RAS is rejected by the DTD."""
        for name in INVALID_RAS:
            with self.subTest(name=name):
                self.assertFalse(
                    self._validates(name), f"{name} validates but shouldn't"
                )


# Run this module's tests through unittest when the file is executed directly.
if __name__ == "__main__":
    main()

0 comments on commit 7dd3072

Please sign in to comment.