Skip to content

Commit

Permalink
feat: add silence insertion feature
Browse files Browse the repository at this point in the history
  • Loading branch information
roedoejet committed Oct 5, 2021
1 parent f189614 commit 1663779
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 5 deletions.
47 changes: 42 additions & 5 deletions readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import io
import os
import shutil
from collections import defaultdict
from dataclasses import dataclass
from datetime import timedelta
from typing import Dict, List, Union
Expand All @@ -21,6 +22,7 @@
import regex as re
import soundswallower
from lxml import etree
from pydub import AudioSegment
from pydub.exceptions import CouldntEncodeError
from pympi.Praat import TextGrid
from webvtt import Caption, WebVTT
Expand Down Expand Up @@ -193,7 +195,7 @@ def align_audio( # noqa: C901
Raises:
TODO
"""
results: Dict[str, List] = {"words": []}
results: Dict[str, List] = {"words": [], "audio": None}

# First do G2P
try:
Expand Down Expand Up @@ -413,7 +415,36 @@ def frames_to_time(frames):
)

split_silences(results["words"], final_end, dna_for_silence_splitting)

words_dict = {
x["id"]: {"start": x["start"], "end": x["end"]} for x in results["words"]
}
silence_offsets = defaultdict(int)
silence = 0
if results["tokenized"].xpath("//*[@silence]"):
endpoint = 0
for el in results["tokenized"].xpath("//*"):
if "silence" in el.attrib:
silence_ms = int(el.attrib["silence"]) # get silence (ms)
silence_segment = AudioSegment.silent(
duration=silence_ms
) # create silence segment
silence += silence_ms # add silence length to total silence
audio = (
audio[:endpoint] + silence_segment + audio[endpoint:]
) # insert silence at previous endpoint
endpoint += silence_ms # add silence to previous endpoint
if el.tag == "w":
silence_offsets[el.attrib["id"]] += (
silence / 1000
) # add silence to silence offset for word id
endpoint = (
words_dict[el.attrib["id"]]["end"] * 1000
) + silence # bump endpoint and include silence
if silence:
for word in results["words"]:
word["start"] += silence_offsets[word["id"]]
word["end"] += silence_offsets[word["id"]]
results["audio"] = audio
return results


Expand All @@ -429,6 +460,7 @@ def save_readalong(
closed_captioning: bool = False,
output_xhtml: bool = False,
audiofile: str,
audiosegment: AudioSegment = None,
html: bool = False,
):
"""Save the results from align_audio() into the otuput files required for a
Expand All @@ -444,14 +476,15 @@ def save_readalong(
closed_captioning (bool, optional): if True, also save in .vtt and .srt subtitle formats
output_xhtml (bool, optional): if True, convert XML into XHTML format before writing
audiofile (str): path to the audio file passed to align_audio()
audiosegment (AudioSegment): a pydub.AudioSegment object of processed audio.
if None, then original audio will be saved at `audiofile`
Returns:
None
Raises:
[TODO]
"""

# Round all times to three digits, anything more is excess precision
# polluting the output files, and usually due to float rounding errors anyway.
for w in align_results["words"]:
Expand Down Expand Up @@ -485,8 +518,12 @@ def save_readalong(
save_xml(tokenized_xml_path, align_results["tokenized"])

_, audio_ext = os.path.splitext(audiofile)
audio_path = output_base + audio_ext
shutil.copy(audiofile, audio_path)
if audiosegment:
audio_path = output_base + ".wav"
audiosegment.export(audio_path, format="wav")
else:
audio_path = output_base + audio_ext
shutil.copy(audiofile, audio_path)

smil_path = output_base + ".smil"
smil = make_smil(
Expand Down
1 change: 1 addition & 0 deletions readalongs/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ def align(**kwargs): # noqa: C901
closed_captioning=kwargs["closed_captioning"],
output_xhtml=kwargs["output_xhtml"],
audiofile=kwargs["audiofile"],
audiosegment=results["audio"],
html=kwargs["html"],
)

Expand Down
27 changes: 27 additions & 0 deletions test/data/ej-fra-silence.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version='1.0' encoding='utf-8'?>
<TEI>
<!-- To exclude any element from alignment, add the do-not-align="true" attribute to
it, e.g., <p do-not-align="true">...</p>, or
<s>Some text <foo do-not-align="true">do not align this</foo> more text</s> -->
<!-- Test fixture for silence insertion: an element carrying silence="N" asks the
aligner to insert N milliseconds of silence into the output audio at that point in
the text. This file requests 1000 + 1382 + 500 = 2882 ms of silence in total. -->
<text xml:lang="fra">
<body>
<div type="page">
<p>
<s><span silence="1000"></span>Bonjour.</s>
<s>Je m'appelle Éric Joanis.</s>
<s>Je suis <span silence="1382"></span> programmeur au sein <span silence="500"></span> de l'équipe des technologies pour les langues autochtones au CNRC.</s>
</p>
</div>
<div type="page">
<p>
<s>J'ai fait une bonne partie de ma carrière en traduction automatique statistique, mais maintenant cette approche est déclassée par l'apprentissage profond.</s>
<s>En ce moment je travaille à l'alignement du hansard du Nunavut pour produire un corpus bilingue anglais-inuktitut.</s>
<s>Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que d'autres applications de traitement du langage naturel.</s>
</p>
<p>
<s>En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus robuste.</s>
</p>
</div>
</body>
</text>
</TEI>
2 changes: 2 additions & 0 deletions test/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from test_indices import TestIndices
from test_misc import TestMisc
from test_prepare_cli import TestPrepareCli
from test_silence import TestSilence
from test_temp_file import TestTempFile
from test_tokenize_cli import TestTokenizeCli
from test_tokenize_xml import TestTokenizer
Expand Down Expand Up @@ -44,6 +45,7 @@
TestAlignCli,
TestG2pCli,
TestMisc,
TestSilence,
]
]

Expand Down
45 changes: 45 additions & 0 deletions test/test_silence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
from unittest import main

from lxml import etree
from pydub import AudioSegment
from utils import BasicTestCase

from readalongs.cli import align


class TestSilence(BasicTestCase):
    """Tests for the silence insertion feature of the ``align`` CLI command."""

    def test_basic_silence_insertion(self):
        """Align the silence fixture and check the XML spans and the padded audio."""
        fixtures = self.data_dir
        output = os.path.join(self.tempdir, "silence")
        xml_input = os.path.join(fixtures, "ej-fra-silence.xml")
        audio_input = os.path.join(fixtures, "ej-fra.m4a")

        # Run align from xml
        cli_args = ["-s", "-C", "-t", "-l", "fra", xml_input, audio_input, output]
        results = self.runner.invoke(align, cli_args)
        self.assertEqual(results.exit_code, 0)

        # The processed audio is expected as a .wav next to the other outputs.
        wav_output = os.path.join(output, "silence.wav")
        self.assertTrue(os.path.exists(wav_output))

        # test silence spans in output xml
        with open(os.path.join(output, "silence.xml"), "rb") as xml_file:
            root = etree.fromstring(xml_file.read())
        self.assertEqual(len(root.xpath("//*[@silence]")), 3)

        # test audio has correct amount of silence added:
        # the fixture asks for 1000 + 1382 + 500 ms, and len() of an
        # AudioSegment is its duration in milliseconds.
        original_audio = AudioSegment.from_file(audio_input)
        new_audio = AudioSegment.from_wav(wav_output)
        added_ms = len(new_audio) - len(original_audio)
        self.assertEqual(added_ms, 2882)


# When this module is executed directly, hand control to unittest's main().
if __name__ == "__main__":
    main()

0 comments on commit 1663779

Please sign in to comment.