Commit 2cbdc8b: bpe-handling jobs

Atanas Gruev committed Nov 30, 2023
1 parent 6c577b8
Showing 4 changed files with 129 additions and 12 deletions.
37 changes: 33 additions & 4 deletions users/gruev/corpus/convert.py
@@ -1,15 +1,17 @@
-__all__ = ["CorpusReplaceOrthFromCallableJob"]
+__all__ = ["CorpusMapOrthJob", "CorpusTextToWordsJob"]
 
-from typing import Callable
+from typing import Optional, Callable
 from i6_core.lib import corpus
+from i6_core.util import uopen
 from sisyphus import *
 
 Path = setup_path(__package__)
 
 
-class CorpusReplaceOrthFromCallableJob(Job):
+class CorpusMapOrthJob(Job):
     """
-    Maps the orth tag according to a function
+    Maps the orth tag according to a (preprocessing) function
     """
 
     def __init__(self, bliss_corpus: Path, preprocess_function: Callable[[str], str]):
@@ -32,3 +34,30 @@ def run(self):
             s.orth = self.preprocess_function(s.orth)
 
         c.dump(self.out_corpus.get_path())
+
+
+class CorpusTextToWordsJob(Job):
+    """
+    Converts a corpus text file to a words file, one word per line
+    """
+
+    def __init__(self, text_file: Path):
+        """
+        :param text_file: plain-text corpus file with one sentence per line
+        """
+        self.text_file = text_file
+        self.out_txt = self.output_path("words.txt")
+
+    def tasks(self):
+        yield Task("run", mini_task=True)
+
+    def run(self):
+        words = set()
+        with uopen(self.text_file, "rt") as f:
+            for line in f:
+                words.update(line.strip().split())
+
+        words = sorted(words)
+        with uopen(self.out_txt.get_path(), "wt") as f:
+            for word in words:
+                f.write(word + "\n")
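
For context, a minimal usage sketch of the two jobs in a Sisyphus config (not from this commit): `bliss_corpus` and `train_text` stand in for existing tk.Path inputs, and `lowercase` is a hypothetical preprocessing function.

    def lowercase(orth: str) -> str:
        # hypothetical example of a preprocess_function
        return orth.lower()

    map_job = CorpusMapOrthJob(bliss_corpus=bliss_corpus, preprocess_function=lowercase)
    words_job = CorpusTextToWordsJob(text_file=train_text)
    tk.register_output("corpus/mapped.xml.gz", map_job.out_corpus)
    tk.register_output("corpus/words.txt", words_job.out_txt)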
58 changes: 55 additions & 3 deletions users/gruev/corpus/filter.py
@@ -1,10 +1,12 @@
-__all__ = ["FilterCorpusRemoveUnknownWordSegmentsJob"]
+__all__ = ["FilterCorpusRemoveUnknownWordSegmentsJob", "FilterTextJob"]
 
 import gzip
 import xml.etree.cElementTree as ET
 from itertools import compress
+from typing import List, Callable
 
 from i6_core.lib import corpus
+from i6_core.util import uopen
 
 from sisyphus import *
@@ -20,8 +22,8 @@ class FilterCorpusRemoveUnknownWordSegmentsJob(Job):
 
     def __init__(
         self,
-        bliss_corpus: tk.Path,
-        bliss_lexicon: tk.Path,
+        bliss_corpus: Path,
+        bliss_lexicon: Path,
         case_sensitive: bool = False,
         all_unknown: bool = True,
     ):
@@ -88,3 +90,53 @@ def unknown_filter(corpus: corpus.Corpus, recording: corpus.Recording, segment:
 
         c.filter_segments(unknown_filter)
         c.dump(self.out_corpus.get_path())
+
+
+# Adapted from Zoltán Tüske's mapRemoveFragSilNoise
+def es_filter(text):
+    wordMap = {
+        "[laughter]-": "[laughter]",
+        "[noise]-": "[noise]",
+        "[vocalized-noise]-": "[vocalized-noise]",
+        "s[laughter]": "[laughter]",
+        "<unk>": "",
+    }
+
+    text = text.strip()
+    words = text.split()
+    newwords = []
+    for word in words:
+        if word in wordMap:
+            word = wordMap[word]
+        if word == "":
+            continue
+        if word.startswith("[") or word.endswith("]"):
+            continue
+        if word.startswith("-") or word.endswith("-"):
+            continue
+        newwords.append(word)
+    text = " ".join(newwords)
+    return text + "\n"
+
+
+class FilterTextJob(Job):
+    """Filter a text file (e.g. a corpus transcription file) line by line"""
+
+    def __init__(self, text_file: Path, filter: Callable[[str], str] = es_filter, gzip: bool = False):
+        """
+        :param text_file: text file to filter
+        :param filter: callable that maps an input line to a filtered output line
+        :param gzip: whether to gzip-compress the output
+        """
+        self.text_file = text_file
+        self.filter = filter
+        self.gzip = gzip
+
+        self.out_txt = self.output_path("filtered.txt" + (".gz" if gzip else ""))
+
+    def tasks(self):
+        yield Task("run", mini_task=True)
+
+    def run(self):
+        with uopen(self.text_file, "rt") as in_f, uopen(self.out_txt.get_path(), "wt") as out_f:
+            for line in in_f:
+                out_f.write(self.filter(line))
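
For context, a minimal usage sketch (not from this commit); `train_text` stands in for an existing tk.Path, and the traced es_filter call is hand-derived from the code above: mapped tokens, bracketed non-speech events, and "-" fragments are dropped.

    filter_job = FilterTextJob(text_file=train_text, gzip=True)  # es_filter is the default filter
    tk.register_output("text/filtered.txt.gz", filter_job.out_txt)

    es_filter("[noise]- yeah s[laughter] <unk> uh- okay")  # -> "yeah okay\n"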
10 changes: 5 additions & 5 deletions users/gruev/lexicon/modification.py
@@ -14,14 +14,14 @@ class HandleSpecialLemmataInLexicon(Job):
     def __init__(
         self,
         bliss_lexicon: tk.Path,
-        symbol_filter: Optional[List[str]] = [],
+        blacklist: Optional[List[str]] = [],
     ):
         """
         :param tk.Path bliss_lexicon
-        :param Optional[List[str]] symbol_filter
+        :param Optional[List[str]] blacklist
         """
         self.bliss_lexicon = bliss_lexicon
-        self.symbol_filter = symbol_filter
+        self.blacklist = blacklist
         self.out_lexicon = self.output_path("lexicon.xml")
 
     def tasks(self):
@@ -40,7 +40,7 @@ def run(self):
         out_lexicon.phonemes.update(in_lexicon.phonemes)
 
         # Remove blacklisted phonemes
-        for symbol in self.symbol_filter:
+        for symbol in self.blacklist:
             out_lexicon.remove_phoneme(symbol)
 
         # Special lemmata
@@ -59,7 +59,7 @@ def run(self):
         out_lexicon.lemmata += [
             lemma
             for lemma in in_lexicon.lemmata
-            if not any(symbol in lemma.orth or symbol in lemma.phon for symbol in self.symbol_filter)
+            if not any(symbol in lemma.orth or symbol in lemma.phon for symbol in self.blacklist)
         ]
 
         write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml())
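
For context, a usage sketch of the renamed parameter (not from this commit); `bliss_lexicon` stands in for an existing tk.Path and the blacklist entries are examples only:

    lexicon_job = HandleSpecialLemmataInLexicon(
        bliss_lexicon=bliss_lexicon,
        blacklist=["[laughter]", "[noise]"],  # example symbols whose phonemes/lemmata are removed
    )
    tk.register_output("lexicon/lexicon.xml", lexicon_job.out_lexicon)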
36 changes: 36 additions & 0 deletions users/gruev/text/processing.py
@@ -0,0 +1,36 @@
+__all__ = ["PasteTextJob"]
+
+from typing import List
+from i6_core.util import uopen
+from sisyphus import *
+
+
+class PasteTextJob(Job):
+    """Merges the lines of text files, similar to the 'paste' command"""
+
+    def __init__(self, text_files: List[tk.Path]):
+        """
+        :param text_files: text files whose lines are joined position-wise
+        """
+        self.text_files = text_files
+
+        self.out_txt = self.output_path("pasted.txt")
+
+    def tasks(self):
+        yield Task("run", mini_task=True)
+
+    def run(self):
+        file_handles = [uopen(text_file, "rt") for text_file in self.text_files]
+
+        with uopen(self.out_txt.get_path(), "wt") as f:
+            while True:
+                # readline() returns "" at EOF; blank lines also strip to ""
+                lines = [fh.readline().strip() for fh in file_handles]
+                if any(line == "" for line in lines):
+                    break
+
+                f.write(" ".join(lines))
+                if not all(line == "" for line in lines):
+                    f.write("\n")
+
+        for fh in file_handles:
+            fh.close()
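
For context, a usage sketch (not from this commit), analogous in spirit to `paste -d ' ' a.txt b.txt`; `text_a` and `text_b` stand in for existing tk.Path inputs. Since each line is stripped before the emptiness check, the merge stops at the shortest file and also at the first blank line in any input.

    paste_job = PasteTextJob(text_files=[text_a, text_b])
    tk.register_output("text/pasted.txt", paste_job.out_txt)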
