Commit 2cbdc8b: bpe-handling jobs

Atanas Gruev committed Nov 30, 2023
1 parent 6c577b8
Showing 4 changed files with 129 additions and 12 deletions.
37 changes: 33 additions & 4 deletions users/gruev/corpus/convert.py
@@ -1,15 +1,17 @@
-__all__ = ["CorpusReplaceOrthFromCallableJob"]
+__all__ = ["CorpusMapOrthJob", "CorpusTextToWordsJob"]
 
-from typing import Callable
+from typing import Optional, Callable
 from i6_core.lib import corpus
+from i6_core.util import uopen
 from sisyphus import *
 
 Path = setup_path(__package__)
 
 
-class CorpusReplaceOrthFromCallableJob(Job):
+class CorpusMapOrthJob(Job):
     """
-    Maps the orth tag according to a function
+    Maps the orth tag according to a (preprocessing) function
     """
 
     def __init__(self, bliss_corpus: Path, preprocess_function: Callable[[str], str]):
@@ -32,3 +34,30 @@ def run(self):
             s.orth = self.preprocess_function(s.orth)
 
         c.dump(self.out_corpus.get_path())
+
+
+class CorpusTextToWordsJob(Job):
+    """
+    Converts a corpus text file to a words file, one word per line
+    """
+
+    def __init__(self, text_file: Path):
+        """
+        :param text_file: plain-text corpus file with one sentence per line
+        """
+        self.text_file = text_file
+        self.out_txt = self.output_path("words.txt")
+
+    def tasks(self):
+        yield Task("run", mini_task=True)
+
+    def run(self):
+        words = set()
+        with uopen(self.text_file, "rt") as f:
+            for line in f:
+                words.update(line.strip().split())
+
+        words = sorted(words)
+        with uopen(self.out_txt.get_path(), "wt") as f:
+            for word in words:
+                f.write(word + "\n")
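
For context, a minimal usage sketch of the two jobs in a Sisyphus config (not from this commit): `bliss_corpus` and `train_text` stand in for existing tk.Path inputs, and `lowercase` is a hypothetical preprocessing function.

    def lowercase(orth: str) -> str:
        # hypothetical example of a preprocess_function
        return orth.lower()

    map_job = CorpusMapOrthJob(bliss_corpus=bliss_corpus, preprocess_function=lowercase)
    words_job = CorpusTextToWordsJob(text_file=train_text)
    tk.register_output("corpus/mapped.xml.gz", map_job.out_corpus)
    tk.register_output("corpus/words.txt", words_job.out_txt)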
58 changes: 55 additions & 3 deletions users/gruev/corpus/filter.py
@@ -1,10 +1,12 @@
-__all__ = ["FilterCorpusRemoveUnknownWordSegmentsJob"]
+__all__ = ["FilterCorpusRemoveUnknownWordSegmentsJob", "FilterTextJob"]
 
 import gzip
 import xml.etree.cElementTree as ET
 from itertools import compress
+from typing import List, Callable
 
 from i6_core.lib import corpus
+from i6_core.util import uopen
 
 from sisyphus import *
@@ -20,8 +22,8 @@ class FilterCorpusRemoveUnknownWordSegmentsJob(Job):
 
     def __init__(
         self,
-        bliss_corpus: tk.Path,
-        bliss_lexicon: tk.Path,
+        bliss_corpus: Path,
+        bliss_lexicon: Path,
         case_sensitive: bool = False,
         all_unknown: bool = True,
     ):
@@ -88,3 +90,53 @@ def unknown_filter(corpus: corpus.Corpus, recording: corpus.Recording, segment:
 
         c.filter_segments(unknown_filter)
         c.dump(self.out_corpus.get_path())
+
+
+# Adapted from Zoltán Tüske's mapRemoveFragSilNoise
+def es_filter(text):
+    wordMap = {
+        "[laughter]-": "[laughter]",
+        "[noise]-": "[noise]",
+        "[vocalized-noise]-": "[vocalized-noise]",
+        "s[laughter]": "[laughter]",
+        "<unk>": "",
+    }
+
+    text = text.strip()
+    words = text.split()
+    newwords = []
+    for word in words:
+        if word in wordMap:
+            word = wordMap[word]
+        if word == "":
+            continue
+        if word.startswith("[") or word.endswith("]"):
+            continue
+        if word.startswith("-") or word.endswith("-"):
+            continue
+        newwords.append(word)
+    text = " ".join(newwords)
+    return text + "\n"
+
+
+class FilterTextJob(Job):
+    """Filter a text file (e.g. a corpus transcription file) line by line"""
+
+    def __init__(self, text_file: Path, filter: Callable[[str], str] = es_filter, gzip: bool = False):
+        """
+        :param text_file: text file to filter
+        :param filter: callable that maps an input line to a filtered output line
+        :param gzip: whether to gzip-compress the output
+        """
+        self.text_file = text_file
+        self.filter = filter
+        self.gzip = gzip
+
+        self.out_txt = self.output_path("filtered.txt" + (".gz" if gzip else ""))
+
+    def tasks(self):
+        yield Task("run", mini_task=True)
+
+    def run(self):
+        with uopen(self.text_file, "rt") as in_f, uopen(self.out_txt.get_path(), "wt") as out_f:
+            for line in in_f:
+                out_f.write(self.filter(line))
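
For context, a minimal usage sketch (not from this commit); `train_text` stands in for an existing tk.Path, and the traced es_filter call is hand-derived from the code above: mapped tokens, bracketed non-speech events, and "-" fragments are dropped.

    filter_job = FilterTextJob(text_file=train_text, gzip=True)  # es_filter is the default filter
    tk.register_output("text/filtered.txt.gz", filter_job.out_txt)

    es_filter("[noise]- yeah s[laughter] <unk> uh- okay")  # -> "yeah okay\n"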
10 changes: 5 additions & 5 deletions users/gruev/lexicon/modification.py
@@ -14,14 +14,14 @@ class HandleSpecialLemmataInLexicon(Job):
     def __init__(
         self,
         bliss_lexicon: tk.Path,
-        symbol_filter: Optional[List[str]] = [],
+        blacklist: Optional[List[str]] = [],
     ):
         """
         :param tk.Path bliss_lexicon
-        :param Optional[List[str]] symbol_filter
+        :param Optional[List[str]] blacklist
         """
         self.bliss_lexicon = bliss_lexicon
-        self.symbol_filter = symbol_filter
+        self.blacklist = blacklist
         self.out_lexicon = self.output_path("lexicon.xml")
 
     def tasks(self):
@@ -40,7 +40,7 @@ def run(self):
         out_lexicon.phonemes.update(in_lexicon.phonemes)
 
         # Remove blacklisted phonemes
-        for symbol in self.symbol_filter:
+        for symbol in self.blacklist:
             out_lexicon.remove_phoneme(symbol)
 
         # Special lemmata
@@ -59,7 +59,7 @@ def run(self):
         out_lexicon.lemmata += [
             lemma
             for lemma in in_lexicon.lemmata
-            if not any(symbol in lemma.orth or symbol in lemma.phon for symbol in self.symbol_filter)
+            if not any(symbol in lemma.orth or symbol in lemma.phon for symbol in self.blacklist)
         ]
 
         write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml())
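
For context, a usage sketch of the renamed parameter (not from this commit); `bliss_lexicon` stands in for an existing tk.Path and the blacklist entries are examples only:

    lexicon_job = HandleSpecialLemmataInLexicon(
        bliss_lexicon=bliss_lexicon,
        blacklist=["[laughter]", "[noise]"],  # example symbols whose phonemes/lemmata are removed
    )
    tk.register_output("lexicon/lexicon.xml", lexicon_job.out_lexicon)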
36 changes: 36 additions & 0 deletions users/gruev/text/processing.py
@@ -0,0 +1,36 @@
+__all__ = ["PasteTextJob"]
+
+from typing import List
+from i6_core.util import uopen
+from sisyphus import *
+
+
+class PasteTextJob(Job):
+    """Merges the lines of text files, similar to the 'paste' command"""
+
+    def __init__(self, text_files: List[tk.Path]):
+        """
+        :param text_files: text files whose lines are joined position-wise
+        """
+        self.text_files = text_files
+
+        self.out_txt = self.output_path("pasted.txt")
+
+    def tasks(self):
+        yield Task("run", mini_task=True)
+
+    def run(self):
+        file_handles = [uopen(text_file, "rt") for text_file in self.text_files]
+
+        with uopen(self.out_txt.get_path(), "wt") as f:
+            while True:
+                # readline() returns "" at EOF; blank lines also strip to ""
+                lines = [fh.readline().strip() for fh in file_handles]
+                if any(line == "" for line in lines):
+                    break
+
+                f.write(" ".join(lines))
+                if not all(line == "" for line in lines):
+                    f.write("\n")
+
+        for fh in file_handles:
+            fh.close()
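
For context, a usage sketch (not from this commit), analogous in spirit to `paste -d ' ' a.txt b.txt`; `text_a` and `text_b` stand in for existing tk.Path inputs. Since each line is stripped before the emptiness check, the merge stops at the shortest file and also at the first blank line in any input.

    paste_job = PasteTextJob(text_files=[text_a, text_b])
    tk.register_output("text/pasted.txt", paste_job.out_txt)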
