Skip to content

Commit

Permalink
Add left/right context orth to lib.corpus (#564)
Browse files Browse the repository at this point in the history
Co-authored-by: Albert Zeyer <zeyer@cs.rwth-aachen.de>
Co-authored-by: Eugen Beck <curufinwe@users.noreply.github.com>
  • Loading branch information
3 people authored Dec 9, 2024
1 parent ab07cf4 commit 84f12cf
Showing 1 changed file with 43 additions and 9 deletions.
52 changes: 43 additions & 9 deletions lib/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,15 +116,15 @@ def startElement(self, name: str, attrs: Dict[str, str]):
def endElement(self, name: str):
e = self.elements[-1]

if name == "orth":
if name in {"orth", "left-context-orth", "right-context-orth"}:
assert isinstance(e, Segment)
# We post-process the text inside the orth (and left/right context orth) tags to get nicer formatting. Some corpora
# have multiline content in these tags, but keeping it that way might not be consistent with the indentation used
# during writing, so we collapse runs of spaces into a single space and remove newlines.
text = self.chars.strip()
text = re.sub(" +", " ", text)
text = re.sub("\n", "", text)
e.orth = text
setattr(e, name.replace("-", "_"), text)
elif isinstance(e, Speaker) and name != "speaker-description":
# we allow all sorts of elements within a speaker description
e.attribs[name] = self.chars.strip()
Expand Down Expand Up @@ -356,15 +356,39 @@ def get_segment_mapping(self) -> Dict[str, Segment]:


class Segment(NamedEntity):
def __init__(self):
def __init__(
self,
*,
start: float = 0.0,
end: float = 0.0,
track: Optional[int] = None,
orth: Optional[str] = None,
left_context_orth: Optional[str] = None,
right_context_orth: Optional[str] = None,
speaker_name: Optional[str] = None,
recording: Optional[Recording] = None,
):
"""
:param start: Segment start.
:param end: Segment end.
:param track: Segment track/channel.
:param orth: Segment text.
:param left_context_orth: Optional left context when aligning (specific for RASR alignment).
:param right_context_orth: Optional right context when aligning (specific for RASR alignment).
:param speaker_name: Speaker name.
:param recording: Recording in which the segment is embedded.
"""
super().__init__()
self.start = 0.0
self.end = 0.0
self.track: Optional[int] = None
self.orth: Optional[str] = None
self.speaker_name: Optional[str] = None

self.recording: Optional[Recording] = None
self.start = start
self.end = end
self.track = track
self.orth = orth
self.left_context_orth = left_context_orth
self.right_context_orth = right_context_orth
self.speaker_name = speaker_name

self.recording = recording

def fullname(self) -> str:
return self.recording.fullname() + "/" + self.name
Expand All @@ -384,6 +408,16 @@ def dump(self, out: TextIO, indentation: str = ""):
out.write('%s <speaker name="%s"/>\n' % (indentation, self.speaker_name))
if self.orth is not None:
out.write("%s <orth> %s </orth>\n" % (indentation, saxutils.escape(self.orth)))
if self.left_context_orth is not None:
out.write(
"%s <left-context-orth> %s </left-context-orth>\n"
% (indentation, saxutils.escape(self.left_context_orth))
)
if self.right_context_orth is not None:
out.write(
"%s <right-context-orth> %s </right-context-orth>\n"
% (indentation, saxutils.escape(self.right_context_orth))
)
if has_child_element:
out.write("%s</segment>\n" % indentation)
else:
Expand Down

0 comments on commit 84f12cf

Please sign in to comment.