Skip to content

Commit

Permalink
Add tests for LazyNeMoIterator and fix case with metadata_only=True a…
Browse files Browse the repository at this point in the history
…nd offsets in manifest (NVIDIA#10198)

* Add tests for LazyNeMoIterator and fix case with metadata_only=True and offsets in manifest

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

* Address code review

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

* fix tests

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

* fix tests

Signed-off-by: Piotr Żelasko <petezor@gmail.com>

---------

Signed-off-by: Piotr Żelasko <petezor@gmail.com>
  • Loading branch information
pzelasko authored and WoodieDudy committed Aug 26, 2024
1 parent 8482971 commit 18bae50
Show file tree
Hide file tree
Showing 2 changed files with 228 additions and 15 deletions.
55 changes: 40 additions & 15 deletions nemo/collections/common/data/lhotse/nemo_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import lhotse.serialization
import soundfile
from cytoolz import groupby
from lhotse import AudioSource, Recording, SupervisionSegment
from lhotse import AudioSource, MonoCut, Recording, SupervisionSegment
from lhotse.audio.backend import LibsndfileBackend
from lhotse.cut import Cut
from lhotse.dataset.dataloading import resolve_seed
Expand Down Expand Up @@ -112,11 +112,9 @@ def __iter__(self) -> Generator[Cut, None, None]:
audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path))
duration = data.pop("duration")
offset = data.pop("offset", None)
recording = self._create_recording(audio_path, duration, data.pop("sampling_rate", None))
cut = recording.to_cut()
if offset is not None:
cut = cut.truncate(offset=offset, duration=duration, preserve_id=True)
cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}"
cut = self._create_cut(
audio_path=audio_path, offset=offset, duration=duration, sampling_rate=data.pop("sampling_rate", None)
)
# Note that start=0 and not start=offset because supervision's start is relative to the
# start of the cut; and cut.start is already set to offset
cut.supervisions.append(
Expand All @@ -140,6 +138,42 @@ def __len__(self) -> int:
def __add__(self, other):
return LazyIteratorChain(self, other)

def _create_cut(
    self,
    audio_path: str,
    offset: float | None,
    duration: float,
    sampling_rate: int | None = None,
) -> Cut:
    """
    Build the cut for a single manifest entry.

    With ``self.metadata_only`` unset, the recording is obtained from
    ``self._create_recording`` and converted to a cut. If ``offset`` is given, the cut
    is truncated to start at ``offset`` and last ``duration``, and its ID gets a suffix
    made of the offset and the duration, each multiplied by 100, rounded and zero-padded
    to 6 digits. If ``offset`` is None, the cut covers the recording as-is.

    With ``self.metadata_only`` set, no recording is created through
    ``self._create_recording``. The returned ``MonoCut`` carries the requested start and
    duration, but it is backed by a placeholder ``Recording`` (source type ``"dummy"``,
    empty source path) whose duration is ``offset + duration``.

    Args:
        audio_path: Path of the audio file; in metadata-only mode it is also used as the
            ID of both the cut and the placeholder recording.
        offset: Start of the segment within the audio file, or None when the manifest
            entry has no offset (treated as 0.0 in metadata-only mode).
        duration: Duration of the segment.
        sampling_rate: Sampling rate from the manifest entry, if any. In metadata-only
            mode, 16000 is substituted when it is None.

    Returns:
        The cut describing the requested segment, with no supervisions attached.
    """
    if self.metadata_only:
        # Only metadata requested.
        # The Cut metadata is accurate, but the Recording metadata is a stand-in so that
        # we avoid the IO penalty of inspecting the audio file (Lhotse manifests contain
        # more information than NeMo manifests, so for actual dataloading we would have
        # to fill it in using the audio file).
        sr = ifnone(sampling_rate, 16000)  # fake sampling rate
        start = ifnone(offset, 0.0)
        # The real file length is unknown here; the end of this segment is the only
        # duration we can claim for the placeholder recording.
        end = start + duration
        return MonoCut(
            id=audio_path,
            start=start,
            duration=duration,
            channel=0,
            supervisions=[],
            recording=Recording(
                id=audio_path,
                sources=[AudioSource(type="dummy", channels=[0], source="")],
                sampling_rate=sr,
                duration=end,
                num_samples=compute_num_samples(end, sr),
            ),
        )

    recording = self._create_recording(audio_path, duration, sampling_rate)
    cut = recording.to_cut()
    if offset is not None:
        cut = cut.truncate(offset=offset, duration=duration, preserve_id=True)
        # Several manifest entries may point into the same file, so the segment
        # position is encoded in the ID.
        cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}"
    return cut

def _create_recording(
self,
audio_path: str,
Expand All @@ -156,15 +190,6 @@ def _create_recording(
duration=duration,
channel_ids=[0],
)
elif self.metadata_only:
return Recording(
id=audio_path,
sources=[AudioSource(type="file", channels=[0], source=audio_path)],
sampling_rate=-1,
num_samples=-1,
duration=duration,
channel_ids=[0],
)
else:
return Recording.from_file(audio_path)

Expand Down
188 changes: 188 additions & 0 deletions tests/collections/common/test_lhotse_nemo_adapters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import numpy as np
import pytest
from lhotse import AudioSource, CutSet, MonoCut, Recording, SupervisionSegment
from lhotse.serialization import save_to_jsonl
from lhotse.testing.dummies import DummyManifest

from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator


@pytest.fixture
def nemo_manifest_path(tmp_path_factory):
    """Write a NeMo manifest describing 2 utterances of 1s each and return its path."""
    workdir = tmp_path_factory.mktemp("nemo_data")
    dummy_cuts = DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True)
    saved_cuts = dummy_cuts.save_audios(workdir, progress_bar=False)
    # One manifest entry per cut, pointing at the audio file saved above.
    entries = [
        {
            "audio_filepath": cut.recording.sources[0].source,
            "text": "irrelevant",
            "duration": cut.duration,
            "lang": "en",
        }
        for cut in saved_cuts
    ]
    manifest_path = workdir / "nemo_manifest.json"
    save_to_jsonl(entries, manifest_path)
    return manifest_path


def test_lazy_nemo_iterator(nemo_manifest_path):
    """Cuts read from a plain NeMo manifest expose full cut, recording, audio and supervision data."""
    cut_set = CutSet(LazyNeMoIterator(nemo_manifest_path))

    assert len(cut_set) == 2

    for cut in cut_set:
        # Cut-level metadata.
        assert isinstance(cut, MonoCut)
        assert cut.start == 0.0
        assert cut.duration == 1.0
        assert cut.num_channels == 1
        assert cut.sampling_rate == 16000
        assert cut.num_samples == 16000

        # Recording-level metadata.
        assert cut.has_recording
        rec = cut.recording
        assert isinstance(rec, Recording)
        assert rec.duration == 1.0
        assert rec.num_channels == 1
        assert rec.num_samples == 16000
        assert len(rec.sources) == 1
        src = rec.sources[0]
        assert isinstance(src, AudioSource)
        assert src.type == "file"

        # The audio itself is loadable.
        samples = cut.load_audio()
        assert isinstance(samples, np.ndarray)
        assert samples.shape == (1, 16000)
        assert samples.dtype == np.float32

        # Exactly one supervision, built from the manifest fields.
        assert len(cut.supervisions) == 1
        sup = cut.supervisions[0]
        assert isinstance(sup, SupervisionSegment)
        assert sup.start == 0
        assert sup.duration == 1
        assert sup.channel == 0
        assert sup.text == "irrelevant"
        assert sup.language == "en"


@pytest.fixture
def nemo_offset_manifest_path(tmp_path_factory):
    """
    Write a NeMo manifest describing 4 utterances of 0.5s each and return its path.

    The utterances are derived from two audio files of 1s duration, so
    two of them have offset 0 and the other two have offset 0.5.
    """
    workdir = tmp_path_factory.mktemp("nemo_data_offset")
    dummy_cuts = DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True)
    saved_cuts = dummy_cuts.save_audios(workdir, progress_bar=False)
    windows = saved_cuts.cut_into_windows(duration=0.5, hop=0.5)
    # One manifest entry per window; the window start becomes the manifest offset.
    entries = [
        {
            "audio_filepath": window.recording.sources[0].source,
            "text": "irrelevant",
            "offset": window.start,
            "duration": window.duration,
            "lang": "en",
        }
        for window in windows
    ]
    manifest_path = workdir / "nemo_manifest.json"
    save_to_jsonl(entries, manifest_path)
    return manifest_path


def test_lazy_nemo_iterator_with_offset(nemo_offset_manifest_path):
    """Manifest entries with offsets become 0.5s cuts over the full 1s recordings."""
    cut_set = CutSet(LazyNeMoIterator(nemo_offset_manifest_path))

    assert len(cut_set) == 4

    for position, cut in enumerate(cut_set):
        # The fixture starts from 1 cut per 1s audio file and splits each into
        # 0.5s windows, giving 4 cuts in total: those at even positions start
        # at 0s and those at odd positions start at 0.5s.
        expected_start = 0.0 if position % 2 == 0 else 0.5

        # Cut-level metadata.
        assert isinstance(cut, MonoCut)
        assert cut.start == expected_start
        assert cut.duration == 0.5
        assert cut.num_channels == 1
        assert cut.sampling_rate == 16000
        assert cut.num_samples == 8000

        # Recording-level metadata still describes the whole file.
        assert cut.has_recording
        rec = cut.recording
        assert isinstance(rec, Recording)
        assert rec.duration == 1.0
        assert rec.num_channels == 1
        assert rec.num_samples == 16000
        assert len(rec.sources) == 1
        src = rec.sources[0]
        assert isinstance(src, AudioSource)
        assert src.type == "file"

        # Only the requested window is loaded.
        samples = cut.load_audio()
        assert isinstance(samples, np.ndarray)
        assert samples.shape == (1, 8000)
        assert samples.dtype == np.float32

        # Exactly one supervision, positioned relative to the cut.
        assert len(cut.supervisions) == 1
        sup = cut.supervisions[0]
        assert isinstance(sup, SupervisionSegment)
        assert sup.start == 0
        assert sup.duration == 0.5
        assert sup.channel == 0
        assert sup.text == "irrelevant"
        assert sup.language == "en"


def test_lazy_nemo_iterator_with_offset_metadata_only(nemo_offset_manifest_path):
    """With metadata_only=True the cut metadata is accurate but the recording is a placeholder."""
    cut_set = CutSet(LazyNeMoIterator(nemo_offset_manifest_path, metadata_only=True))

    assert len(cut_set) == 4

    for position, cut in enumerate(cut_set):
        # The fixture starts from 1 cut per 1s audio file and splits each into
        # 0.5s windows, giving 4 cuts in total: those at even positions start
        # at 0s and those at odd positions start at 0.5s.
        starts_at_zero = position % 2 == 0
        expected_start = 0.0 if starts_at_zero else 0.5

        # Cut-level metadata.
        assert isinstance(cut, MonoCut)
        assert cut.start == expected_start
        assert cut.duration == 0.5
        assert cut.num_channels == 1
        assert cut.sampling_rate == 16000
        assert cut.num_samples == 8000

        # In metadata-only mode the Recording contents cannot be verified against
        # the file: its metadata may be wrong (while the Cut's is right), and in
        # exchange no I/O is needed to inspect the audio.
        assert cut.has_recording
        rec = cut.recording
        assert isinstance(rec, Recording)
        if starts_at_zero:
            expected_rec_duration, expected_rec_samples = 0.5, 8000
        else:
            expected_rec_duration, expected_rec_samples = 1.0, 16000
        assert rec.duration == expected_rec_duration
        assert rec.num_samples == expected_rec_samples
        assert rec.num_channels == 1
        assert len(rec.sources) == 1
        src = rec.sources[0]
        assert isinstance(src, AudioSource)
        assert src.type == "dummy"

        # The placeholder source cannot be read.
        with pytest.raises(AssertionError):
            cut.load_audio()

        # Exactly one supervision, positioned relative to the cut.
        assert len(cut.supervisions) == 1
        sup = cut.supervisions[0]
        assert isinstance(sup, SupervisionSegment)
        assert sup.start == 0
        assert sup.duration == 0.5
        assert sup.channel == 0
        assert sup.text == "irrelevant"
        assert sup.language == "en"

0 comments on commit 18bae50

Please sign in to comment.