Add tests for LazyNeMoIterator and fix case with metadata_only=True a…

…nd offsets in manifest (NVIDIA#10198) * Add tests for LazyNeMoIterator and fix case with metadata_only=True and offsets in manifest Signed-off-by: Piotr Żelasko <petezor@gmail.com> * Address code review Signed-off-by: Piotr Żelasko <petezor@gmail.com> * fix tests Signed-off-by: Piotr Żelasko <petezor@gmail.com> * fix tests Signed-off-by: Piotr Żelasko <petezor@gmail.com> --------- Signed-off-by: Piotr Żelasko <petezor@gmail.com>
WoodieDudy · Aug 26, 2024 · 18bae50 · 18bae50
1 parent 8482971
commit 18bae50
Show file tree

Hide file tree

Showing 2 changed files with 228 additions and 15 deletions.
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -24,7 +24,7 @@
 import lhotse.serialization
 import soundfile
 from cytoolz import groupby
-from lhotse import AudioSource, Recording, SupervisionSegment
+from lhotse import AudioSource, MonoCut, Recording, SupervisionSegment
 from lhotse.audio.backend import LibsndfileBackend
 from lhotse.cut import Cut
 from lhotse.dataset.dataloading import resolve_seed
@@ -112,11 +112,9 @@ def __iter__(self) -> Generator[Cut, None, None]:
             audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path))
             duration = data.pop("duration")
             offset = data.pop("offset", None)
-            recording = self._create_recording(audio_path, duration, data.pop("sampling_rate", None))
-            cut = recording.to_cut()
-            if offset is not None:
-                cut = cut.truncate(offset=offset, duration=duration, preserve_id=True)
-                cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}"
+            cut = self._create_cut(
+                audio_path=audio_path, offset=offset, duration=duration, sampling_rate=data.pop("sampling_rate", None)
+            )
             # Note that start=0 and not start=offset because supervision's start if relative to the
             # start of the cut; and cut.start is already set to offset
             cut.supervisions.append(
@@ -140,6 +138,42 @@ def __len__(self) -> int:
     def __add__(self, other):
         return LazyIteratorChain(self, other)
 
+    def _create_cut(
+        self,
+        audio_path: str,
+        offset: float | None,  # None when the manifest entry has no "offset" field
+        duration: float,
+        sampling_rate: int | None = None,
+    ) -> Cut:  # Build the Cut for one manifest entry, backed by a real or a placeholder Recording.
+        if not self.metadata_only:
+            recording = self._create_recording(audio_path, duration, sampling_rate)
+            cut = recording.to_cut()
+            if offset is not None:
+                cut = cut.truncate(offset=offset, duration=duration, preserve_id=True)
+                cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}"
+        else:
+            # Only metadata requested: the Cut's start/duration are taken from the manifest values,
+            # while the Recording is a placeholder (fake sampling rate, "dummy" source) to avoid
+            # incurring IO penalty (note that Lhotse manifests contain more information than
+            # NeMo manifests, so for actual dataloading we have to fill it using the audio file).
+            sr = ifnone(sampling_rate, 16000)  # fake sampling rate
+            offset = ifnone(offset, 0.0)  # a missing offset means the cut starts at 0
+            cut = MonoCut(
+                id=audio_path,  # NOTE(review): unlike above, no offset suffix; IDs repeat per file
+                start=offset,
+                duration=duration,
+                channel=0,
+                supervisions=[],
+                recording=Recording(
+                    id=audio_path,
+                    sources=[AudioSource(type="dummy", channels=[0], source="")],
+                    sampling_rate=sr,
+                    duration=offset + duration,  # placeholder: recording ends where this cut ends
+                    num_samples=compute_num_samples(offset + duration, sr),
+                ),
+            )
+        return cut
+
     def _create_recording(
         self,
         audio_path: str,
@@ -156,15 +190,6 @@ def _create_recording(
                 duration=duration,
                 channel_ids=[0],
             )
-        elif self.metadata_only:
-            return Recording(
-                id=audio_path,
-                sources=[AudioSource(type="file", channels=[0], source=audio_path)],
-                sampling_rate=-1,
-                num_samples=-1,
-                duration=duration,
-                channel_ids=[0],
-            )
         else:
             return Recording.from_file(audio_path)
 

diff --git a/tests/collections/common/test_lhotse_nemo_adapters.py b/tests/collections/common/test_lhotse_nemo_adapters.py
@@ -0,0 +1,188 @@
+import numpy as np
+import pytest
+from lhotse import AudioSource, CutSet, MonoCut, Recording, SupervisionSegment
+from lhotse.serialization import save_to_jsonl
+from lhotse.testing.dummies import DummyManifest
+
+from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator
+
+
+@pytest.fixture
+def nemo_manifest_path(tmp_path_factory):
+    """Write a NeMo manifest of 2 utterances of length 1s (no "offset" key) and return its path."""
+    tmpdir = tmp_path_factory.mktemp("nemo_data")
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True).save_audios(tmpdir, progress_bar=False)
+    nemo = []
+    for c in cuts:
+        nemo.append(
+            {
+                "audio_filepath": c.recording.sources[0].source,
+                "text": "irrelevant",
+                "duration": c.duration,
+                "lang": "en",
+            }
+        )
+    p = tmpdir / "nemo_manifest.json"
+    save_to_jsonl(nemo, p)
+    return p
+
+
+def test_lazy_nemo_iterator(nemo_manifest_path):  # default mode, manifest without offsets
+    cuts = CutSet(LazyNeMoIterator(nemo_manifest_path))
+
+    assert len(cuts) == 2
+
+    for c in cuts:
+        assert isinstance(c, MonoCut)
+        assert c.start == 0.0
+        assert c.duration == 1.0
+        assert c.num_channels == 1
+        assert c.sampling_rate == 16000
+        assert c.num_samples == 16000
+
+        assert c.has_recording
+        assert isinstance(c.recording, Recording)
+        assert c.recording.duration == 1.0
+        assert c.recording.num_channels == 1
+        assert c.recording.num_samples == 16000
+        assert len(c.recording.sources) == 1
+        assert isinstance(c.recording.sources[0], AudioSource)
+        assert c.recording.sources[0].type == "file"
+
+        audio = c.load_audio()  # audio must be loadable in the default mode
+        assert isinstance(audio, np.ndarray)
+        assert audio.shape == (1, 16000)
+        assert audio.dtype == np.float32
+
+        assert len(c.supervisions) == 1
+        s = c.supervisions[0]
+        assert isinstance(s, SupervisionSegment)
+        assert s.start == 0
+        assert s.duration == 1
+        assert s.channel == 0
+        assert s.text == "irrelevant"
+        assert s.language == "en"
+
+
+@pytest.fixture
+def nemo_offset_manifest_path(tmp_path_factory):
+    """
+    4 utterances of length 0.5s as a NeMo manifest.
+    They are derived from two audio files of 1s duration, so
+    two of them have offset 0 and the other two have offset 0.5.
+    """
+    tmpdir = tmp_path_factory.mktemp("nemo_data_offset")
+    cuts = (
+        DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True)
+        .save_audios(tmpdir, progress_bar=False)
+        .cut_into_windows(duration=0.5, hop=0.5)
+    )
+    nemo = []
+    for c in cuts:
+        nemo.append(
+            {
+                "audio_filepath": c.recording.sources[0].source,
+                "text": "irrelevant",
+                "offset": c.start,
+                "duration": c.duration,
+                "lang": "en",
+            }
+        )
+    p = tmpdir / "nemo_manifest.json"
+    save_to_jsonl(nemo, p)
+    return p
+
+
+def test_lazy_nemo_iterator_with_offset(nemo_offset_manifest_path):  # default mode, manifest with offsets
+    cuts = CutSet(LazyNeMoIterator(nemo_offset_manifest_path))
+
+    assert len(cuts) == 4
+
+    for idx, c in enumerate(cuts):
+        # Note we originally had 1 cut per 1s audio file.
+        # Then we cut them into 0.5s cuts, so we have 4 cuts in total,
+        # 2 of them start at 0s and the other 2 start at 0.5s.
+        is_even = idx % 2 == 0
+
+        assert isinstance(c, MonoCut)
+        if is_even:
+            assert c.start == 0.0
+        else:
+            assert c.start == 0.5
+        assert c.duration == 0.5
+        assert c.num_channels == 1
+        assert c.sampling_rate == 16000
+        assert c.num_samples == 8000
+
+        assert c.has_recording
+        assert isinstance(c.recording, Recording)
+        assert c.recording.duration == 1.0  # the Recording still describes the whole 1s file
+        assert c.recording.num_channels == 1
+        assert c.recording.num_samples == 16000
+        assert len(c.recording.sources) == 1
+        assert isinstance(c.recording.sources[0], AudioSource)
+        assert c.recording.sources[0].type == "file"
+
+        audio = c.load_audio()  # only the 0.5s window of the cut is expected back
+        assert isinstance(audio, np.ndarray)
+        assert audio.shape == (1, 8000)
+        assert audio.dtype == np.float32
+
+        assert len(c.supervisions) == 1
+        s = c.supervisions[0]
+        assert isinstance(s, SupervisionSegment)
+        assert s.start == 0  # supervision start is checked as 0 for every cut, including those at offset 0.5
+        assert s.duration == 0.5
+        assert s.channel == 0
+        assert s.text == "irrelevant"
+        assert s.language == "en"
+
+
+def test_lazy_nemo_iterator_with_offset_metadata_only(nemo_offset_manifest_path):  # metadata_only=True + offsets
+    cuts = CutSet(LazyNeMoIterator(nemo_offset_manifest_path, metadata_only=True))
+
+    assert len(cuts) == 4
+
+    for idx, c in enumerate(cuts):
+        # Note we originally had 1 cut per 1s audio file.
+        # Then we cut them into 0.5s cuts, so we have 4 cuts in total,
+        # 2 of them start at 0s and the other 2 start at 0.5s.
+        is_even = idx % 2 == 0
+
+        assert isinstance(c, MonoCut)
+        if is_even:
+            assert c.start == 0.0
+        else:
+            assert c.start == 0.5
+        assert c.duration == 0.5
+        assert c.num_channels == 1
+        assert c.sampling_rate == 16000
+        assert c.num_samples == 8000
+
+        # With metadata_only=True we can't actually check what's in the Recording.
+        # The metadata for it may be incorrect (but is correct for the actual Cut),
+        # but we don't have to perform any I/O to read the file for info.
+        assert c.has_recording
+        assert isinstance(c.recording, Recording)
+        if is_even:  # the placeholder Recording's duration is offset + duration of the cut
+            assert c.recording.duration == 0.5
+            assert c.recording.num_samples == 8000
+        else:
+            assert c.recording.duration == 1.0
+            assert c.recording.num_samples == 16000
+        assert c.recording.num_channels == 1
+        assert len(c.recording.sources) == 1
+        assert isinstance(c.recording.sources[0], AudioSource)
+        assert c.recording.sources[0].type == "dummy"
+
+        with pytest.raises(AssertionError):  # loading audio is expected to fail in metadata-only mode
+            c.load_audio()
+
+        assert len(c.supervisions) == 1
+        s = c.supervisions[0]
+        assert isinstance(s, SupervisionSegment)
+        assert s.start == 0
+        assert s.duration == 0.5
+        assert s.channel == 0
+        assert s.text == "irrelevant"
+        assert s.language == "en"