From 936416ea99cbf419e314123d671d6f5e7f5efc58 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Thu, 31 Mar 2022 12:23:26 +0100
Subject: [PATCH 01/11] add LibriSpeechFineTune dataset

---
 torchaudio/datasets/librispeech_finetune.py | 100 ++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 torchaudio/datasets/librispeech_finetune.py

diff --git a/torchaudio/datasets/librispeech_finetune.py b/torchaudio/datasets/librispeech_finetune.py
new file mode 100644
index 0000000000..951e23d2ea
--- /dev/null
+++ b/torchaudio/datasets/librispeech_finetune.py
@@ -0,0 +1,100 @@
+import os
+from pathlib import Path
+from typing import Tuple, Union
+
+import torchaudio
+from torch import Tensor
+from torch.hub import download_url_to_file
+from torch.utils.data import Dataset
+from torchaudio.datasets.utils import (
+    extract_archive,
+)
+
+
+FOLDER_IN_ARCHIVE = "librispeech_finetuning"
+URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
+CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
+
+
+def _get_files(path, split, _ext_audio):
+    if split == "10min":
+        files = sorted(str(p) for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio))
+    elif split in ["1h", "10h"]:
+        files = [str(p) for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)]
+        if split == "10h":
+            files += [str(p) for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)]
+        files = sorted(files)
+    else:
+        raise ValueError(f"Unsupported split value. Found {split}.")
+    return files
+
+
+def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, int, str, int, int, int]:
+    fileid = os.path.basename(file_path)
+    path = os.path.dirname(file_path)
+    speaker_id, chapter_id, utterance_id = fileid.replace(ext_audio, "").split("-")
+
+    file_text = speaker_id + "-" + chapter_id + ext_txt
+    file_text = os.path.join(path, file_text)
+
+    fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id
+    file_audio = fileid_audio + ext_audio
+    file_audio = os.path.join(path, file_audio)
+
+    # Load audio
+    waveform, sample_rate = torchaudio.load(file_audio)
+
+    # Load text
+    with open(file_text) as ft:
+        for line in ft:
+            fileid_text, transcript = line.strip().split(" ", 1)
+            if fileid_audio == fileid_text:
+                break
+        else:
+            # Translation not found
+            raise FileNotFoundError("Translation not found for " + fileid_audio)
+
+    return (
+        waveform,
+        sample_rate,
+        transcript,
+        int(speaker_id),
+        int(chapter_id),
+        int(utterance_id),
+    )
+
+
+class LibriSpeechFineTune(Dataset):
+    _ext_txt = ".trans.txt"
+    _ext_audio = ".flac"
+
+    def __init__(
+        self,
+        root: Union[str, Path],
+        split: str,
+        folder_in_archive: str = FOLDER_IN_ARCHIVE,
+        download: bool = False,
+    ) -> None:
+        root = os.fspath(root)
+        self._path = os.path.join(root, folder_in_archive)
+        archive = os.path.join(root, FOLDER_IN_ARCHIVE + ".tgz")
+        if download:
+            if not os.path.isdir(self._path):
+                if not os.path.isfile(archive):
+                    download_url_to_file(URL, archive, hash_prefix=CHECKSUM)
+                extract_archive(archive)
+        self._files = _get_files(self._path, split, self._ext_audio)
+
+    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
+        """Load the n-th sample from the dataset.
+        Args:
+            n (int): The index of the sample to be loaded
+        Returns:
+            (Tensor, int, str, int, int, int):
+            ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
+        """
+        file_path = self._files[n]
+        return _load_item(file_path, self._ext_audio, self._ext_txt)
+
+    def __len__(self) -> int:
+        return len(self._files)

From 06586f5673828a23199c7822e3a7ca7917267bd9 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Mon, 11 Apr 2022 17:11:01 +0100
Subject: [PATCH 02/11] address comments, add unit test

---
 docs/source/datasets.rst                      |   8 ++
 .../datasets/librispeechfintune_test.py       | 115 ++++++++++++++++++
 torchaudio/datasets/__init__.py               |   2 +
 torchaudio/datasets/librispeech_finetune.py   |  11 +-
 4 files changed, 130 insertions(+), 6 deletions(-)
 create mode 100644 test/torchaudio_unittest/datasets/librispeechfintune_test.py

diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
index 50dbdac554..df99f24bc7 100644
--- a/docs/source/datasets.rst
+++ b/docs/source/datasets.rst
@@ -66,6 +66,14 @@ LIBRISPEECH
   :special-members: __getitem__
 
 
+LibriSpeechFineTune
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: LibriSpeechFineTune
+  :members:
+  :special-members: __getitem__
+
+
 LIBRITTS
 ~~~~~~~~
 
diff --git a/test/torchaudio_unittest/datasets/librispeechfintune_test.py b/test/torchaudio_unittest/datasets/librispeechfintune_test.py
new file mode 100644
index 0000000000..ad2177d76f
--- /dev/null
+++ b/test/torchaudio_unittest/datasets/librispeechfintune_test.py
@@ -0,0 +1,115 @@
+import os
+
+from torchaudio.datasets import librispeech_finetune
+from torchaudio_unittest.common_utils import (
+    TempDirMixin,
+    TorchaudioTestCase,
+    get_whitenoise,
+    save_wav,
+)
+
+
+# Used to generate a unique transcript for each dummy audio file
+_NUMBERS = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"]
+
+
+def _save_sample(dataset_dir):
+    mocked_data = []
+    sample_rate = 16000  # 16kHz
+    seed = 0
+    for subset in ["clean", "other"]:
+        subset_dir = os.path.join(dataset_dir, subset)
+        os.makedirs(subset_dir, exist_ok=True)
+
+        for speaker_id in range(5):
+            speaker_path = os.path.join(subset_dir, str(speaker_id))
+            os.makedirs(speaker_path, exist_ok=True)
+
+            for chapter_id in range(3):
+                chapter_path = os.path.join(speaker_path, str(chapter_id))
+                os.makedirs(chapter_path, exist_ok=True)
+                trans_content = []
+
+                for utterance_id in range(3):
+                    filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac"
+                    path = os.path.join(chapter_path, filename)
+
+                    transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]])
+                    trans_content.append(f"{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}")
+
+                    data = get_whitenoise(
+                        sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed
+                    )
+                    save_wav(path, data, sample_rate)
+                    print(path)
+                    sample = (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
+                    mocked_data.append(sample)
+
+                    seed += 1
+
+                trans_filename = f"{speaker_id}-{chapter_id}.trans.txt"
+                trans_path = os.path.join(chapter_path, trans_filename)
+                with open(trans_path, "w") as f:
+                    f.write("\n".join(trans_content))
+    return mocked_data
+
+
+def get_mock_dataset(root_dir):
+    """
+    root_dir: directory to the mocked dataset
+    """
+    mocked_data_10min, mocked_data_1h, mocked_data_10h = [], [], []
+    dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", "0")
+    os.makedirs(dataset_dir, exist_ok=True)
+    mocked_data_10min = _save_sample(dataset_dir)
+    mocked_data_1h += mocked_data_10min
+    for i in range(1, 6):
+        dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", str(i))
+        os.makedirs(dataset_dir, exist_ok=True)
+        mocked_data_1h += _save_sample(dataset_dir)
+    mocked_data_10h += mocked_data_1h
+
+    dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "9h")
+    os.makedirs(dataset_dir, exist_ok=True)
+    mocked_data_10h += _save_sample(dataset_dir)
+
+    return mocked_data_10min, mocked_data_1h, mocked_data_10h
+
+
+class TestLibriSpeechFineTune(TempDirMixin, TorchaudioTestCase):
+    backend = "default"
+
+    root_dir = None
+    samples_10min = []
+    samples_1h = []
+    samples_10h = []
+
+    @classmethod
+    def setUpClass(cls):
+        cls.root_dir = cls.get_base_temp_dir()
+        (cls.samples_10min, cls.samples_1h, cls.samples_10h) = get_mock_dataset(cls.root_dir)
+
+    def _test_librispeech(self, dataset, samples):
+        num_samples = 0
+        for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
+            self.assertEqual(data, samples[i][0], atol=5e-5, rtol=1e-8)
+            assert sample_rate == samples[i][1]
+            assert transcript == samples[i][2]
+            assert speaker_id == samples[i][3]
+            assert chapter_id == samples[i][4]
+            assert utterance_id == samples[i][5]
+            num_samples += 1
+
+        assert num_samples == len(samples)
+
+    def test_librispeech_10min(self):
+        dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10min")
+        self._test_librispeech(dataset, self.samples_10min)
+
+    def test_librispeech_1h(self):
+        dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="1h")
+        self._test_librispeech(dataset, self.samples_1h)
+
+    def test_librispeech_10h(self):
+        dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10h")
+        self._test_librispeech(dataset, self.samples_10h)
diff --git a/torchaudio/datasets/__init__.py b/torchaudio/datasets/__init__.py
index 75136189e0..d3cc648d51 100644
--- a/torchaudio/datasets/__init__.py
+++ b/torchaudio/datasets/__init__.py
@@ -5,6 +5,7 @@
 from .gtzan import GTZAN
 from .librimix import LibriMix
 from .librispeech import LIBRISPEECH
+from .librispeech_finetune import LibriSpeechFineTune
 from .libritts import LIBRITTS
 from .ljspeech import LJSPEECH
 from .quesst14 import QUESST14
@@ -17,6 +18,7 @@
 __all__ = [
     "COMMONVOICE",
     "LIBRISPEECH",
+    "LibriSpeechFineTune",
     "SPEECHCOMMANDS",
     "VCTK_092",
     "DR_VCTK",
diff --git a/torchaudio/datasets/librispeech_finetune.py b/torchaudio/datasets/librispeech_finetune.py
index 951e23d2ea..bdcc0f3dfd 100644
--- a/torchaudio/datasets/librispeech_finetune.py
+++ b/torchaudio/datasets/librispeech_finetune.py
@@ -11,9 +11,8 @@
 )
 
 
-FOLDER_IN_ARCHIVE = "librispeech_finetuning"
-URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
-CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
+_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
+_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
 
 
 def _get_files(path, split, _ext_audio):
@@ -72,16 +71,16 @@ def __init__(
         self,
         root: Union[str, Path],
         split: str,
-        folder_in_archive: str = FOLDER_IN_ARCHIVE,
+        folder_in_archive: str = "librispeech_finetuning",
         download: bool = False,
     ) -> None:
         root = os.fspath(root)
         self._path = os.path.join(root, folder_in_archive)
-        archive = os.path.join(root, FOLDER_IN_ARCHIVE + ".tgz")
+        archive = os.path.join(root, folder_in_archive + ".tgz")
         if download:
             if not os.path.isdir(self._path):
                 if not os.path.isfile(archive):
-                    download_url_to_file(URL, archive, hash_prefix=CHECKSUM)
+                    download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
                 extract_archive(archive)
         self._files = _get_files(self._path, split, self._ext_audio)
 

From ff7cf0bac095ec73b57baf7548b0eacdeae1184a Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Fri, 22 Apr 2022 14:26:22 +0100
Subject: [PATCH 03/11] rename LibriSpeechFineTune to LibriLightLimited

---
 docs/source/datasets.rst                               |  6 +++---
 ...speechfintune_test.py => librilightlimited_test.py} | 10 +++++-----
 torchaudio/datasets/__init__.py                        |  4 ++--
 .../{librispeech_finetune.py => libirlight_limited.py} |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)
 rename test/torchaudio_unittest/datasets/{librispeechfintune_test.py => librilightlimited_test.py} (91%)
 rename torchaudio/datasets/{librispeech_finetune.py => libirlight_limited.py} (98%)

diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
index df99f24bc7..2e8265ef17 100644
--- a/docs/source/datasets.rst
+++ b/docs/source/datasets.rst
@@ -66,10 +66,10 @@ LIBRISPEECH
   :special-members: __getitem__
 
 
-LibriSpeechFineTune
-~~~~~~~~~~~~~~~~~~~
+LibriLightLimited
+~~~~~~~~~~~~~~~~~
 
-.. autoclass:: LibriSpeechFineTune
+.. autoclass:: LibriLightLimited
   :members:
   :special-members: __getitem__
 
diff --git a/test/torchaudio_unittest/datasets/librispeechfintune_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py
similarity index 91%
rename from test/torchaudio_unittest/datasets/librispeechfintune_test.py
rename to test/torchaudio_unittest/datasets/librilightlimited_test.py
index ad2177d76f..665a66d6d4 100644
--- a/test/torchaudio_unittest/datasets/librispeechfintune_test.py
+++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py
@@ -1,6 +1,6 @@
 import os
 
-from torchaudio.datasets import librispeech_finetune
+from torchaudio.datasets import librilight_limited
 from torchaudio_unittest.common_utils import (
     TempDirMixin,
     TorchaudioTestCase,
@@ -76,7 +76,7 @@ def get_mock_dataset(root_dir):
     return mocked_data_10min, mocked_data_1h, mocked_data_10h
 
 
-class TestLibriSpeechFineTune(TempDirMixin, TorchaudioTestCase):
+class TestLibriLightLimited(TempDirMixin, TorchaudioTestCase):
     backend = "default"
 
     root_dir = None
@@ -103,13 +103,13 @@ def _test_librispeech(self, dataset, samples):
         assert num_samples == len(samples)
 
     def test_librispeech_10min(self):
-        dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10min")
+        dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10min")
         self._test_librispeech(dataset, self.samples_10min)
 
     def test_librispeech_1h(self):
-        dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="1h")
+        dataset = librilight_limited.LibriLightLimited(self.root_dir, split="1h")
         self._test_librispeech(dataset, self.samples_1h)
 
     def test_librispeech_10h(self):
-        dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10h")
+        dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10h")
         self._test_librispeech(dataset, self.samples_10h)
diff --git a/torchaudio/datasets/__init__.py b/torchaudio/datasets/__init__.py
index d3cc648d51..c18d8a4c1e 100644
--- a/torchaudio/datasets/__init__.py
+++ b/torchaudio/datasets/__init__.py
@@ -5,7 +5,7 @@
 from .gtzan import GTZAN
 from .librimix import LibriMix
 from .librispeech import LIBRISPEECH
-from .librispeech_finetune import LibriSpeechFineTune
+from .librilight_limited import LibriLightLimited
 from .libritts import LIBRITTS
 from .ljspeech import LJSPEECH
 from .quesst14 import QUESST14
@@ -18,7 +18,7 @@
 __all__ = [
     "COMMONVOICE",
     "LIBRISPEECH",
-    "LibriSpeechFineTune",
+    "LibriLightLimited",
     "SPEECHCOMMANDS",
     "VCTK_092",
     "DR_VCTK",
diff --git a/torchaudio/datasets/librispeech_finetune.py b/torchaudio/datasets/libirlight_limited.py
similarity index 98%
rename from torchaudio/datasets/librispeech_finetune.py
rename to torchaudio/datasets/libirlight_limited.py
index bdcc0f3dfd..c2bf48f8ed 100644
--- a/torchaudio/datasets/librispeech_finetune.py
+++ b/torchaudio/datasets/libirlight_limited.py
@@ -63,7 +63,7 @@ def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, in
     )
 
 
-class LibriSpeechFineTune(Dataset):
+class LibriLightLimited(Dataset):
     _ext_txt = ".trans.txt"
     _ext_audio = ".flac"
 

From 22cb38199cc40739c0d32f8792be043cfcd04ad4 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Fri, 22 Apr 2022 14:32:13 +0100
Subject: [PATCH 04/11] fix misspelled module filename (libirlight -> librilight) and import order

---
 torchaudio/datasets/__init__.py                                 | 2 +-
 .../datasets/{libirlight_limited.py => librilight_limited.py}   | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename torchaudio/datasets/{libirlight_limited.py => librilight_limited.py} (100%)

diff --git a/torchaudio/datasets/__init__.py b/torchaudio/datasets/__init__.py
index c18d8a4c1e..c5946e8096 100644
--- a/torchaudio/datasets/__init__.py
+++ b/torchaudio/datasets/__init__.py
@@ -3,9 +3,9 @@
 from .commonvoice import COMMONVOICE
 from .dr_vctk import DR_VCTK
 from .gtzan import GTZAN
+from .librilight_limited import LibriLightLimited
 from .librimix import LibriMix
 from .librispeech import LIBRISPEECH
-from .librilight_limited import LibriLightLimited
 from .libritts import LIBRITTS
 from .ljspeech import LJSPEECH
 from .quesst14 import QUESST14
diff --git a/torchaudio/datasets/libirlight_limited.py b/torchaudio/datasets/librilight_limited.py
similarity index 100%
rename from torchaudio/datasets/libirlight_limited.py
rename to torchaudio/datasets/librilight_limited.py

From 73978b4eeade4d05c0ae0251ab12ffdee6bcc4f9 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Tue, 26 Apr 2022 12:46:13 +0100
Subject: [PATCH 05/11] address comment

---
 .../datasets/librilightlimited_test.py        | 79 +++++++++----------
 torchaudio/datasets/librilight_limited.py     | 30 ++++---
 2 files changed, 59 insertions(+), 50 deletions(-)

diff --git a/test/torchaudio_unittest/datasets/librilightlimited_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py
index 665a66d6d4..ebc179b386 100644
--- a/test/torchaudio_unittest/datasets/librilightlimited_test.py
+++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py
@@ -13,65 +13,64 @@
 _NUMBERS = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"]
 
 
-def _save_sample(dataset_dir):
+def _save_sample(file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed):
+    filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac"
+    path = os.path.join(file_path, filename)
+    data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed)
+    transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]])
+    save_wav(path, data, sample_rate)
+    sample = (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
+    return sample
+
+
+def get_mock_dataset(dataset_dir: str):
+    """Create mocked dataset for a sub directory.
+
+    Args:
+        dataset_dir (str): the path of the sub directory.
+        The structure is: audio_type/speaker_id/chapter_id/filename.flac
+    """
     mocked_data = []
     sample_rate = 16000  # 16kHz
     seed = 0
-    for subset in ["clean", "other"]:
-        subset_dir = os.path.join(dataset_dir, subset)
-        os.makedirs(subset_dir, exist_ok=True)
-
+    for audio_type in ["clean", "other"]:
         for speaker_id in range(5):
-            speaker_path = os.path.join(subset_dir, str(speaker_id))
-            os.makedirs(speaker_path, exist_ok=True)
-
             for chapter_id in range(3):
-                chapter_path = os.path.join(speaker_path, str(chapter_id))
-                os.makedirs(chapter_path, exist_ok=True)
+                file_path = os.path.join(dataset_dir, audio_type, str(speaker_id), str(chapter_id))
+                os.makedirs(file_path, exist_ok=True)
                 trans_content = []
-
                 for utterance_id in range(3):
-                    filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac"
-                    path = os.path.join(chapter_path, filename)
-
-                    transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]])
-                    trans_content.append(f"{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}")
-
-                    data = get_whitenoise(
-                        sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed
+                    sample = _save_sample(
+                        file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed
                     )
-                    save_wav(path, data, sample_rate)
-                    print(path)
-                    sample = (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
+                    trans_content.append(f"{sample[3]}-{sample[4]}-{sample[5]:04d} {sample[2]}")
                     mocked_data.append(sample)
-
                     seed += 1
-
                 trans_filename = f"{speaker_id}-{chapter_id}.trans.txt"
-                trans_path = os.path.join(chapter_path, trans_filename)
+                trans_path = os.path.join(file_path, trans_filename)
                 with open(trans_path, "w") as f:
                     f.write("\n".join(trans_content))
     return mocked_data
 
 
-def get_mock_dataset(root_dir):
+def get_mock_datasets(root_dir):
     """
     root_dir: directory to the mocked dataset
     """
     mocked_data_10min, mocked_data_1h, mocked_data_10h = [], [], []
     dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", "0")
     os.makedirs(dataset_dir, exist_ok=True)
-    mocked_data_10min = _save_sample(dataset_dir)
+    mocked_data_10min = get_mock_dataset(dataset_dir)
     mocked_data_1h += mocked_data_10min
     for i in range(1, 6):
         dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", str(i))
         os.makedirs(dataset_dir, exist_ok=True)
-        mocked_data_1h += _save_sample(dataset_dir)
+        mocked_data_1h += get_mock_dataset(dataset_dir)
     mocked_data_10h += mocked_data_1h
 
     dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "9h")
     os.makedirs(dataset_dir, exist_ok=True)
-    mocked_data_10h += _save_sample(dataset_dir)
+    mocked_data_10h += get_mock_dataset(dataset_dir)
 
     return mocked_data_10min, mocked_data_1h, mocked_data_10h
 
@@ -87,9 +86,9 @@ class TestLibriLightLimited(TempDirMixin, TorchaudioTestCase):
     @classmethod
     def setUpClass(cls):
         cls.root_dir = cls.get_base_temp_dir()
-        (cls.samples_10min, cls.samples_1h, cls.samples_10h) = get_mock_dataset(cls.root_dir)
+        (cls.samples_10min, cls.samples_1h, cls.samples_10h) = get_mock_datasets(cls.root_dir)
 
-    def _test_librispeech(self, dataset, samples):
+    def _test_librilightlimited(self, dataset, samples):
         num_samples = 0
         for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
             self.assertEqual(data, samples[i][0], atol=5e-5, rtol=1e-8)
@@ -102,14 +101,14 @@ def _test_librispeech(self, dataset, samples):
 
         assert num_samples == len(samples)
 
-    def test_librispeech_10min(self):
-        dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10min")
-        self._test_librispeech(dataset, self.samples_10min)
+    def test_librilightlimited_10min(self):
+        dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="10min")
+        self._test_librilightlimited(dataset, self.samples_10min)
 
-    def test_librispeech_1h(self):
-        dataset = librilight_limited.LibriLightLimited(self.root_dir, split="1h")
-        self._test_librispeech(dataset, self.samples_1h)
+    def test_librilightlimited_1h(self):
+        dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="1h")
+        self._test_librilightlimited(dataset, self.samples_1h)
 
-    def test_librispeech_10h(self):
-        dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10h")
-        self._test_librispeech(dataset, self.samples_10h)
+    def test_librilightlimited_10h(self):
+        dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="10h")
+        self._test_librilightlimited(dataset, self.samples_10h)
diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py
index c2bf48f8ed..a765f69453 100644
--- a/torchaudio/datasets/librilight_limited.py
+++ b/torchaudio/datasets/librilight_limited.py
@@ -15,16 +15,16 @@
 _CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
 
 
-def _get_files(path, split, _ext_audio):
-    if split == "10min":
+def _get_files(path, subset, _ext_audio):
+    if subset == "10min":
         files = sorted(str(p) for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio))
-    elif split in ["1h", "10h"]:
+    elif subset in ["1h", "10h"]:
         files = [str(p) for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)]
-        if split == "10h":
+        if subset == "10h":
             files += [str(p) for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)]
         files = sorted(files)
     else:
-        raise ValueError(f"Unsupported split value. Found {split}.")
+        raise ValueError(f"Unsupported subset value. Found {subset}.")
     return files
 
 
@@ -64,25 +64,35 @@ def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, in
 
 
 class LibriLightLimited(Dataset):
+    """Create a Dataset for LibriLightLimited, which is the supervised subset of
+        LibriLight dataset.
+
+    Args:
+        root (str or Path): Path to the directory where the dataset is found or downloaded.
+        subset (str, optional): The subset to use. Options: [``10min`, ``1h``, ``10h``]
+            (Default: ``10min``).
+        download (bool, optional):
+            Whether to download the dataset if it is not found at root path. (default: ``False``).
+    """
+
     _ext_txt = ".trans.txt"
     _ext_audio = ".flac"
 
     def __init__(
         self,
         root: Union[str, Path],
-        split: str,
-        folder_in_archive: str = "librispeech_finetuning",
+        subset: str = "10min",
         download: bool = False,
     ) -> None:
         root = os.fspath(root)
-        self._path = os.path.join(root, folder_in_archive)
-        archive = os.path.join(root, folder_in_archive + ".tgz")
+        self._path = os.path.join(root, "librispeech_finetuning")
+        archive = os.path.join(root, "librispeech_finetuning" + ".tgz")
         if download:
             if not os.path.isdir(self._path):
                 if not os.path.isfile(archive):
                     download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
                 extract_archive(archive)
-        self._files = _get_files(self._path, split, self._ext_audio)
+        self._files = _get_files(self._path, subset, self._ext_audio)
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
         """Load the n-th sample from the dataset.

From a7ecbe050022af6bc8286d21721c8e28a23a03f4 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Tue, 26 Apr 2022 12:49:47 +0100
Subject: [PATCH 06/11] validate `subset` argument in LibriLightLimited.__init__

---
 torchaudio/datasets/librilight_limited.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py
index a765f69453..ead2933730 100644
--- a/torchaudio/datasets/librilight_limited.py
+++ b/torchaudio/datasets/librilight_limited.py
@@ -84,6 +84,8 @@ def __init__(
         subset: str = "10min",
         download: bool = False,
     ) -> None:
+        assert subset in ["10min", "1h", "10h"], "`subset` must be one of ['10min', '1h', '10h']"
+
         root = os.fspath(root)
         self._path = os.path.join(root, "librispeech_finetuning")
         archive = os.path.join(root, "librispeech_finetuning" + ".tgz")

From f5b749132ad75e738c46b8e4b57c092294eaacab Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Tue, 26 Apr 2022 12:56:07 +0100
Subject: [PATCH 07/11] reformat _save_sample call in test onto a single line

---
 test/torchaudio_unittest/datasets/librilightlimited_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/torchaudio_unittest/datasets/librilightlimited_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py
index ebc179b386..26e65e7e2c 100644
--- a/test/torchaudio_unittest/datasets/librilightlimited_test.py
+++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py
@@ -40,9 +40,7 @@ def get_mock_dataset(dataset_dir: str):
                 os.makedirs(file_path, exist_ok=True)
                 trans_content = []
                 for utterance_id in range(3):
-                    sample = _save_sample(
-                        file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed
-                    )
+                    sample = _save_sample(file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed)
                     trans_content.append(f"{sample[3]}-{sample[4]}-{sample[5]:04d} {sample[2]}")
                     mocked_data.append(sample)
                     seed += 1

From e096ee10bc142e00ef2d8cdb4e4c7a711d9d9033 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Wed, 11 May 2022 17:49:28 +0100
Subject: [PATCH 08/11] assert dataset is downloaded to root path

---
 torchaudio/datasets/librilight_limited.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py
index ead2933730..f334d07826 100644
--- a/torchaudio/datasets/librilight_limited.py
+++ b/torchaudio/datasets/librilight_limited.py
@@ -6,9 +6,7 @@
 from torch import Tensor
 from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
-from torchaudio.datasets.utils import (
-    extract_archive,
-)
+from torchaudio.datasets.utils import extract_archive
 
 
 _URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
@@ -89,11 +87,12 @@ def __init__(
         root = os.fspath(root)
         self._path = os.path.join(root, "librispeech_finetuning")
         archive = os.path.join(root, "librispeech_finetuning" + ".tgz")
-        if download:
-            if not os.path.isdir(self._path):
-                if not os.path.isfile(archive):
-                    download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
-                extract_archive(archive)
+        if not os.path.isdir(self._path):
+            if not download:
+                raise RuntimeError("Dataset not found. Please use `download=True` to download")
+            if not os.path.isfile(archive):
+                download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
+            extract_archive(archive)
         self._files = _get_files(self._path, subset, self._ext_audio)
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:

From dc3d9ce73456cc1feb464adb27b94327fe91c365 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Fri, 13 May 2022 22:43:14 +0100
Subject: [PATCH 09/11] address comments

---
 torchaudio/datasets/librilight_limited.py | 77 +++++++++--------------
 1 file changed, 29 insertions(+), 48 deletions(-)

diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py
index f334d07826..7c9447b6ca 100644
--- a/torchaudio/datasets/librilight_limited.py
+++ b/torchaudio/datasets/librilight_limited.py
@@ -1,64 +1,45 @@
 import os
 from pathlib import Path
-from typing import Tuple, Union
+from typing import List, Tuple, Union
 
-import torchaudio
 from torch import Tensor
 from torch.hub import download_url_to_file
 from torch.utils.data import Dataset
+from torchaudio.datasets.librispeech import load_librispeech_item
 from torchaudio.datasets.utils import extract_archive
 
 
+_ARCHIVE_NAME = "librispeech_finetuning"
 _URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
 _CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
 
 
-def _get_files(path, subset, _ext_audio):
+def _get_fileids_paths(path, subset, _ext_audio) -> List[Tuple[str, str]]:
+    """Get the sorted list of ``(path, fileid)`` tuples for the given subset, where
+    ``fileid`` is the audio file name without extension and ``path`` is its directory
+    stepped up past the `speaker_id` and `chapter_id` levels (``../..``), like:
+        {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
+        {root}/{_ARCHIVE_NAME}/9h/[clean, other]
+    """
     if subset == "10min":
-        files = sorted(str(p) for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio))
+        files_paths = [
+            (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem))
+            for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio)
+        ]
     elif subset in ["1h", "10h"]:
-        files = [str(p) for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)]
+        files_paths = [
+            (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem))
+            for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)
+        ]
         if subset == "10h":
-            files += [str(p) for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)]
-        files = sorted(files)
+            files_paths += [
+                (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem))
+                for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)
+            ]
     else:
         raise ValueError(f"Unsupported subset value. Found {subset}.")
-    return files
-
-
-def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, int, str, int, int, int]:
-    fileid = os.path.basename(file_path)
-    path = os.path.dirname(file_path)
-    speaker_id, chapter_id, utterance_id = fileid.replace(ext_audio, "").split("-")
-
-    file_text = speaker_id + "-" + chapter_id + ext_txt
-    file_text = os.path.join(path, file_text)
-
-    fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id
-    file_audio = fileid_audio + ext_audio
-    file_audio = os.path.join(path, file_audio)
-
-    # Load audio
-    waveform, sample_rate = torchaudio.load(file_audio)
-
-    # Load text
-    with open(file_text) as ft:
-        for line in ft:
-            fileid_text, transcript = line.strip().split(" ", 1)
-            if fileid_audio == fileid_text:
-                break
-        else:
-            # Translation not found
-            raise FileNotFoundError("Translation not found for " + fileid_audio)
-
-    return (
-        waveform,
-        sample_rate,
-        transcript,
-        int(speaker_id),
-        int(chapter_id),
-        int(utterance_id),
-    )
+    files_paths = sorted(files_paths, key=lambda x: x[0] + x[1])
+    return files_paths
 
 
 class LibriLightLimited(Dataset):
@@ -85,15 +66,15 @@ def __init__(
         assert subset in ["10min", "1h", "10h"], "`subset` must be one of ['10min', '1h', '10h']"
 
         root = os.fspath(root)
-        self._path = os.path.join(root, "librispeech_finetuning")
-        archive = os.path.join(root, "librispeech_finetuning" + ".tgz")
+        self._path = os.path.join(root, _ARCHIVE_NAME)
+        archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz")
         if not os.path.isdir(self._path):
             if not download:
                 raise RuntimeError("Dataset not found. Please use `download=True` to download")
             if not os.path.isfile(archive):
                 download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
             extract_archive(archive)
-        self._files = _get_files(self._path, subset, self._ext_audio)
+        self._fileids_paths = _get_fileids_paths(self._path, subset, self._ext_audio)
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
         """Load the n-th sample from the dataset.
@@ -103,8 +84,8 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
             (Tensor, int, str, int, int, int):
             ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
         """
-        file_path = self._files[n]
-        return _load_item(file_path, self._ext_audio, self._ext_txt)
+        file_path, fileid = self._fileids_paths[n]
+        return load_librispeech_item(fileid, file_path, self._ext_audio, self._ext_txt)
 
     def __len__(self) -> int:
         return len(self._files)

From e3faf5913bbc986fe2a32a08295bdcb5954ea174 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Fri, 13 May 2022 22:45:27 +0100
Subject: [PATCH 10/11] fix __len__ to use the renamed _fileids_paths attribute

---
 torchaudio/datasets/librilight_limited.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py
index 7c9447b6ca..548e465138 100644
--- a/torchaudio/datasets/librilight_limited.py
+++ b/torchaudio/datasets/librilight_limited.py
@@ -88,4 +88,4 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
         return load_librispeech_item(fileid, file_path, self._ext_audio, self._ext_txt)
 
     def __len__(self) -> int:
-        return len(self._files)
+        return len(self._fileids_paths)

From 557064ba28059f6035c7ac308ca47f0f9324c488 Mon Sep 17 00:00:00 2001
From: Zhaoheng Ni <zni@fb.com>
Date: Mon, 23 May 2022 09:04:37 +0100
Subject: [PATCH 11/11] fix lint: reorder imports in librilightlimited_test

---
 test/torchaudio_unittest/datasets/librilightlimited_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/torchaudio_unittest/datasets/librilightlimited_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py
index 26e65e7e2c..9c30955652 100644
--- a/test/torchaudio_unittest/datasets/librilightlimited_test.py
+++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py
@@ -2,10 +2,10 @@
 
 from torchaudio.datasets import librilight_limited
 from torchaudio_unittest.common_utils import (
-    TempDirMixin,
-    TorchaudioTestCase,
     get_whitenoise,
     save_wav,
+    TempDirMixin,
+    TorchaudioTestCase,
 )