From 936416ea99cbf419e314123d671d6f5e7f5efc58 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Thu, 31 Mar 2022 12:23:26 +0100 Subject: [PATCH 01/11] add LibriSpeechFineTune dataset --- torchaudio/datasets/librispeech_finetune.py | 100 ++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 torchaudio/datasets/librispeech_finetune.py diff --git a/torchaudio/datasets/librispeech_finetune.py b/torchaudio/datasets/librispeech_finetune.py new file mode 100644 index 0000000000..951e23d2ea --- /dev/null +++ b/torchaudio/datasets/librispeech_finetune.py @@ -0,0 +1,100 @@ +import os +from pathlib import Path +from typing import Tuple, Union + +import torchaudio +from torch import Tensor +from torch.hub import download_url_to_file +from torch.utils.data import Dataset +from torchaudio.datasets.utils import ( + extract_archive, +) + + +FOLDER_IN_ARCHIVE = "librispeech_finetuning" +URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz" +CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af" + + +def _get_files(path, split, _ext_audio): + if split == "10min": + files = sorted(str(p) for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio)) + elif split in ["1h", "10h"]: + files = [str(p) for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)] + if split == "10h": + files += [str(p) for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)] + files = sorted(files) + else: + raise ValueError(f"Unsupported split value. 
Found {split}.") + return files + + +def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, int, str, int, int, int]: + fileid = os.path.basename(file_path) + path = os.path.dirname(file_path) + speaker_id, chapter_id, utterance_id = fileid.replace(ext_audio, "").split("-") + + file_text = speaker_id + "-" + chapter_id + ext_txt + file_text = os.path.join(path, file_text) + + fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id + file_audio = fileid_audio + ext_audio + file_audio = os.path.join(path, file_audio) + + # Load audio + waveform, sample_rate = torchaudio.load(file_audio) + + # Load text + with open(file_text) as ft: + for line in ft: + fileid_text, transcript = line.strip().split(" ", 1) + if fileid_audio == fileid_text: + break + else: + # Translation not found + raise FileNotFoundError("Translation not found for " + fileid_audio) + + return ( + waveform, + sample_rate, + transcript, + int(speaker_id), + int(chapter_id), + int(utterance_id), + ) + + +class LibriSpeechFineTune(Dataset): + _ext_txt = ".trans.txt" + _ext_audio = ".flac" + + def __init__( + self, + root: Union[str, Path], + split: str, + folder_in_archive: str = FOLDER_IN_ARCHIVE, + download: bool = False, + ) -> None: + root = os.fspath(root) + self._path = os.path.join(root, folder_in_archive) + archive = os.path.join(root, FOLDER_IN_ARCHIVE + ".tgz") + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + download_url_to_file(URL, archive, hash_prefix=CHECKSUM) + extract_archive(archive) + self._files = _get_files(self._path, split, self._ext_audio) + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. 
+ Args: + n (int): The index of the sample to be loaded + Returns: + (Tensor, int, str, int, int, int): + ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`` + """ + file_path = self._files[n] + return _load_item(file_path, self._ext_audio, self._ext_txt) + + def __len__(self) -> int: + return len(self._files) From 06586f5673828a23199c7822e3a7ca7917267bd9 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Mon, 11 Apr 2022 17:11:01 +0100 Subject: [PATCH 02/11] address comments, add unit test --- docs/source/datasets.rst | 8 ++ .../datasets/librispeechfintune_test.py | 115 ++++++++++++++++++ torchaudio/datasets/__init__.py | 2 + torchaudio/datasets/librispeech_finetune.py | 11 +- 4 files changed, 130 insertions(+), 6 deletions(-) create mode 100644 test/torchaudio_unittest/datasets/librispeechfintune_test.py diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 50dbdac554..df99f24bc7 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -66,6 +66,14 @@ LIBRISPEECH :special-members: __getitem__ +LibriSpeechFineTune +~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: LibriSpeechFineTune + :members: + :special-members: __getitem__ + + LIBRITTS ~~~~~~~~ diff --git a/test/torchaudio_unittest/datasets/librispeechfintune_test.py b/test/torchaudio_unittest/datasets/librispeechfintune_test.py new file mode 100644 index 0000000000..ad2177d76f --- /dev/null +++ b/test/torchaudio_unittest/datasets/librispeechfintune_test.py @@ -0,0 +1,115 @@ +import os + +from torchaudio.datasets import librispeech_finetune +from torchaudio_unittest.common_utils import ( + TempDirMixin, + TorchaudioTestCase, + get_whitenoise, + save_wav, +) + + +# Used to generate a unique transcript for each dummy audio file +_NUMBERS = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"] + + +def _save_sample(dataset_dir): + mocked_data = [] + sample_rate = 16000 # 16kHz + seed = 0 + for subset in ["clean", "other"]: + subset_dir = os.path.join(dataset_dir, subset) + os.makedirs(subset_dir, exist_ok=True) + + for speaker_id in range(5): + speaker_path = os.path.join(subset_dir, str(speaker_id)) + os.makedirs(speaker_path, exist_ok=True) + + for chapter_id in range(3): + chapter_path = os.path.join(speaker_path, str(chapter_id)) + os.makedirs(chapter_path, exist_ok=True) + trans_content = [] + + for utterance_id in range(3): + filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac" + path = os.path.join(chapter_path, filename) + + transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]) + trans_content.append(f"{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}") + + data = get_whitenoise( + sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed + ) + save_wav(path, data, sample_rate) + print(path) + sample = (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) + mocked_data.append(sample) + + seed += 1 + + trans_filename = f"{speaker_id}-{chapter_id}.trans.txt" + trans_path = os.path.join(chapter_path, trans_filename) + with open(trans_path, 
"w") as f: + f.write("\n".join(trans_content)) + return mocked_data + + +def get_mock_dataset(root_dir): + """ + root_dir: directory to the mocked dataset + """ + mocked_data_10min, mocked_data_1h, mocked_data_10h = [], [], [] + dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", "0") + os.makedirs(dataset_dir, exist_ok=True) + mocked_data_10min = _save_sample(dataset_dir) + mocked_data_1h += mocked_data_10min + for i in range(1, 6): + dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", str(i)) + os.makedirs(dataset_dir, exist_ok=True) + mocked_data_1h += _save_sample(dataset_dir) + mocked_data_10h += mocked_data_1h + + dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "9h") + os.makedirs(dataset_dir, exist_ok=True) + mocked_data_10h += _save_sample(dataset_dir) + + return mocked_data_10min, mocked_data_1h, mocked_data_10h + + +class TestLibriSpeechFineTune(TempDirMixin, TorchaudioTestCase): + backend = "default" + + root_dir = None + samples_10min = [] + samples_1h = [] + samples_10h = [] + + @classmethod + def setUpClass(cls): + cls.root_dir = cls.get_base_temp_dir() + (cls.samples_10min, cls.samples_1h, cls.samples_10h) = get_mock_dataset(cls.root_dir) + + def _test_librispeech(self, dataset, samples): + num_samples = 0 + for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset): + self.assertEqual(data, samples[i][0], atol=5e-5, rtol=1e-8) + assert sample_rate == samples[i][1] + assert transcript == samples[i][2] + assert speaker_id == samples[i][3] + assert chapter_id == samples[i][4] + assert utterance_id == samples[i][5] + num_samples += 1 + + assert num_samples == len(samples) + + def test_librispeech_10min(self): + dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10min") + self._test_librispeech(dataset, self.samples_10min) + + def test_librispeech_1h(self): + dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="1h") + 
self._test_librispeech(dataset, self.samples_1h) + + def test_librispeech_10h(self): + dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10h") + self._test_librispeech(dataset, self.samples_10h) diff --git a/torchaudio/datasets/__init__.py b/torchaudio/datasets/__init__.py index 75136189e0..d3cc648d51 100644 --- a/torchaudio/datasets/__init__.py +++ b/torchaudio/datasets/__init__.py @@ -5,6 +5,7 @@ from .gtzan import GTZAN from .librimix import LibriMix from .librispeech import LIBRISPEECH +from .librispeech_finetune import LibriSpeechFineTune from .libritts import LIBRITTS from .ljspeech import LJSPEECH from .quesst14 import QUESST14 @@ -17,6 +18,7 @@ __all__ = [ "COMMONVOICE", "LIBRISPEECH", + "LibriSpeechFineTune", "SPEECHCOMMANDS", "VCTK_092", "DR_VCTK", diff --git a/torchaudio/datasets/librispeech_finetune.py b/torchaudio/datasets/librispeech_finetune.py index 951e23d2ea..bdcc0f3dfd 100644 --- a/torchaudio/datasets/librispeech_finetune.py +++ b/torchaudio/datasets/librispeech_finetune.py @@ -11,9 +11,8 @@ ) -FOLDER_IN_ARCHIVE = "librispeech_finetuning" -URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz" -CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af" +_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz" +_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af" def _get_files(path, split, _ext_audio): @@ -72,16 +71,16 @@ def __init__( self, root: Union[str, Path], split: str, - folder_in_archive: str = FOLDER_IN_ARCHIVE, + folder_in_archive: str = "librispeech_finetuning", download: bool = False, ) -> None: root = os.fspath(root) self._path = os.path.join(root, folder_in_archive) - archive = os.path.join(root, FOLDER_IN_ARCHIVE + ".tgz") + archive = os.path.join(root, folder_in_archive + ".tgz") if download: if not os.path.isdir(self._path): if not os.path.isfile(archive): - download_url_to_file(URL, archive, 
hash_prefix=CHECKSUM) + download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM) extract_archive(archive) self._files = _get_files(self._path, split, self._ext_audio) From ff7cf0bac095ec73b57baf7548b0eacdeae1184a Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Fri, 22 Apr 2022 14:26:22 +0100 Subject: [PATCH 03/11] rename LibriSpeechFineTune to LibriLightLimited --- docs/source/datasets.rst | 6 +++--- ...speechfintune_test.py => librilightlimited_test.py} | 10 +++++----- torchaudio/datasets/__init__.py | 4 ++-- .../{librispeech_finetune.py => libirlight_limited.py} | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) rename test/torchaudio_unittest/datasets/{librispeechfintune_test.py => librilightlimited_test.py} (91%) rename torchaudio/datasets/{librispeech_finetune.py => libirlight_limited.py} (98%) diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index df99f24bc7..2e8265ef17 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -66,10 +66,10 @@ LIBRISPEECH :special-members: __getitem__ -LibriSpeechFineTune -~~~~~~~~~~~~~~~~~~~ +LibriLightLimited +~~~~~~~~~~~~~~~~~ -.. autoclass:: LibriSpeechFineTune +.. 
autoclass:: LibriLightLimited :members: :special-members: __getitem__ diff --git a/test/torchaudio_unittest/datasets/librispeechfintune_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py similarity index 91% rename from test/torchaudio_unittest/datasets/librispeechfintune_test.py rename to test/torchaudio_unittest/datasets/librilightlimited_test.py index ad2177d76f..665a66d6d4 100644 --- a/test/torchaudio_unittest/datasets/librispeechfintune_test.py +++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py @@ -1,6 +1,6 @@ import os -from torchaudio.datasets import librispeech_finetune +from torchaudio.datasets import librilight_limited from torchaudio_unittest.common_utils import ( TempDirMixin, TorchaudioTestCase, @@ -76,7 +76,7 @@ def get_mock_dataset(root_dir): return mocked_data_10min, mocked_data_1h, mocked_data_10h -class TestLibriSpeechFineTune(TempDirMixin, TorchaudioTestCase): +class TestLibriLightLimited(TempDirMixin, TorchaudioTestCase): backend = "default" root_dir = None @@ -103,13 +103,13 @@ def _test_librispeech(self, dataset, samples): assert num_samples == len(samples) def test_librispeech_10min(self): - dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10min") + dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10min") self._test_librispeech(dataset, self.samples_10min) def test_librispeech_1h(self): - dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="1h") + dataset = librilight_limited.LibriLightLimited(self.root_dir, split="1h") self._test_librispeech(dataset, self.samples_1h) def test_librispeech_10h(self): - dataset = librispeech_finetune.LibriSpeechFineTune(self.root_dir, split="10h") + dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10h") self._test_librispeech(dataset, self.samples_10h) diff --git a/torchaudio/datasets/__init__.py b/torchaudio/datasets/__init__.py index d3cc648d51..c18d8a4c1e 100644 --- 
a/torchaudio/datasets/__init__.py +++ b/torchaudio/datasets/__init__.py @@ -5,7 +5,7 @@ from .gtzan import GTZAN from .librimix import LibriMix from .librispeech import LIBRISPEECH -from .librispeech_finetune import LibriSpeechFineTune +from .librilight_limited import LibriLightLimited from .libritts import LIBRITTS from .ljspeech import LJSPEECH from .quesst14 import QUESST14 @@ -18,7 +18,7 @@ __all__ = [ "COMMONVOICE", "LIBRISPEECH", - "LibriSpeechFineTune", + "LibriLightLimited", "SPEECHCOMMANDS", "VCTK_092", "DR_VCTK", diff --git a/torchaudio/datasets/librispeech_finetune.py b/torchaudio/datasets/libirlight_limited.py similarity index 98% rename from torchaudio/datasets/librispeech_finetune.py rename to torchaudio/datasets/libirlight_limited.py index bdcc0f3dfd..c2bf48f8ed 100644 --- a/torchaudio/datasets/librispeech_finetune.py +++ b/torchaudio/datasets/libirlight_limited.py @@ -63,7 +63,7 @@ def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, in ) -class LibriSpeechFineTune(Dataset): +class LibriLightLimited(Dataset): _ext_txt = ".trans.txt" _ext_audio = ".flac" From 22cb38199cc40739c0d32f8792be043cfcd04ad4 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Fri, 22 Apr 2022 14:32:13 +0100 Subject: [PATCH 04/11] fix --- torchaudio/datasets/__init__.py | 2 +- .../datasets/{libirlight_limited.py => librilight_limited.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename torchaudio/datasets/{libirlight_limited.py => librilight_limited.py} (100%) diff --git a/torchaudio/datasets/__init__.py b/torchaudio/datasets/__init__.py index c18d8a4c1e..c5946e8096 100644 --- a/torchaudio/datasets/__init__.py +++ b/torchaudio/datasets/__init__.py @@ -3,9 +3,9 @@ from .commonvoice import COMMONVOICE from .dr_vctk import DR_VCTK from .gtzan import GTZAN +from .librilight_limited import LibriLightLimited from .librimix import LibriMix from .librispeech import LIBRISPEECH -from .librilight_limited import LibriLightLimited from .libritts import 
LIBRITTS from .ljspeech import LJSPEECH from .quesst14 import QUESST14 diff --git a/torchaudio/datasets/libirlight_limited.py b/torchaudio/datasets/librilight_limited.py similarity index 100% rename from torchaudio/datasets/libirlight_limited.py rename to torchaudio/datasets/librilight_limited.py From 73978b4eeade4d05c0ae0251ab12ffdee6bcc4f9 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Tue, 26 Apr 2022 12:46:13 +0100 Subject: [PATCH 05/11] address comment --- .../datasets/librilightlimited_test.py | 79 +++++++++---------- torchaudio/datasets/librilight_limited.py | 30 ++++--- 2 files changed, 59 insertions(+), 50 deletions(-) diff --git a/test/torchaudio_unittest/datasets/librilightlimited_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py index 665a66d6d4..ebc179b386 100644 --- a/test/torchaudio_unittest/datasets/librilightlimited_test.py +++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py @@ -13,65 +13,64 @@ _NUMBERS = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE"] -def _save_sample(dataset_dir): +def _save_sample(file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed): + filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac" + path = os.path.join(file_path, filename) + data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed) + transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]) + save_wav(path, data, sample_rate) + sample = (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) + return sample + + +def get_mock_dataset(dataset_dir: str): + """Create mocked dataset for a sub directory. + + Args: + dataset_dir (str): the path of the sub directory. 
+ The structure is: audio_type/speaker_id/chapter_id/filename.flac + """ mocked_data = [] sample_rate = 16000 # 16kHz seed = 0 - for subset in ["clean", "other"]: - subset_dir = os.path.join(dataset_dir, subset) - os.makedirs(subset_dir, exist_ok=True) - + for audio_type in ["clean", "other"]: for speaker_id in range(5): - speaker_path = os.path.join(subset_dir, str(speaker_id)) - os.makedirs(speaker_path, exist_ok=True) - for chapter_id in range(3): - chapter_path = os.path.join(speaker_path, str(chapter_id)) - os.makedirs(chapter_path, exist_ok=True) + file_path = os.path.join(dataset_dir, audio_type, str(speaker_id), str(chapter_id)) + os.makedirs(file_path, exist_ok=True) trans_content = [] - for utterance_id in range(3): - filename = f"{speaker_id}-{chapter_id}-{utterance_id:04d}.flac" - path = os.path.join(chapter_path, filename) - - transcript = " ".join([_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]) - trans_content.append(f"{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}") - - data = get_whitenoise( - sample_rate=sample_rate, duration=0.01, n_channels=1, dtype="float32", seed=seed + sample = _save_sample( + file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed ) - save_wav(path, data, sample_rate) - print(path) - sample = (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) + trans_content.append(f"{sample[3]}-{sample[4]}-{sample[5]:04d} {sample[2]}") mocked_data.append(sample) - seed += 1 - trans_filename = f"{speaker_id}-{chapter_id}.trans.txt" - trans_path = os.path.join(chapter_path, trans_filename) + trans_path = os.path.join(file_path, trans_filename) with open(trans_path, "w") as f: f.write("\n".join(trans_content)) return mocked_data -def get_mock_dataset(root_dir): +def get_mock_datasets(root_dir): """ root_dir: directory to the mocked dataset """ mocked_data_10min, mocked_data_1h, mocked_data_10h = [], [], [] dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", "0") 
os.makedirs(dataset_dir, exist_ok=True) - mocked_data_10min = _save_sample(dataset_dir) + mocked_data_10min = get_mock_dataset(dataset_dir) mocked_data_1h += mocked_data_10min for i in range(1, 6): dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "1h", str(i)) os.makedirs(dataset_dir, exist_ok=True) - mocked_data_1h += _save_sample(dataset_dir) + mocked_data_1h += get_mock_dataset(dataset_dir) mocked_data_10h += mocked_data_1h dataset_dir = os.path.join(root_dir, "librispeech_finetuning", "9h") os.makedirs(dataset_dir, exist_ok=True) - mocked_data_10h += _save_sample(dataset_dir) + mocked_data_10h += get_mock_dataset(dataset_dir) return mocked_data_10min, mocked_data_1h, mocked_data_10h @@ -87,9 +86,9 @@ class TestLibriLightLimited(TempDirMixin, TorchaudioTestCase): @classmethod def setUpClass(cls): cls.root_dir = cls.get_base_temp_dir() - (cls.samples_10min, cls.samples_1h, cls.samples_10h) = get_mock_dataset(cls.root_dir) + (cls.samples_10min, cls.samples_1h, cls.samples_10h) = get_mock_datasets(cls.root_dir) - def _test_librispeech(self, dataset, samples): + def _test_librilightlimited(self, dataset, samples): num_samples = 0 for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset): self.assertEqual(data, samples[i][0], atol=5e-5, rtol=1e-8) @@ -102,14 +101,14 @@ def _test_librispeech(self, dataset, samples): assert num_samples == len(samples) - def test_librispeech_10min(self): - dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10min") - self._test_librispeech(dataset, self.samples_10min) + def test_librilightlimited_10min(self): + dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="10min") + self._test_librilightlimited(dataset, self.samples_10min) - def test_librispeech_1h(self): - dataset = librilight_limited.LibriLightLimited(self.root_dir, split="1h") - self._test_librispeech(dataset, self.samples_1h) + def test_librilightlimited_1h(self): + dataset = 
librilight_limited.LibriLightLimited(self.root_dir, subset="1h") + self._test_librilightlimited(dataset, self.samples_1h) - def test_librispeech_10h(self): - dataset = librilight_limited.LibriLightLimited(self.root_dir, split="10h") - self._test_librispeech(dataset, self.samples_10h) + def test_librilightlimited_10h(self): + dataset = librilight_limited.LibriLightLimited(self.root_dir, subset="10h") + self._test_librilightlimited(dataset, self.samples_10h) diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py index c2bf48f8ed..a765f69453 100644 --- a/torchaudio/datasets/librilight_limited.py +++ b/torchaudio/datasets/librilight_limited.py @@ -15,16 +15,16 @@ _CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af" -def _get_files(path, split, _ext_audio): - if split == "10min": +def _get_files(path, subset, _ext_audio): + if subset == "10min": files = sorted(str(p) for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio)) - elif split in ["1h", "10h"]: + elif subset in ["1h", "10h"]: files = [str(p) for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)] - if split == "10h": + if subset == "10h": files += [str(p) for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)] files = sorted(files) else: - raise ValueError(f"Unsupported split value. Found {split}.") + raise ValueError(f"Unsupported subset value. Found {subset}.") return files @@ -64,25 +64,35 @@ def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, in class LibriLightLimited(Dataset): + """Create a Dataset for LibriLightLimited, which is the supervised subset of + LibriLight dataset. + + Args: + root (str or Path): Path to the directory where the dataset is found or downloaded. + subset (str, optional): The subset to use. Options: [``10min``, ``1h``, ``10h``] + (Default: ``10min``). + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``).
+ """ + _ext_txt = ".trans.txt" _ext_audio = ".flac" def __init__( self, root: Union[str, Path], - split: str, - folder_in_archive: str = "librispeech_finetuning", + subset: str = "10min", download: bool = False, ) -> None: root = os.fspath(root) - self._path = os.path.join(root, folder_in_archive) - archive = os.path.join(root, folder_in_archive + ".tgz") + self._path = os.path.join(root, "librispeech_finetuning") + archive = os.path.join(root, "librispeech_finetuning" + ".tgz") if download: if not os.path.isdir(self._path): if not os.path.isfile(archive): download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM) extract_archive(archive) - self._files = _get_files(self._path, split, self._ext_audio) + self._files = _get_files(self._path, subset, self._ext_audio) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: """Load the n-th sample from the dataset. From a7ecbe050022af6bc8286d21721c8e28a23a03f4 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Tue, 26 Apr 2022 12:49:47 +0100 Subject: [PATCH 06/11] fix --- torchaudio/datasets/librilight_limited.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py index a765f69453..ead2933730 100644 --- a/torchaudio/datasets/librilight_limited.py +++ b/torchaudio/datasets/librilight_limited.py @@ -84,6 +84,8 @@ def __init__( subset: str = "10min", download: bool = False, ) -> None: + assert subset in ["10min", "1h", "10h"], "`subset` must be one of ['10min', '1h', '10h']" + root = os.fspath(root) self._path = os.path.join(root, "librispeech_finetuning") archive = os.path.join(root, "librispeech_finetuning" + ".tgz") From f5b749132ad75e738c46b8e4b57c092294eaacab Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Tue, 26 Apr 2022 12:56:07 +0100 Subject: [PATCH 07/11] fix --- test/torchaudio_unittest/datasets/librilightlimited_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/test/torchaudio_unittest/datasets/librilightlimited_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py index ebc179b386..26e65e7e2c 100644 --- a/test/torchaudio_unittest/datasets/librilightlimited_test.py +++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py @@ -40,9 +40,7 @@ def get_mock_dataset(dataset_dir: str): os.makedirs(file_path, exist_ok=True) trans_content = [] for utterance_id in range(3): - sample = _save_sample( - file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed - ) + sample = _save_sample(file_path, speaker_id, chapter_id, utterance_id, sample_rate, seed) trans_content.append(f"{sample[3]}-{sample[4]}-{sample[5]:04d} {sample[2]}") mocked_data.append(sample) seed += 1 From e096ee10bc142e00ef2d8cdb4e4c7a711d9d9033 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Wed, 11 May 2022 17:49:28 +0100 Subject: [PATCH 08/11] assert dataset is downloaded to root path --- torchaudio/datasets/librilight_limited.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py index ead2933730..f334d07826 100644 --- a/torchaudio/datasets/librilight_limited.py +++ b/torchaudio/datasets/librilight_limited.py @@ -6,9 +6,7 @@ from torch import Tensor from torch.hub import download_url_to_file from torch.utils.data import Dataset -from torchaudio.datasets.utils import ( - extract_archive, -) +from torchaudio.datasets.utils import extract_archive _URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz" @@ -89,11 +87,12 @@ def __init__( root = os.fspath(root) self._path = os.path.join(root, "librispeech_finetuning") archive = os.path.join(root, "librispeech_finetuning" + ".tgz") - if download: - if not os.path.isdir(self._path): - if not os.path.isfile(archive): - download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM) - extract_archive(archive) + if not os.path.isdir(self._path): + if not 
download: + raise RuntimeError("Dataset not found. Please use `download=True` to download") + if not os.path.isfile(archive): + download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM) + extract_archive(archive) self._files = _get_files(self._path, subset, self._ext_audio) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: From dc3d9ce73456cc1feb464adb27b94327fe91c365 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Fri, 13 May 2022 22:43:14 +0100 Subject: [PATCH 09/11] address comments --- torchaudio/datasets/librilight_limited.py | 77 +++++++++-------------- 1 file changed, 29 insertions(+), 48 deletions(-) diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py index f334d07826..7c9447b6ca 100644 --- a/torchaudio/datasets/librilight_limited.py +++ b/torchaudio/datasets/librilight_limited.py @@ -1,64 +1,45 @@ import os from pathlib import Path -from typing import Tuple, Union +from typing import List, Tuple, Union -import torchaudio from torch import Tensor from torch.hub import download_url_to_file from torch.utils.data import Dataset +from torchaudio.datasets.librispeech import load_librispeech_item from torchaudio.datasets.utils import extract_archive +_ARCHIVE_NAME = "librispeech_finetuning" _URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz" _CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af" -def _get_files(path, subset, _ext_audio): +def _get_fileids_paths(path, subset, _ext_audio) -> List[Tuple[str, str]]: + """Get the file names and the corresponding file paths without `speaker_id` + and `chapter_id` directories. 
+ The format of path is like: + {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or + {root}/{_ARCHIVE_NAME}/9h/[clean, other] + """ if subset == "10min": - files = sorted(str(p) for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio)) + files_paths = [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio) + ] elif subset in ["1h", "10h"]: - files = [str(p) for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)] + files_paths = [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio) + ] if subset == "10h": - files += [str(p) for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)] - files = sorted(files) + files_paths += [ + (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem)) + for p in Path(path).glob("9h/*/*/*/*" + _ext_audio) + ] else: raise ValueError(f"Unsupported subset value. Found {subset}.") - return files - - -def _load_item(file_path: str, ext_audio: str, ext_txt: str) -> Tuple[Tensor, int, str, int, int, int]: - fileid = os.path.basename(file_path) - path = os.path.dirname(file_path) - speaker_id, chapter_id, utterance_id = fileid.replace(ext_audio, "").split("-") - - file_text = speaker_id + "-" + chapter_id + ext_txt - file_text = os.path.join(path, file_text) - - fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id - file_audio = fileid_audio + ext_audio - file_audio = os.path.join(path, file_audio) - - # Load audio - waveform, sample_rate = torchaudio.load(file_audio) - - # Load text - with open(file_text) as ft: - for line in ft: - fileid_text, transcript = line.strip().split(" ", 1) - if fileid_audio == fileid_text: - break - else: - # Translation not found - raise FileNotFoundError("Translation not found for " + fileid_audio) - - return ( - waveform, - sample_rate, - transcript, - int(speaker_id), - int(chapter_id), - int(utterance_id), - ) + files_paths = sorted(files_paths, key=lambda x: x[0] + x[1]) + 
return files_paths class LibriLightLimited(Dataset): @@ -85,15 +66,15 @@ def __init__( assert subset in ["10min", "1h", "10h"], "`subset` must be one of ['10min', '1h', '10h']" root = os.fspath(root) - self._path = os.path.join(root, "librispeech_finetuning") - archive = os.path.join(root, "librispeech_finetuning" + ".tgz") + self._path = os.path.join(root, _ARCHIVE_NAME) + archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz") if not os.path.isdir(self._path): if not download: raise RuntimeError("Dataset not found. Please use `download=True` to download") if not os.path.isfile(archive): download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM) extract_archive(archive) - self._files = _get_files(self._path, subset, self._ext_audio) + self._fileids_paths = _get_fileids_paths(self._path, subset, self._ext_audio) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: """Load the n-th sample from the dataset. @@ -103,8 +84,8 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: (Tensor, int, str, int, int, int): ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)`` """ - file_path = self._files[n] - return _load_item(file_path, self._ext_audio, self._ext_txt) + file_path, fileid = self._fileids_paths[n] + return load_librispeech_item(fileid, file_path, self._ext_audio, self._ext_txt) def __len__(self) -> int: return len(self._files) From e3faf5913bbc986fe2a32a08295bdcb5954ea174 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Fri, 13 May 2022 22:45:27 +0100 Subject: [PATCH 10/11] fix --- torchaudio/datasets/librilight_limited.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/librilight_limited.py b/torchaudio/datasets/librilight_limited.py index 7c9447b6ca..548e465138 100644 --- a/torchaudio/datasets/librilight_limited.py +++ b/torchaudio/datasets/librilight_limited.py @@ -88,4 +88,4 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: return 
load_librispeech_item(fileid, file_path, self._ext_audio, self._ext_txt) def __len__(self) -> int: - return len(self._files) + return len(self._fileids_paths) From 557064ba28059f6035c7ac308ca47f0f9324c488 Mon Sep 17 00:00:00 2001 From: Zhaoheng Ni Date: Mon, 23 May 2022 09:04:37 +0100 Subject: [PATCH 11/11] fix lint --- test/torchaudio_unittest/datasets/librilightlimited_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/torchaudio_unittest/datasets/librilightlimited_test.py b/test/torchaudio_unittest/datasets/librilightlimited_test.py index 26e65e7e2c..9c30955652 100644 --- a/test/torchaudio_unittest/datasets/librilightlimited_test.py +++ b/test/torchaudio_unittest/datasets/librilightlimited_test.py @@ -2,10 +2,10 @@ from torchaudio.datasets import librilight_limited from torchaudio_unittest.common_utils import ( - TempDirMixin, - TorchaudioTestCase, get_whitenoise, save_wav, + TempDirMixin, + TorchaudioTestCase, )