diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index b3f1b3c725..2ceb93a440 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -29,82 +29,85 @@ CMUARCTIC ~~~~~~~~~ .. autoclass:: CMUARCTIC - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ COMMONVOICE ~~~~~~~~~~~ .. autoclass:: COMMONVOICE - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ GTZAN ~~~~~ .. autoclass:: GTZAN - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ LIBRISPEECH ~~~~~~~~~~~ .. autoclass:: LIBRISPEECH - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ LIBRITTS ~~~~~~~~ .. autoclass:: LIBRITTS - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ LJSPEECH ~~~~~~~~ .. autoclass:: LJSPEECH - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ SPEECHCOMMANDS ~~~~~~~~~~~~~~ .. autoclass:: SPEECHCOMMANDS - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ TEDLIUM ~~~~~~~~~~~~~~ .. autoclass:: TEDLIUM - :members: __getitem__ - :special-members: get_phoneme_dict + :members: + :special-members: __getitem__ + VCTK ~~~~ .. autoclass:: VCTK - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ VCTK_092 ~~~~~~~~ .. autoclass:: VCTK_092 + :members: + :special-members: __getitem__ YESNO ~~~~~ .. autoclass:: YESNO - :members: __getitem__ - :special-members: + :members: + :special-members: __getitem__ diff --git a/torchaudio/datasets/cmuarctic.py b/torchaudio/datasets/cmuarctic.py index 7aa6d541e5..7ea4ca4dae 100644 --- a/torchaudio/datasets/cmuarctic.py +++ b/torchaudio/datasets/cmuarctic.py @@ -76,9 +76,20 @@ def load_cmuarctic_item(line: str, class CMUARCTIC(Dataset): - """ - Create a Dataset for CMU_arctic. 
Each item is a tuple of the form: - waveform, sample_rate, utterance, utterance_id + """Create a Dataset for CMU_ARCTIC. + + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): + The URL to download the dataset from or the type of the dataset to download. + (default: ``"aew"``) + Allowed type values are ``"aew"``, ``"ahw"``, ``"aup"``, ``"awb"``, ``"axb"``, ``"bdl"``, + ``"clb"``, ``"eey"``, ``"fem"``, ``"gka"``, ``"jmk"``, ``"ksp"``, ``"ljm"``, ``"lnh"``, + ``"rms"``, ``"rxr"``, ``"slp"`` or ``"slt"``. + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"ARCTIC"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). """ _file_text = "txt.done.data" @@ -143,6 +154,14 @@ def __init__(self, self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, utterance, utterance_id)`` + """ line = self._walker[n] return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio) diff --git a/torchaudio/datasets/commonvoice.py b/torchaudio/datasets/commonvoice.py index a11b9dc1c8..825cf3e9ab 100644 --- a/torchaudio/datasets/commonvoice.py +++ b/torchaudio/datasets/commonvoice.py @@ -100,11 +100,28 @@ def load_commonvoice_item(line: List[str], class COMMONVOICE(Dataset): - """ - Create a Dataset for CommonVoice. Each item is a tuple of the form: - (waveform, sample_rate, dictionary) - where dictionary is a dictionary built from the tsv file with the following keys: - client_id, path, sentence, up_votes, down_votes, age, gender, accent. + """Create a Dataset for CommonVoice. + + Args: + root (str): Path to the directory where the dataset is found or downloaded. 
+ tsv (str, optional): The name of the tsv file used to construct the metadata. + (default: ``"train.tsv"``) + url (str, optional): The URL to download the dataset from, or the language of + the dataset to download. (default: ``"english"``). + Allowed language values are ``"tatar"``, ``"english"``, ``"german"``, + ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``, + ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``, + ``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``, + ``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``, + ``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``, + ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``, + ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and + ``"romansh sursilvan"``. + folder_in_archive (str, optional): The top-level directory of the dataset. + version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``) + For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets. + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). """ _ext_txt = ".txt" @@ -192,6 +209,16 @@ def __init__(self, self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, Dict[str, str]]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, dictionary)``, where dictionary is built + from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``, + ``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``. 
+ """ line = self._walker[n] return load_commonvoice_item(line, self._header, self._path, self._folder_audio) diff --git a/torchaudio/datasets/gtzan.py b/torchaudio/datasets/gtzan.py index e031801bc1..160898c73d 100644 --- a/torchaudio/datasets/gtzan.py +++ b/torchaudio/datasets/gtzan.py @@ -1,6 +1,6 @@ import os import warnings -from typing import Any, Tuple +from typing import Any, Tuple, Optional import torchaudio from torch import Tensor @@ -998,12 +998,22 @@ def load_gtzan_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, str class GTZAN(Dataset): - """ - Create a Dataset for GTZAN. Each item is a tuple of the form: - waveform, sample_rate, label. + """Create a Dataset for GTZAN. + + Note: + Please see http://marsyas.info/downloads/datasets.html if you are planning to use + this dataset to publish results. - Please see http://marsyas.info/downloads/datasets.html - if you are planning to use this dataset to publish results. + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from. + (default: ``"http://opihi.cs.uvic.ca/sound/genres.tar.gz"``) + folder_in_archive (str, optional): The top-level directory of the dataset. + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + subset (str, optional): Which subset of the dataset to use. + One of ``"training"``, ``"validation"``, ``"testing"`` or ``None``. + If ``None``, the entire dataset is used. (default: ``None``). """ _ext_audio = ".wav" @@ -1014,7 +1024,7 @@ def __init__( url: str = URL, folder_in_archive: str = FOLDER_IN_ARCHIVE, download: bool = False, - subset: Any = None, + subset: Optional[str] = None, ) -> None: # super(GTZAN, self).__init__() @@ -1082,6 +1092,14 @@ def __init__( self._walker = filtered_test def __getitem__(self, n: int) -> Tuple[Tensor, int, str]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, label)`` + """ fileid = self._walker[n] item = load_gtzan_item(fileid, self._path, self._ext_audio) waveform, sample_rate, label = item diff --git a/torchaudio/datasets/librispeech.py b/torchaudio/datasets/librispeech.py index 449ab744d3..24da90aa66 100644 --- a/torchaudio/datasets/librispeech.py +++ b/torchaudio/datasets/librispeech.py @@ -67,9 +67,19 @@ def load_librispeech_item(fileid: str, class LIBRISPEECH(Dataset): - """ - Create a Dataset for LibriSpeech. Each item is a tuple of the form: - waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id + """Create a Dataset for LibriSpeech. + + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to download. + Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, + ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and + ``"train-other-500"``. (default: ``"train-clean-100"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"LibriSpeech"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). """ _ext_txt = ".trans.txt" @@ -117,6 +127,14 @@ def __init__(self, self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id)`` + """ fileid = self._walker[n] return load_librispeech_item(fileid, self._path, self._ext_audio, self._ext_txt) diff --git a/torchaudio/datasets/libritts.py b/torchaudio/datasets/libritts.py index a37d5528fb..5d4e94cca5 100644 --- a/torchaudio/datasets/libritts.py +++ b/torchaudio/datasets/libritts.py @@ -65,9 +65,19 @@ def load_libritts_item( class LIBRITTS(Dataset): - """ - Create a Dataset for LibriTTS. Each item is a tuple of the form: - waveform, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id + """Create a Dataset for LibriTTS. + + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to download. + Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``, + ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and + ``"train-other-500"``. (default: ``"train-clean-100"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"LibriTTS"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). """ _ext_original_txt = ".original.txt" @@ -118,6 +128,15 @@ def __init__( self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int, int, str]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, original_text, normalized_text, speaker_id, + chapter_id, utterance_id)`` + """ fileid = self._walker[n] return load_libritts_item( fileid, diff --git a/torchaudio/datasets/ljspeech.py b/torchaudio/datasets/ljspeech.py index 5e3b6ee7d2..28c3a5dfc8 100644 --- a/torchaudio/datasets/ljspeech.py +++ b/torchaudio/datasets/ljspeech.py @@ -33,9 +33,16 @@ def load_ljspeech_item(line: List[str], path: str, ext_audio: str) -> Tuple[Tens class LJSPEECH(Dataset): - """ - Create a Dataset for LJSpeech-1.1. Each item is a tuple of the form: - waveform, sample_rate, transcript, normalized_transcript + """Create a Dataset for LJSpeech-1.1. + + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from. + (default: ``"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"wavs"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). """ _ext_audio = ".wav" @@ -68,6 +75,14 @@ def __init__(self, self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, transcript, normalized_transcript)`` + """ line = self._walker[n] return load_ljspeech_item(line, self._path, self._ext_audio) diff --git a/torchaudio/datasets/speechcommands.py b/torchaudio/datasets/speechcommands.py index 9943d5965b..7c56a846b9 100644 --- a/torchaudio/datasets/speechcommands.py +++ b/torchaudio/datasets/speechcommands.py @@ -36,9 +36,18 @@ def load_speechcommands_item(filepath: str, path: str) -> Tuple[Tensor, int, str class SPEECHCOMMANDS(Dataset): - """ - Create a Dataset for Speech Commands. Each item is a tuple of the form: - waveform, sample_rate, label, speaker_id, utterance_number + """Create a Dataset for Speech Commands. + + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from, + or the type of the dataset to download. + Allowed type values are ``"speech_commands_v0.01"`` and ``"speech_commands_v0.02"`` + (default: ``"speech_commands_v0.02"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"SpeechCommands"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). """ def __init__(self, @@ -75,6 +84,14 @@ def __init__(self, self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, label, speaker_id, utterance_number)`` + """ fileid = self._walker[n] return load_speechcommands_item(fileid, self._path) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 4ae1ddeb0f..3912898206 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -43,44 +43,21 @@ class TEDLIUM(Dataset): """ - Create a Dataset for Tedlium. It supports releases 1,2 and 3, each item is a list containings: - [waveform, sample_rate, transcript, talk_id, speaker_id, identifier]. - - Constructor arguments: + Create a Dataset for Tedlium. It supports releases 1,2 and 3. Args: - root (str): Path containing dataset or target path where its downloaded if needed - release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to RELEASE. - subset (str, optional): train/dev/test for releases 1&2, None for release3. Defaults to Train/None - download (bool, optional): Download dataset in case is not founded in root path. Defaults to False. - audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph". - - Special functions: - - _load_tedlium_item: Loads a TEDLIUM dataset sample given a file name and corresponding sentence name - - _load_audio: Default load function used in TEDLIUM dataset, you can overwrite this function to customize - functionality and load individual sentences from a full ted audio talk file - - get_phoneme_dict: Returns the phoneme dictionary of a TEDLIUM release - + root (str): Path to the directory where the dataset is found or downloaded. + release (str, optional): Release version. + Allowed values are ``"release1"``, ``"release2"`` or ``"release3"``. + (default: ``"release1"``). + subset (str, optional): The subset of dataset to use. Valid options are ``"train"``, ``"dev"``, + and ``"test"`` for releases 1&2, ``None`` for release3. 
Defaults to ``"train"`` or ``None``. + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). """ - def __init__( self, root: str, release: str = "release1", subset: str = None, download: bool = False, audio_ext=".sph" ) -> None: - """Constructor for TEDLIUM dataset. - - Args: - root (str): Path containing dataset or target path where its downloaded if needed - release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to RELEASE. - subset (str, optional): train/dev/test for releases 1&2, None for release3. Defaults to Train/None - download (bool, optional): Download dataset in case is not founded in root path. Defaults to False. - audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph". - - Raises: - RuntimeError: If release identifier does not match any supported release, - """ self._ext_audio = audio_ext if release in _RELEASE_CONFIGS.keys(): folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] @@ -140,7 +117,7 @@ def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, path (str): Dataset root path Returns: - Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] + tuple: ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)`` """ transcript_path = os.path.join(path, "stm", fileid) with open(transcript_path + ".stm") as f: @@ -171,14 +148,13 @@ def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate return torchaudio.load(path)[:, start_time:end_time] def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: - """TEDLIUM dataset custom function overwritting default loadbehaviour - Loads a TEDLIUM sample given a index N. + """Load the n-th sample from the dataset. 
Args: - n (int): Index of sample to be loaded + n (int): The index of the sample to be loaded Returns: - Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] + tuple: ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)`` """ fileid, line = self._filelist[n] return self._load_tedlium_item(fileid, line, self._path) @@ -193,10 +169,8 @@ def __len__(self) -> int: @property def phoneme_dict(self): - """Returns the phoneme dictionary of a TEDLIUM release. - - Returns: - dictionary: Phoneme dictionary for the current tedlium release + """dict[str, tuple[str]]: Phonemes. Mapping from word to tuple of phonemes. + Note that some words have empty phonemes. """ # Read phoneme dictionary if not self._phoneme_dict: diff --git a/torchaudio/datasets/vctk.py b/torchaudio/datasets/vctk.py index 8fa874c7cf..9d689dd23d 100644 --- a/torchaudio/datasets/vctk.py +++ b/torchaudio/datasets/vctk.py @@ -54,12 +54,25 @@ def load_vctk_item(fileid: str, class VCTK(Dataset): - """ - Create a Dataset for VCTK. Each item is a tuple of the form: - (waveform, sample_rate, utterance, speaker_id, utterance_id) + """Create a Dataset for VCTK. + + Note: + * **This dataset is no longer publicly available.** Please use :py:class:`VCTK_092` + * Directory ``p315`` is ignored because there are no corresponding text files. + For more information about the dataset visit: https://datashare.is.ed.ac.uk/handle/10283/3443 - Folder `p315` will be ignored due to the non-existent corresponding text files. - For more information about the dataset visit: https://datashare.is.ed.ac.uk/handle/10283/3443 + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): Not used as the dataset is no longer publicly available. + folder_in_archive (str, optional): + The top-level directory of the dataset. 
(default: ``"VCTK-Corpus"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + Giving ``download=True`` will result in error as the dataset is no longer + publicly available. + downsample (bool, optional): Not used. + transform (callable, optional): Optional transform applied on waveform. (default: ``None``) + target_transform (callable, optional): Optional transform applied on utterance. (default: ``None``) """ _folder_txt = "txt" @@ -118,6 +131,14 @@ def __init__(self, self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str]: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, utterance, speaker_id, utterance_id)`` + """ fileid = self._walker[n] item = load_vctk_item( fileid, @@ -145,14 +166,13 @@ def __len__(self) -> int: class VCTK_092(Dataset): """Create VCTK 0.92 Dataset - An item is a ``namedtuple`` of (``waveform``, ``sample_rate``, ``utterance``, - ``speaker_id``, ``utterance_id``) - Args: root (str): Root directory where the dataset's top level directory is found. - mic_id (str): Microphone ID. Either ``"mic1"`` or ``"mic2"`` - download (bool, optional): Download the dataset if not found in the given directory. - url (str, optional): URL from which the dataset is downloaded. + mic_id (str): Microphone ID. Either ``"mic1"`` or ``"mic2"``. (default: ``"mic2"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + url (str, optional): The URL to download the dataset from. + (default: ``"https://datashare.is.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"``) audio_ext (str, optional): Custom audio extension if dataset is converted to non-default audio format. 
Note: @@ -252,6 +272,14 @@ def _load_sample(self, speaker_id: str, utterance_id: str, mic_id: str) -> Sampl return Sample(waveform, sample_rate, utterance, speaker_id, utterance_id) def __getitem__(self, n: int) -> Sample: + """Load the n-th sample from the dataset. + + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, utterance, speaker_id, utterance_id)`` + """ speaker_id, utterance_id = self._sample_ids[n] return self._load_sample(speaker_id, utterance_id, self._mic_id) diff --git a/torchaudio/datasets/yesno.py b/torchaudio/datasets/yesno.py index e45b157c35..5a20539e28 100644 --- a/torchaudio/datasets/yesno.py +++ b/torchaudio/datasets/yesno.py @@ -31,9 +31,18 @@ def load_yesno_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, int class YESNO(Dataset): - """ - Create a Dataset for YesNo. Each item is a tuple of the form: - (waveform, sample_rate, labels) + """Create a Dataset for YesNo. + + Args: + root (str): Path to the directory where the dataset is found or downloaded. + url (str, optional): The URL to download the dataset from. + (default: ``"http://www.openslr.org/resources/1/waves_yesno.tar.gz"``) + folder_in_archive (str, optional): + The top-level directory of the dataset. (default: ``"waves_yesno"``) + download (bool, optional): + Whether to download the dataset if it is not found at root path. (default: ``False``). + transform (callable, optional): Optional transform applied on waveform. (default: ``None``) + target_transform (callable, optional): Optional transform applied on utterance. (default: ``None``) """ _ext_audio = ".wav" @@ -78,6 +87,14 @@ def __init__(self, self._walker = list(walker) def __getitem__(self, n: int) -> Tuple[Tensor, int, List[int]]: + """Load the n-th sample from the dataset. 
+ + Args: + n (int): The index of the sample to be loaded + + Returns: + tuple: ``(waveform, sample_rate, labels)`` + """ fileid = self._walker[n] item = load_yesno_item(fileid, self._path, self._ext_audio)