From 25dca5f2eaf183e57121cf71c03032d574557cf5 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:37:39 +0200 Subject: [PATCH 01/42] Add test fixture for TAR WAV file --- tests/features/test_audio.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 293b7cd50b4..2086e77fdb6 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -1,4 +1,6 @@ +import os import sys +import tarfile from ctypes.util import find_library from importlib.util import find_spec @@ -26,6 +28,15 @@ require_torchaudio = pytest.mark.skipif(find_spec("torchaudio") is None, reason="Test requires 'torchaudio'") +@pytest.fixture() +def tar_wav_path(shared_datadir, tmp_path_factory): + audio_path = str(shared_datadir / "test_audio_44100.wav") + path = tmp_path_factory.mktemp("data") / "audio_data.wav.tar" + with tarfile.TarFile(path, "w") as f: + f.add(audio_path, arcname=os.path.basename(audio_path)) + return path + + def test_audio_instantiation(): audio = Audio() assert audio.id is None From 52cc44d1fb1739101ebb821f96752767308edbdb Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:38:08 +0200 Subject: [PATCH 02/42] Add test iter_archive --- tests/features/test_audio.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 2086e77fdb6..16cc8a6e010 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -37,6 +37,14 @@ def tar_wav_path(shared_datadir, tmp_path_factory): return path +def iter_archive(archive_path): + with tarfile.open(archive_path) as tar: + for tarinfo in tar: + file_path = tarinfo.name + file_obj = tar.extractfile(tarinfo) + yield file_path, file_obj + + def test_audio_instantiation(): audio = Audio() assert audio.id is None From 8ff699deb9a727b622cbaedd2150b30501836f26 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:39:34 +0200 Subject: [PATCH 03/42] Test dataset with Audio feature for TAR archive --- tests/features/test_audio.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 16cc8a6e010..f196e59763b 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -114,6 +114,36 @@ def test_dataset_with_audio_feature(shared_datadir): assert column[0]["sampling_rate"] == 44100 +@require_sndfile +def test_dataset_with_audio_feature_tar(tar_wav_path): + audio_filename = "test_audio_44100.wav" + data = {"audio": []} + for file_path, file_obj in iter_archive(tar_wav_path): + data["audio"].append({"path": file_path, "bytes": file_obj.read()}) + break + features = Features({"audio": Audio(archived=True)}) + dset = Dataset.from_dict(data, features=features) + item = dset[0] + assert item.keys() == {"audio"} + assert item["audio"].keys() == {"path", "array", "sampling_rate"} + assert item["audio"]["path"] == audio_filename + assert item["audio"]["array"].shape == (202311,) + assert item["audio"]["sampling_rate"] == 44100 + batch = dset[:1] + assert batch.keys() == {"audio"} + assert len(batch["audio"]) == 1 + assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} + assert batch["audio"][0]["path"] == audio_filename + assert batch["audio"][0]["array"].shape == (202311,) + assert batch["audio"][0]["sampling_rate"] == 44100 + column = dset["audio"] + assert len(column) == 1 + assert column[0].keys() == {"path", "array", "sampling_rate"} + assert column[0]["path"] == audio_filename + assert column[0]["array"].shape == (202311,) + assert column[0]["sampling_rate"] == 44100 + + @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") From 3d20ee583aa59cd08f8e98f7e4d41565eba757cc Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:49:20 +0200 Subject: [PATCH 04/42] Add Audio method to decode from bytes instead of path --- src/datasets/features/audio.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index c2261b40344..4900127e36b 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -52,6 +52,22 @@ def _decode_example_with_librosa(self, value): array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono) return array, sampling_rate + def _decode_example_with_soundfile(self, file): + try: + import librosa + import soundfile as sf + except ImportError as err: + raise ImportError("To support decoding audio files, please install 'librosa'.") from err + + array, sampling_rate = sf.read(file) + array = array.T + if self.mono: + array = librosa.to_mono(array) + if self.sampling_rate and self.sampling_rate != sampling_rate: + array = librosa.resample(array, sampling_rate, self.sampling_rate, res_type="kaiser_best") + sampling_rate = self.sampling_rate + return array, sampling_rate + def _decode_example_with_torchaudio(self, value): try: import torchaudio From 105ead7e35dbdf5da8698b1ef17cbf017067c898 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:54:10 +0200 Subject: [PATCH 05/42] Add Audio support for bytes besides path --- src/datasets/features/audio.py | 37 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 4900127e36b..3a19ae660c7 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -1,5 +1,6 @@ from collections import defaultdict from dataclasses import dataclass, field +from io import BytesIO from typing import Any, ClassVar, Optional import pyarrow as pa @@ -11,11 +12,18 @@ class Audio: Args: sampling_rate (:obj:`int`, optional): Target sampling rate. If `None`, the native sampling rate is used. - mono (:obj:`bool`, default ```True``): Whether to convert the audio signal to mono by averaging samples across channels. + mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across + channels. + archived (:obj:`bool`, default ``False``): Whether the source data is archived with sequential access. + - If non-archived with sequential access (i.e. random access is allowed), the cache will only store the + absolute path to the audio file. + - If archived with sequential access, the cache will store the relative path of the audio file to the + archive file and the bytes of the audio file. """ sampling_rate: Optional[int] = None mono: bool = True + archived: bool = False id: Optional[str] = None # Automatically constructed dtype: ClassVar[str] = "dict" @@ -23,24 +31,33 @@ class Audio: _type: str = field(default="Audio", init=False, repr=False) def __call__(self): - return pa.string() + return pa.string() if not self.archived else pa.struct({"path": pa.string(), "bytes": pa.binary()}) def decode_example(self, value): """Decode example audio file into audio data. Args: - value: Audio file path. + value: Either absolute audio file path (when ``archived=False``) or a dict with relative audio file path + and the bytes of the audio file. Returns: dict """ - # TODO: backard compatibility for users without audio dependencies - array, sampling_rate = ( - self._decode_example_with_torchaudio(value) - if value.endswith(".mp3") - else self._decode_example_with_librosa(value) - ) - return {"path": value, "array": array, "sampling_rate": sampling_rate} + if self.archived: + path, file = value["path"], BytesIO(value["bytes"]) + array, sampling_rate = ( + self._decode_example_with_torchaudio(file) + if path.endswith(".mp3") + else self._decode_example_with_soundfile(file) + ) + else: + path = value + array, sampling_rate = ( + self._decode_example_with_torchaudio(path) + if path.endswith(".mp3") + else self._decode_example_with_librosa(path) + ) + return {"path": path, "array": array, "sampling_rate": sampling_rate} def _decode_example_with_librosa(self, value): try: From a869469267d317ced7557b8a091c18323527c0a0 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 21 Oct 2021 11:07:07 +0200 Subject: [PATCH 06/42] Fix docstring --- src/datasets/features/audio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 3a19ae660c7..64dcaeaa819 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -15,6 +15,7 @@ class Audio: mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across channels. archived (:obj:`bool`, default ``False``): Whether the source data is archived with sequential access. + - If non-archived with sequential access (i.e. random access is allowed), the cache will only store the absolute path to the audio file. - If archived with sequential access, the cache will store the relative path of the audio file to the From f0911cd55e62e405661049d7559c144b4164684e Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 29 Oct 2021 16:31:57 +0200 Subject: [PATCH 07/42] Stream TAR-based Audio datasets --- datasets/common_voice/common_voice.py | 83 +++++++------ datasets/librispeech_asr/librispeech_asr.py | 73 +++++++----- datasets/openslr/openslr.py | 124 ++++++++------------ datasets/vivos/vivos.py | 57 +++++---- 4 files changed, 180 insertions(+), 157 deletions(-) diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py index 0178da98105..32232058a77 100644 --- a/datasets/common_voice/common_voice.py +++ b/datasets/common_voice/common_voice.py @@ -15,8 +15,6 @@ """ Common Voice Dataset""" -import os - import datasets from datasets.tasks import AutomaticSpeechRecognition @@ -613,6 +611,7 @@ def __init__(self, name, sub_version, **kwargs): class CommonVoice(datasets.GeneratorBasedBuilder): + DEFAULT_WRITER_BATCH_SIZE = 1000 BUILDER_CONFIGS = [ CommonVoiceConfig( name=lang_id, @@ -632,7 +631,7 @@ def _info(self): { "client_id": datasets.Value("string"), "path": datasets.Value("string"), - "audio": datasets.features.Audio(sampling_rate=48_000), + "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, "sentence": datasets.Value("string"), "up_votes": datasets.Value("int64"), "down_votes": datasets.Value("int64"), @@ -658,49 +657,54 @@ def _info(self): def _split_generators(self, dl_manager): """Returns SplitGenerators.""" - dl_path = dl_manager.download_and_extract(_DATA_URL.format(self.config.name)) - abs_path_to_data = os.path.join(dl_path, "cv-corpus-6.1-2020-12-11", self.config.name) - abs_path_to_clips = os.path.join(abs_path_to_data, "clips") + archive = dl_manager.download(_DATA_URL.format(self.config.name)) + path_to_data = "/".join(["cv-corpus-6.1-2020-12-11", self.config.name]) + path_to_clips = "/".join([path_to_data, "clips"]) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ - "filepath": os.path.join(abs_path_to_data, "train.tsv"), - "path_to_clips": abs_path_to_clips, + "files": dl_manager.iter_archive(archive), + "filepath": "/".join([path_to_data, "train.tsv"]), + "path_to_clips": path_to_clips, }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ - "filepath": os.path.join(abs_path_to_data, "test.tsv"), - "path_to_clips": abs_path_to_clips, + "files": dl_manager.iter_archive(archive), + "filepath": "/".join([path_to_data, "test.tsv"]), + "path_to_clips": path_to_clips, }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ - "filepath": os.path.join(abs_path_to_data, "dev.tsv"), - "path_to_clips": abs_path_to_clips, + "files": dl_manager.iter_archive(archive), + "filepath": "/".join([path_to_data, "dev.tsv"]), + "path_to_clips": path_to_clips, }, ), datasets.SplitGenerator( name="other", gen_kwargs={ - "filepath": os.path.join(abs_path_to_data, "other.tsv"), - "path_to_clips": abs_path_to_clips, + "files": dl_manager.iter_archive(archive), + "filepath": "/".join([path_to_data, "other.tsv"]), + "path_to_clips": path_to_clips, }, ), datasets.SplitGenerator( name="invalidated", gen_kwargs={ - "filepath": os.path.join(abs_path_to_data, "invalidated.tsv"), - "path_to_clips": abs_path_to_clips, + "files": dl_manager.iter_archive(archive), + "filepath": "/".join([path_to_data, "invalidated.tsv"]), + "path_to_clips": path_to_clips, }, ), ] - def _generate_examples(self, filepath, path_to_clips): + def _generate_examples(self, files, filepath, path_to_clips): """Yields examples.""" data_fields = list(self._info().features.keys()) @@ -708,28 +712,33 @@ def _generate_examples(self, filepath, path_to_clips): data_fields.remove("audio") path_idx = data_fields.index("path") - with open(filepath, encoding="utf-8") as f: - lines = f.readlines() - headline = lines[0] - - column_names = headline.strip().split("\t") - assert ( - column_names == data_fields - ), f"The file should have {data_fields} as column names, but has {column_names}" - - for id_, line in enumerate(lines[1:]): - field_values = line.strip().split("\t") + all_field_values = {} + for path, f in files: + if path == filepath: + lines = f.readlines() + headline = lines[0].decode("utf-8") - # set absolute path for mp3 audio file - field_values[path_idx] = os.path.join(path_to_clips, field_values[path_idx]) + column_names = headline.strip().split("\t") + assert ( + column_names == data_fields + ), f"The file should have {data_fields} as column names, but has {column_names}" + for id_, line in enumerate(lines[1:]): + field_values = line.decode("utf-8").strip().split("\t") + # set full path for mp3 audio file + audio_path = "/".join([path_to_clips, field_values[path_idx]]) + all_field_values[audio_path] = field_values + elif path.startswith(path_to_clips): + assert all_field_values, "Found audio clips before the metadata TSV file." + if path in all_field_values: + field_values = all_field_values[path] - # if data is incomplete, fill with empty values - if len(field_values) < len(data_fields): - field_values += (len(data_fields) - len(field_values)) * ["''"] + # if data is incomplete, fill with empty values + if len(field_values) < len(data_fields): + field_values += (len(data_fields) - len(field_values)) * ["''"] - result = {key: value for key, value in zip(data_fields, field_values)} + result = {key: value for key, value in zip(data_fields, field_values)} - # set audio feature - result["audio"] = field_values[path_idx] + # set audio feature + result["audio"] = {"path": path, "data": f.read()} - yield id_, result + yield id_, result diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index acdec76ddf5..794b8d426b9 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -17,9 +17,6 @@ """Librispeech automatic speech recognition dataset.""" -import glob -import os - import datasets from datasets.tasks import AutomaticSpeechRecognition @@ -93,6 +90,7 @@ def __init__(self, **kwargs): class LibrispeechASR(datasets.GeneratorBasedBuilder): """Librispeech dataset.""" + DEFAULT_WRITER_BATCH_SIZE = 256 BUILDER_CONFIGS = [ LibrispeechASRConfig(name="clean", description="'Clean' speech."), LibrispeechASRConfig(name="other", description="'Other', more challenging, speech."), @@ -104,7 +102,7 @@ def _info(self): features=datasets.Features( { "file": datasets.Value("string"), - "audio": datasets.features.Audio(sampling_rate=16_000), + "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, "text": datasets.Value("string"), "speaker_id": datasets.Value("int64"), "chapter_id": datasets.Value("int64"), @@ -118,41 +116,62 @@ def _info(self): ) def _split_generators(self, dl_manager): - archive_path = dl_manager.download_and_extract(_DL_URLS[self.config.name]) + archive_path = dl_manager.download(_DL_URLS[self.config.name]) if self.config.name == "clean": train_splits = [ - datasets.SplitGenerator(name="train.100", gen_kwargs={"archive_path": archive_path["train.100"]}), - datasets.SplitGenerator(name="train.360", gen_kwargs={"archive_path": archive_path["train.360"]}), + datasets.SplitGenerator( + name="train.100", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.100"])} + ), + datasets.SplitGenerator( + name="train.360", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.360"])} + ), ] elif self.config.name == "other": train_splits = [ - datasets.SplitGenerator(name="train.500", gen_kwargs={"archive_path": archive_path["train.500"]}), + datasets.SplitGenerator( + name="train.500", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.500"])} + ), ] return train_splits + [ - datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"archive_path": archive_path["dev"]}), - datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"]}), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])} + ), ] - def _generate_examples(self, archive_path): + def _generate_examples(self, files): """Generate examples from a LibriSpeech archive_path.""" - transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt") key = 0 - for transcript_path in sorted(glob.glob(transcripts_glob)): - transcript_dir_path = os.path.dirname(transcript_path) - with open(transcript_path, "r", encoding="utf-8") as f: + audio_data = {} + transcripts = [] + for path, f in files: + if path.endswith(".flac"): + id_ = path.split("/")[-1][: -len(".flac")] + audio_data[id_] = f.read() + elif path.endswith(".trans.txt"): for line in f: - line = line.strip() - id_, transcript = line.split(" ", 1) - audio_file = f"{id_}.flac" - speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] - yield key, { - "id": id_, - "speaker_id": speaker_id, - "chapter_id": chapter_id, - "file": os.path.join(transcript_dir_path, audio_file), - "audio": os.path.join(transcript_dir_path, audio_file), - "text": transcript, - } + if line: + line = line.decode("utf-8").strip() + id_, transcript = line.split(" ", 1) + audio_file = f"{id_}.flac" + speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] + transcripts.append( + { + "id": id_, + "speaker_id": speaker_id, + "chapter_id": chapter_id, + "file": audio_file, + "text": transcript, + } + ) + if audio_data and len(audio_data) == len(transcripts): + for transcript in transcripts: + audio = {"path": transcript["file"], "data": audio_data[transcript["id"]]} + yield key, {"audio": audio, **transcript} key += 1 + audio_data = {} + transcripts = [] diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py index 72a08deccf2..9acb8d90d09 100644 --- a/datasets/openslr/openslr.py +++ b/datasets/openslr/openslr.py @@ -112,20 +112,6 @@ ISBN = {979-10-95546-34-4}, } -SLR83 -@inproceedings{demirsahin-etal-2020-open, - title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}}, - author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara}, - booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)}, - month = may, - year = {2020}, - pages = {6532--6541}, - address = {Marseille, France}, - publisher = {European Language Resources Association (ELRA)}, - url = {https://www.aclweb.org/anthology/2020.lrec-1.804}, - ISBN = {979-10-95546-34-4}, -} - SLR80 @inproceedings{oo-etal-2020-burmese, title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application @@ -176,10 +162,10 @@ "Setswana and isiXhosa.", "Files": ["af_za.tar.gz", "st_za.tar.gz", "tn_za.tar.gz", "xh_za.tar.gz"], "IndexFiles": [ - "af_za/za/afr/line_index.tsv", - "st_za/za/sso/line_index.tsv", - "tn_za/za/tsn/line_index.tsv", - "xh_za/za/xho/line_index.tsv", + "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/af_za/line_index.tsv", + "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/st_za/line_index.tsv", + "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/tn_za/line_index.tsv", + "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/xh_za/line_index.tsv", ], "DataDirs": ["af_za/za/afr/wavs", "st_za/za/sso/wavs", "tn_za/za/tsn/wavs", "xh_za/za/xho/wavs"], }, @@ -493,39 +479,6 @@ "IndexFiles": ["line_index.tsv"], "DataDirs": [""], }, - "SLR83": { - "Language": "English", - "LongName": "Crowdsourced high-quality UK and Ireland English Dialect speech data set", - "Category": "Speech", - "Summary": "Data set which contains male and female recordings of English from various dialects of the UK and Ireland", - "Files": [ - "irish_english_male.zip", - "midlands_english_female.zip", - "midlands_english_male.zip", - "northern_english_female.zip", - "northern_english_male.zip", - "scottish_english_female.zip", - "scottish_english_male.zip", - "southern_english_female.zip", - "southern_english_male.zip", - "welsh_english_female.zip", - "welsh_english_male.zip", - ], - "IndexFiles": [ - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - "line_index.csv", - ], - "DataDirs": ["", "", "", "", "", "", "", "", "", "", ""], - }, "SLR86": { "Language": "Yoruba", "LongName": "Crowdsourced high-quality Yoruba speech data set", @@ -565,6 +518,7 @@ def __init__(self, name, **kwargs): class OpenSlr(datasets.GeneratorBasedBuilder): + DEFAULT_WRITER_BATCH_SIZE = 32 BUILDER_CONFIGS = [ OpenSlrConfig( @@ -581,13 +535,22 @@ class OpenSlr(datasets.GeneratorBasedBuilder): ] def _info(self): - features = datasets.Features( - { - "path": datasets.Value("string"), - "audio": datasets.features.Audio(sampling_rate=48_000), - "sentence": datasets.Value("string"), - } - ) + if self.config.name in ["SLR32"]: + features = datasets.Features( + { + "path": datasets.Value("string"), + "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, + "sentence": datasets.Value("string"), + } + ) + else: + features = datasets.Features( + { + "path": datasets.Value("string"), + "audio": datasets.features.Audio(sampling_rate=48_000), + "sentence": datasets.Value("string"), + } + ) return datasets.DatasetInfo( description=_DESCRIPTION, @@ -605,21 +568,28 @@ def _split_generators(self, dl_manager): """Returns SplitGenerators.""" resource_number = self.config.name.replace("SLR", "") urls = [f"{_DATA_URL.format(resource_number)}/{file}" for file in self.config.files] - dl_paths = dl_manager.download_and_extract(urls) - abs_path_to_indexs = [os.path.join(path, f"{self.config.index_files[i]}") for i, path in enumerate(dl_paths)] - abs_path_to_datas = [os.path.join(path, f"{self.config.data_dirs[i]}") for i, path in enumerate(dl_paths)] + if urls[0].endswith(".zip"): + dl_paths = dl_manager.download_and_extract(urls) + path_to_indexs = [os.path.join(path, f"{self.config.index_files[i]}") for i, path in enumerate(dl_paths)] + path_to_datas = [os.path.join(path, f"{self.config.data_dirs[i]}") for i, path in enumerate(dl_paths)] + archives = None + else: + archives = dl_manager.download(urls) + path_to_indexs = dl_manager.download(self.config.index_files) + path_to_datas = self.config.data_dirs return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ - "path_to_indexs": abs_path_to_indexs, - "path_to_datas": abs_path_to_datas, + "path_to_indexs": path_to_indexs, + "path_to_datas": path_to_datas, + "archive_files": [dl_manager.iter_archive(archive) for archive in archives] if archives else None, }, ), ] - def _generate_examples(self, path_to_indexs, path_to_datas): + def _generate_examples(self, path_to_indexs, path_to_datas, archive_files): """Yields examples.""" counter = -1 @@ -640,16 +610,26 @@ def _generate_examples(self, path_to_indexs, path_to_datas): sentence = sentence_index[filename] counter += 1 yield counter, {"path": path, "audio": path, "sentence": sentence} - elif self.config.name in ["SLR83"]: - for i, path_to_index in enumerate(path_to_indexs): + elif self.config.name in ["SLR32"]: # use archives + for path_to_index, path_to_data, files in zip(path_to_indexs, path_to_datas, archive_files): + sentences = {} with open(path_to_index, encoding="utf-8") as f: - lines = f.readlines() - for id_, line in enumerate(lines): - field_values = re.split(r",\s?", line.strip()) - user_id, filename, sentence = field_values - path = os.path.join(path_to_datas[i], f"{filename}.wav") + for line in f: + # Following regexs are needed to normalise the lines, since the datasets + # are not always consistent and have bugs: + line = re.sub(r"\t[^\t]*\t", "\t", line.strip()) + field_values = re.split(r"\t\t?", line) + if len(field_values) != 2: + continue + filename, sentence = field_values + # set absolute path for audio file + path = f"{path_to_data}/{filename}.wav" + sentences[path] = sentence + for path, f in files: + if path.startswith(path_to_data): counter += 1 - yield counter, {"path": path, "audio": path, "sentence": sentence} + audio = {"path": path, "data": f.read()} + yield counter, {"path": path, "audio": audio, "sentence": sentences[path]} else: for i, path_to_index in enumerate(path_to_indexs): with open(path_to_index, encoding="utf-8") as f: diff --git a/datasets/vivos/vivos.py b/datasets/vivos/vivos.py index 4dda623b705..c559ce9c4ae 100644 --- a/datasets/vivos/vivos.py +++ b/datasets/vivos/vivos.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os import datasets @@ -40,6 +39,11 @@ _DATA_URL = "https://ailab.hcmus.edu.vn/assets/vivos.tar.gz" +_PROMPTS_URLS = { + "train": "https://s3.amazonaws.com/datasets.huggingface.co/vivos/train/prompts.txt", + "test": "https://s3.amazonaws.com/datasets.huggingface.co/vivos/test/prompts.txt", +} + class VivosDataset(datasets.GeneratorBasedBuilder): """VIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for @@ -63,7 +67,7 @@ def _info(self): { "speaker_id": datasets.Value("string"), "path": datasets.Value("string"), - "audio": datasets.features.Audio(sampling_rate=16_000), + "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, "sentence": datasets.Value("string"), } ), @@ -80,46 +84,57 @@ def _split_generators(self, dl_manager): # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive - dl_path = dl_manager.download_and_extract(_DATA_URL) - data_dir = os.path.join(dl_path, "vivos") - train_dir = os.path.join(data_dir, "train") - test_dir = os.path.join(data_dir, "test") + prompts_paths = dl_manager.download(_PROMPTS_URLS) + archive = dl_manager.download(_DATA_URL) + train_dir = "vivos/train" + test_dir = "vivos/test" return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": os.path.join(train_dir, "prompts.txt"), - "path_to_clips": os.path.join(train_dir, "waves"), + "prompts_path": prompts_paths["train"], + "path_to_clips": train_dir + "/waves", + "audio_files": dl_manager.iter_archive(archive), }, ), datasets.SplitGenerator( name=datasets.Split.TEST, # These kwargs will be passed to _generate_examples gen_kwargs={ - "filepath": os.path.join(test_dir, "prompts.txt"), - "path_to_clips": os.path.join(test_dir, "waves"), + "prompts_path": prompts_paths["test"], + "path_to_clips": test_dir + "/waves", + "audio_files": dl_manager.iter_archive(archive), }, ), ] - def _generate_examples( - self, - filepath, - path_to_clips, # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` - ): + def _generate_examples(self, prompts_path, path_to_clips, audio_files): """Yields examples as (key, example) tuples.""" + # TODO(QL): use Audio featrue with data bytes instead of string path + raise Exception("TODO(QL): use Audio featrue with data bytes instead of string path") # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. # The `key` is here for legacy reason (tfds) and is not important in itself. - - with open(filepath, encoding="utf-8") as f: - for id_, row in enumerate(f): + examples = {} + with open(prompts_path, encoding="utf-8") as f: + for row in f: data = row.strip().split(" ", 1) speaker_id = data[0].split("_")[0] - yield id_, { + audio_path = "/".join([path_to_clips, speaker_id, data[0] + ".wav"]) + examples[audio_path] = { "speaker_id": speaker_id, - "path": os.path.join(path_to_clips, speaker_id, data[0] + ".wav"), - "audio": os.path.join(path_to_clips, speaker_id, data[0] + ".wav"), + "path": audio_path, "sentence": data[1], } + inside_clips_dir = False + id_ = 0 + for path, f in audio_files: + if path.startswith(path_to_clips): + inside_clips_dir = True + if path in examples: + audio = {"path": path, "data": f.read()} + yield id_, {**examples[path], "audio": audio} + id_ += 1 + elif inside_clips_dir: + break From f224b68275e2baaa9601d37f2bb17f527878ee35 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 8 Nov 2021 16:09:59 +0100 Subject: [PATCH 08/42] Remove archived attribute from test audio with TAR archive --- tests/features/test_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index f196e59763b..b11884baf98 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -121,7 +121,7 @@ def test_dataset_with_audio_feature_tar(tar_wav_path): for file_path, file_obj in iter_archive(tar_wav_path): data["audio"].append({"path": file_path, "bytes": file_obj.read()}) break - features = Features({"audio": Audio(archived=True)}) + features = Features({"audio": Audio()}) dset = Dataset.from_dict(data, features=features) item = dset[0] assert item.keys() == {"audio"} From ebb1a1ca67b7b6304c1f67fec40373f28d277315 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 8 Nov 2021 19:35:28 +0100 Subject: [PATCH 09/42] Remove archived attribute from Audio feature --- src/datasets/features/audio.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 64dcaeaa819..7fd45ad487b 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -10,21 +10,21 @@ class Audio: """Audio Feature to extract audio data from an audio file. + Input: The Audio feature accepts as input: + - A :obj:`str`: Absolute path to the audio file (i.e. random access is allowed). + - A :obj:`dict` with the keys: + - path: String with relative path of the audio file to the archive file. + - bytes: Bytes of the audio file. + This is useful for archived files with sequential access. + Args: sampling_rate (:obj:`int`, optional): Target sampling rate. If `None`, the native sampling rate is used. mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across channels. - archived (:obj:`bool`, default ``False``): Whether the source data is archived with sequential access. - - - If non-archived with sequential access (i.e. random access is allowed), the cache will only store the - absolute path to the audio file. - - If archived with sequential access, the cache will store the relative path of the audio file to the - archive file and the bytes of the audio file. """ sampling_rate: Optional[int] = None mono: bool = True - archived: bool = False id: Optional[str] = None # Automatically constructed dtype: ClassVar[str] = "dict" @@ -32,19 +32,20 @@ class Audio: _type: str = field(default="Audio", init=False, repr=False) def __call__(self): - return pa.string() if not self.archived else pa.struct({"path": pa.string(), "bytes": pa.binary()}) + return pa.struct({"path": pa.string(), "bytes": pa.binary()}) def decode_example(self, value): """Decode example audio file into audio data. Args: - value: Either absolute audio file path (when ``archived=False``) or a dict with relative audio file path - and the bytes of the audio file. + value (:obj:`dict`): Dictionary with keys: + - path: String with absolute or relative audio file path. + - bytes: Optionally, the bytes of the audio file. Returns: dict """ - if self.archived: + if value["bytes"]: path, file = value["path"], BytesIO(value["bytes"]) array, sampling_rate = ( self._decode_example_with_torchaudio(file) @@ -52,7 +53,7 @@ def decode_example(self, value): else self._decode_example_with_soundfile(file) ) else: - path = value + path = value["path"] array, sampling_rate = ( self._decode_example_with_torchaudio(path) if path.endswith(".mp3") From 1cc27a0477079dce9bac23fc0ab915500c76643b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 8 Nov 2021 19:40:30 +0100 Subject: [PATCH 10/42] Implement Audio.encode_example --- src/datasets/features/audio.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 7fd45ad487b..a79bb099f70 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -34,6 +34,19 @@ class Audio: def __call__(self): return pa.struct({"path": pa.string(), "bytes": pa.binary()}) + def encode_example(self, value): + """Encode example into a format for Arrow. + + Args: + value (:obj:`str` or :obj:`dict`): Data passed as input to Audio feature. + + Returns: + :obj:`dict` + """ + if isinstance(value, str): + return {"path": value} + return value + def decode_example(self, value): """Decode example audio file into audio data. From 4579b76516b2f28856ac819b100e2b8984d5e491 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 8 Nov 2021 19:41:15 +0100 Subject: [PATCH 11/42] Call Audio.encode_example from encode_nested_example --- src/datasets/features/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 3d33f558b9b..89c913bd44a 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -851,7 +851,7 @@ def encode_nested_example(schema, obj): return list(obj) # Object with special encoding: # ClassLabel will convert from string to int, TranslationVariableLanguages does some checks - elif isinstance(schema, (ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)): + elif isinstance(schema, (Audio, ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)): return schema.encode_example(obj) # Other object should be directly convertible to a native Arrow type (like Translation and Translation) return obj From 0d2a3d84c4aa576ebc7ed1a5bd510ca813ead33a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 8 Nov 2021 20:17:29 +0100 Subject: [PATCH 12/42] Fix docs --- src/datasets/features/audio.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index a79bb099f70..3081db10237 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -13,6 +13,7 @@ class Audio: Input: The Audio feature accepts as input: - A :obj:`str`: Absolute path to the audio file (i.e. random access is allowed). - A :obj:`dict` with the keys: + - path: String with relative path of the audio file to the archive file. - bytes: Bytes of the audio file. This is useful for archived files with sequential access. @@ -52,6 +53,7 @@ def decode_example(self, value): Args: value (:obj:`dict`): Dictionary with keys: + - path: String with absolute or relative audio file path. - bytes: Optionally, the bytes of the audio file. From 3d35adac5162d0b3679bfa317023fb27c3f41897 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 8 Nov 2021 20:23:47 +0100 Subject: [PATCH 13/42] Enhance Audio.decode_example to accept a string --- src/datasets/features/audio.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 3081db10237..1bac3f33ac1 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -52,7 +52,8 @@ def decode_example(self, value): """Decode example audio file into audio data. Args: - value (:obj:`dict`): Dictionary with keys: + value (obj:`str` or :obj:`dict`): Either a string with the absolute audio file path or a dictionary with + keys: - path: String with absolute or relative audio file path. - bytes: Optionally, the bytes of the audio file. @@ -60,6 +61,8 @@ def decode_example(self, value): Returns: dict """ + if isinstance(value, str): + value = {"path": value, "bytes": None} if value["bytes"]: path, file = value["path"], BytesIO(value["bytes"]) array, sampling_rate = ( From ec5f7b0d9d6bd94464953f874598eacd7866b6b1 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 9 Nov 2021 18:04:50 +0100 Subject: [PATCH 14/42] Fix docs --- src/datasets/features/audio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 1bac3f33ac1..10df37e03ea 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -16,6 +16,7 @@ class Audio: - path: String with relative path of the audio file to the archive file. - bytes: Bytes of the audio file. + This is useful for archived files with sequential access. Args: From 21488c01b3273ab5cf03250ac3d67805337657e0 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 11:27:43 +0100 Subject: [PATCH 15/42] Implement private Audio._storage_dtype to specify cached dtype --- src/datasets/features/audio.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 10df37e03ea..fa0a3bb8529 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -27,6 +27,7 @@ class Audio: sampling_rate: Optional[int] = None mono: bool = True + _storage_dtype: str = "struct" id: Optional[str] = None # Automatically constructed dtype: ClassVar[str] = "dict" @@ -34,7 +35,9 @@ class Audio: _type: str = field(default="Audio", init=False, repr=False) def __call__(self): - return pa.struct({"path": pa.string(), "bytes": pa.binary()}) + return ( + pa.struct({"path": pa.string(), "bytes": pa.binary()}) if self._storage_dtype == "struct" else pa.string() + ) def encode_example(self, value): """Encode example into a format for Arrow. From 83f04cdc69763a9ebce6fdd4628e5926d9a75d31 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 11:28:56 +0100 Subject: [PATCH 16/42] Change Audio._storage_dtype dynamically when encoding a string --- src/datasets/features/audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index fa0a3bb8529..01302e80db3 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -49,7 +49,7 @@ def encode_example(self, value): :obj:`dict` """ if isinstance(value, str): - return {"path": value} + self._storage_dtype = "string" return value def decode_example(self, value): From 7a3f066a89daa75430f4661671c5d63cf34dd321 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 11:29:42 +0100 Subject: [PATCH 17/42] Update test of Audio instantiation --- tests/features/test_audio.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index b11884baf98..c4139e7d7e5 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -4,6 +4,7 @@ from ctypes.util import find_library from importlib.util import find_spec +import pyarrow as pa import pytest from datasets import Dataset, load_dataset @@ -47,10 +48,13 @@ def iter_archive(archive_path): def test_audio_instantiation(): audio = Audio() + assert audio.sampling_rate is None + assert audio.mono is True assert audio.id is None assert audio.dtype == "dict" assert audio.pa_type is None assert audio._type == "Audio" + assert audio._storage_dtype == "struct" @require_sndfile From ece5b97d089e5868aa1affa38b9c471725bcda0f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 11:37:06 +0100 Subject: [PATCH 18/42] Set ArrowWriter.schema property dynamically calculated from features --- src/datasets/arrow_writer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 79d1926cc97..2870cd6f19b 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -201,7 +201,7 @@ def __init__( raise ValueError("At least one of path and stream must be provided.") if features is not None: self._features = features - self._schema = pa.schema(features.type) + self._schema = None elif schema is not None: self._schema: pa.Schema = schema self._features = Features.from_arrow_schema(self._schema) @@ -216,9 +216,7 @@ def __init__( self._hasher = KeyHasher("") self._check_duplicates = check_duplicates - - if disable_nullable and self._schema is not None: - self._schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in self._schema) + self._disable_nullable = disable_nullable self._path = path if stream is None: @@ -287,7 +285,14 @@ def _build_writer(self, inferred_schema: pa.Schema): @property def schema(self): - return self._schema if self._schema is not None else [] + _schema = ( + self._schema + if self._schema is not None + else (pa.schema(self._features.type) if self._features is not None else None) + ) + if self._disable_nullable and _schema is not None: + _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema) + return _schema if _schema is not None else [] @staticmethod def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> Dict[str, str]: From 38c80cc21374f75478050a3631eb4898269d643a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 11:40:19 +0100 Subject: [PATCH 19/42] Update ArrowWriter.write_examples_on_file --- src/datasets/arrow_writer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 2870cd6f19b..a5d6d479ca9 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -311,14 +311,14 @@ def write_examples_on_file(self): # Since current_examples contains (example, key) tuples cols = ( - [col for col in self._schema.names if col in self.current_examples[0][0]] - + [col for col in self.current_examples[0][0].keys() if col not in self._schema.names] - if self._schema + [col for col in self.schema.names if col in self.current_examples[0][0]] + + [col for col in self.current_examples[0][0].keys() if col not in self.schema.names] + if self.schema else self.current_examples[0][0].keys() ) - schema = None if self.pa_writer is None and self.update_features else self._schema - try_schema = self._schema if self.pa_writer is None and self.update_features else None + schema = None if self.pa_writer is None and self.update_features else self.schema + try_schema = self.schema if self.pa_writer is None and self.update_features else None arrays = [] inferred_types = [] for col in cols: @@ -340,7 +340,7 @@ def write_examples_on_file(self): ) arrays.append(pa_array) inferred_types.append(inferred_type) - schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self._schema + schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self.schema table = pa.Table.from_arrays(arrays, schema=schema) self.write_table(table) self.current_examples = [] From 7787985492ae8d2657793ddaabf003d75672ddae Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 11:39:24 +0100 Subject: [PATCH 20/42] Update ArrowWriter._build_writer --- src/datasets/arrow_writer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index a5d6d479ca9..4014782e28b 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -261,6 +261,7 @@ def close(self): self.stream.close() # This also closes self.pa_writer if it is opened def _build_writer(self, inferred_schema: pa.Schema): + schema = self.schema inferred_features = Features.from_arrow_schema(inferred_schema) if self._features is not None: if self.update_features: # keep original features it they match, or update them @@ -271,17 +272,16 @@ def _build_writer(self, inferred_schema: pa.Schema): if inferred_field == fields[name]: inferred_features[name] = self._features[name] self._features = inferred_features - self._schema: pa.Schema = inferred_schema + schema: pa.Schema = inferred_schema else: self._features = inferred_features - self._schema: pa.Schema = inferred_schema + schema: pa.Schema = inferred_schema if self.disable_nullable: - self._schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in self._schema) + schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema) if self.with_metadata: - self._schema = self._schema.with_metadata( - self._build_metadata(DatasetInfo(features=self._features), self.fingerprint) - ) - self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema) + schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=self._features), self.fingerprint)) + self._schema = schema + self.pa_writer = pa.RecordBatchStreamWriter(self.stream, schema) @property def schema(self): From 090723e380be2becf61b5531aba811fba68c2923 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 11:45:34 +0100 Subject: [PATCH 21/42] Fix code quality --- tests/features/test_audio.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index c4139e7d7e5..8bc08537731 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -4,7 +4,6 @@ from ctypes.util import find_library from importlib.util import find_spec -import pyarrow as pa import pytest from datasets import Dataset, load_dataset From 7f587775bada438140b741ec4d9909e835077a61 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 14:50:40 +0100 Subject: [PATCH 22/42] Replace _schema with schema and condition on schema in ArrowWriter --- src/datasets/arrow_writer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py index 4014782e28b..e042b26a757 100644 --- a/src/datasets/arrow_writer.py +++ b/src/datasets/arrow_writer.py @@ -322,7 +322,7 @@ def write_examples_on_file(self): arrays = [] inferred_types = [] for col in cols: - col_type = schema.field(col).type if schema is not None else None + col_type = schema.field(col).type if schema else None col_try_type = try_schema.field(col).type if try_schema is not None and col in try_schema.names else None typed_sequence = OptimizedTypedSequence( [row[0][col] for row in self.current_examples], type=col_type, try_type=col_try_type, col=col @@ -421,11 +421,11 @@ def write_batch( """ if batch_examples and len(next(iter(batch_examples.values()))) == 0: return - schema = None if self.pa_writer is None and self.update_features else self._schema - try_schema = self._schema if self.pa_writer is None and self.update_features else None + schema = None if self.pa_writer is None and self.update_features else self.schema + try_schema = self.schema if self.pa_writer is None and self.update_features else None typed_sequence_examples = {} for col in sorted(batch_examples.keys()): - col_type = schema.field(col).type if schema is not None else None + col_type = schema.field(col).type if schema else None col_try_type = try_schema.field(col).type if try_schema is not None and col in try_schema.names else None typed_sequence = OptimizedTypedSequence(batch_examples[col], type=col_type, try_type=col_try_type, col=col) typed_sequence_examples[col] = typed_sequence @@ -460,8 +460,8 @@ def finalize(self, close_stream=True): self.hkey_record = [] self.write_examples_on_file() if self.pa_writer is None: - if self._schema is not None: - self._build_writer(self._schema) + if self.schema: + self._build_writer(self.schema) else: raise ValueError("Please pass `features` or at least one example when writing data") self.pa_writer.close() From 583be77b7b645698d078834e3d7da9869221c347 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 15:46:47 +0100 Subject: [PATCH 23/42] Add test for MP3 TAR audio file --- tests/features/test_audio.py | 42 +++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 8bc08537731..1dbaae13e9a 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -37,6 +37,15 @@ def tar_wav_path(shared_datadir, tmp_path_factory): return path +@pytest.fixture() +def tar_mp3_path(shared_datadir, tmp_path_factory): + audio_path = str(shared_datadir / "test_audio_44100.mp3") + path = tmp_path_factory.mktemp("data") / "audio_data.mp3.tar" + with tarfile.TarFile(path, "w") as f: + f.add(audio_path, arcname=os.path.basename(audio_path)) + return path + + def iter_archive(archive_path): with tarfile.open(archive_path) as tar: for tarinfo in tar: @@ -118,7 +127,7 @@ def test_dataset_with_audio_feature(shared_datadir): @require_sndfile -def test_dataset_with_audio_feature_tar(tar_wav_path): +def test_dataset_with_audio_feature_tar_wav(tar_wav_path): audio_filename = "test_audio_44100.wav" data = {"audio": []} for file_path, file_obj in iter_archive(tar_wav_path): @@ -147,6 +156,37 @@ def test_dataset_with_audio_feature_tar(tar_wav_path): assert column[0]["sampling_rate"] == 44100 +@require_sox +@require_torchaudio +def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): + audio_filename = "test_audio_44100.wav" + data = {"audio": []} + for file_path, file_obj in iter_archive(tar_mp3_path): + data["audio"].append({"path": file_path, "bytes": file_obj.read()}) + break + features = Features({"audio": Audio()}) + dset = Dataset.from_dict(data, features=features) + item = dset[0] + assert item.keys() == {"audio"} + assert item["audio"].keys() == {"path", "array", "sampling_rate"} + assert item["audio"]["path"] == audio_filename + assert item["audio"]["array"].shape == (109440,) + assert item["audio"]["sampling_rate"] == 44100 + batch = dset[:1] + assert batch.keys() == {"audio"} + assert len(batch["audio"]) == 1 + assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"} + assert batch["audio"][0]["path"] == audio_filename + assert batch["audio"][0]["array"].shape == (109440,) + assert batch["audio"][0]["sampling_rate"] == 44100 + column = dset["audio"] + assert len(column) == 1 + assert column[0].keys() == {"path", "array", "sampling_rate"} + assert column[0]["path"] == audio_filename + assert column[0]["array"].shape == (109440,) + assert column[0]["sampling_rate"] == 44100 + + @require_sndfile def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") From 8dbe0d777ba5f8ffe47df844875fb6f934621712 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 15:49:02 +0100 Subject: [PATCH 24/42] Refactor Audio decode_example --- src/datasets/features/audio.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 01302e80db3..63d54cdbd4d 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -67,20 +67,20 @@ def decode_example(self, value): """ if isinstance(value, str): value = {"path": value, "bytes": None} - if value["bytes"]: - path, file = value["path"], BytesIO(value["bytes"]) - array, sampling_rate = ( - self._decode_example_with_torchaudio(file) - if path.endswith(".mp3") - else self._decode_example_with_soundfile(file) - ) + if value["path"].endswith("mp3"): + if value["bytes"]: + path, file = value["path"], BytesIO(value["bytes"]) + array, sampling_rate = self._decode_example_with_torchaudio(file) + else: + path = value["path"] + array, sampling_rate = self._decode_example_with_torchaudio(path) else: - path = value["path"] - array, sampling_rate = ( - self._decode_example_with_torchaudio(path) - if path.endswith(".mp3") - else self._decode_example_with_librosa(path) - ) + if value["bytes"]: + path, file = value["path"], BytesIO(value["bytes"]) + array, sampling_rate = self._decode_example_with_soundfile(file) + else: + path = value["path"] + array, sampling_rate = self._decode_example_with_librosa(path) return {"path": path, "array": array, "sampling_rate": sampling_rate} def _decode_example_with_librosa(self, value): From c9732091c537e46fa1d00d5039463a6f3a2fb7ea Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 16:04:20 +0100 Subject: [PATCH 25/42] Pass raw bytes to torchaudio.load --- src/datasets/features/audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 63d54cdbd4d..14b7c674fde 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -69,7 +69,7 @@ def decode_example(self, value): value = {"path": value, "bytes": None} if value["path"].endswith("mp3"): if value["bytes"]: - path, file = value["path"], BytesIO(value["bytes"]) + path, file = value["path"], value["bytes"] array, sampling_rate = self._decode_example_with_torchaudio(file) else: path = value["path"] From 7363e9ab922b93e4204893cf753e436843ffc5e6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 10 Nov 2021 18:39:01 +0100 Subject: [PATCH 26/42] Revert "Pass raw bytes to torchaudio.load" This reverts commit c9732091c537e46fa1d00d5039463a6f3a2fb7ea. --- src/datasets/features/audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 14b7c674fde..63d54cdbd4d 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -69,7 +69,7 @@ def decode_example(self, value): value = {"path": value, "bytes": None} if value["path"].endswith("mp3"): if value["bytes"]: - path, file = value["path"], value["bytes"] + path, file = value["path"], BytesIO(value["bytes"]) array, sampling_rate = self._decode_example_with_torchaudio(file) else: path = value["path"] From 9f61ab8529a1078bb76451421db1e3d5154198fc Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 15 Nov 2021 21:27:43 +0100 Subject: [PATCH 27/42] Pass format to load in _decode_example_with_torchaudio --- src/datasets/features/audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 63d54cdbd4d..77a425b1d4b 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -120,7 +120,7 @@ def _decode_example_with_torchaudio(self, value): except RuntimeError as err: raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err - array, sampling_rate = torchaudio.load(value) + array, sampling_rate = torchaudio.load(value, format="mp3") if self.sampling_rate and self.sampling_rate != sampling_rate: if not hasattr(self, "_resampler"): self._resampler = T.Resample(sampling_rate, self.sampling_rate) From efa4c2575d2e1595abb69eead9348ac53e24d6f6 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 15 Nov 2021 21:44:32 +0100 Subject: [PATCH 28/42] Fix filename extension in test --- tests/features/test_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 1dbaae13e9a..aacd42ee353 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -159,7 +159,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path): @require_sox @require_torchaudio def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path): - audio_filename = "test_audio_44100.wav" + audio_filename = "test_audio_44100.mp3" data = {"audio": []} for file_path, file_obj in iter_archive(tar_mp3_path): data["audio"].append({"path": file_path, "bytes": file_obj.read()}) From 659fb786e2e5c50419845d7b17b4c75b9718c270 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 13:58:37 +0100 Subject: [PATCH 29/42] Fix Audio tests CI --- .github/workflows/test-audio.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml index fa9764145c7..addf9aa1035 100644 --- a/.github/workflows/test-audio.yml +++ b/.github/workflows/test-audio.yml @@ -12,15 +12,15 @@ jobs: test: runs-on: ubuntu-latest steps: + - name: Install OS dependencies + run: | + sudo apt-get update + sudo apt-get install libsndfile1 sox - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v2 with: python-version: "3.6" - - name: Install OS dependencies - run: | - sudo apt-get update - sudo apt-get install libsndfile1 sox - name: Install dependencies run: | python -m pip install --upgrade pip From 2fc997a75ad22812ac3ad99479aea3e081e61977 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 14:10:40 +0100 Subject: [PATCH 30/42] Fix Audio tests CI --- .github/workflows/test-audio.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml index addf9aa1035..4239f21f649 100644 --- a/.github/workflows/test-audio.yml +++ b/.github/workflows/test-audio.yml @@ -17,6 +17,8 @@ jobs: sudo apt-get update sudo apt-get install libsndfile1 sox - uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v2 with: From 416d1bf194b213542b6895c2e101ec1b3c92ec4b Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 14:15:00 +0100 Subject: [PATCH 31/42] Fix audio test CI by checking out PR HEAD commit instead of merge commit --- .github/workflows/test-audio.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml index 4239f21f649..68e0b8f0b3b 100644 --- a/.github/workflows/test-audio.yml +++ b/.github/workflows/test-audio.yml @@ -1,9 +1,6 @@ name: Test audio on: - push: - branches: - - master pull_request: branches: - master @@ -18,7 +15,7 @@ jobs: sudo apt-get install libsndfile1 sox - uses: actions/checkout@v2 with: - fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python uses: actions/setup-python@v2 with: From 5f162406139733e8bf6ffc9b6a26266da8688d8a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 14:40:26 +0100 Subject: [PATCH 32/42] Change default Audio storage dtype to string --- src/datasets/features/audio.py | 6 +++--- tests/features/test_audio.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index f7762803ef7..51242733b81 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -29,7 +29,7 @@ class Audio: sampling_rate: Optional[int] = None mono: bool = True - _storage_dtype: str = "struct" + _storage_dtype: str = "string" id: Optional[str] = None # Automatically constructed dtype: ClassVar[str] = "dict" @@ -50,8 +50,8 @@ def encode_example(self, value): Returns: :obj:`dict` """ - if isinstance(value, str): - self._storage_dtype = "string" + if isinstance(value, dict): + self._storage_dtype = "struct" return value def decode_example(self, value): diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index 2360713f005..69eb63579d3 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -62,7 +62,7 @@ def test_audio_instantiation(): assert audio.dtype == "dict" assert audio.pa_type is None assert audio._type == "Audio" - assert audio._storage_dtype == "struct" + assert audio._storage_dtype == "string" @require_sndfile From 488b74abb8e1ca72fdd0160a9e23ffc7ed5cd466 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 14:43:14 +0100 Subject: [PATCH 33/42] Rename Audio decode functions --- src/datasets/features/audio.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index 51242733b81..b39833bc3c2 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -72,20 +72,20 @@ def decode_example(self, value): if value["path"].endswith("mp3"): if value["bytes"]: path, file = value["path"], BytesIO(value["bytes"]) - array, sampling_rate = self._decode_example_with_torchaudio(file) + array, sampling_rate = self._decode_mp3(file) else: path = value["path"] - array, sampling_rate = self._decode_example_with_torchaudio(path) + array, sampling_rate = self._decode_mp3(path) else: if value["bytes"]: path, file = value["path"], BytesIO(value["bytes"]) - array, sampling_rate = self._decode_example_with_soundfile(file) + array, sampling_rate = self._decode_non_mp3_file_like(file) else: path = value["path"] - array, sampling_rate = self._decode_example_with_librosa(path) + array, sampling_rate = self._decode_non_mp3_path_like(path) return {"path": path, "array": array, "sampling_rate": sampling_rate} - def _decode_example_with_librosa(self, value): + def _decode_non_mp3_path_like(self, value): try: import librosa except ImportError as err: @@ -95,7 +95,7 @@ def _decode_example_with_librosa(self, value): array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono) return array, sampling_rate - def _decode_example_with_soundfile(self, file): + def _decode_non_mp3_file_like(self, file): try: import librosa import soundfile as sf @@ -111,7 +111,7 @@ def _decode_example_with_soundfile(self, file): sampling_rate = self.sampling_rate return array, sampling_rate - def _decode_example_with_torchaudio(self, value): + def _decode_mp3(self, value): try: import torchaudio import torchaudio.transforms as T From 0ae5d44965dfc0eba084b8048a30755a9d189061 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 14:47:55 +0100 Subject: [PATCH 34/42] Refactor Audio decode_example --- src/datasets/features/audio.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index b39833bc3c2..f0e39fec4ee 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -67,21 +67,16 @@ def decode_example(self, value): Returns: dict """ - if isinstance(value, str): - value = {"path": value, "bytes": None} - if value["path"].endswith("mp3"): - if value["bytes"]: - path, file = value["path"], BytesIO(value["bytes"]) + path, file = (value["path"], BytesIO(value["bytes"])) if isinstance(value, dict) else (value, None) + if path.endswith("mp3"): + if file: array, sampling_rate = self._decode_mp3(file) else: - path = value["path"] array, sampling_rate = self._decode_mp3(path) else: - if value["bytes"]: - path, file = value["path"], BytesIO(value["bytes"]) + if file: array, sampling_rate = self._decode_non_mp3_file_like(file) else: - path = value["path"] array, sampling_rate = self._decode_non_mp3_path_like(path) return {"path": path, "array": array, "sampling_rate": sampling_rate} From 4679d8ed615a29bf367b4940f6b0ed07dac61052 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 15:38:54 +0100 Subject: [PATCH 35/42] Force CI re-run From e178cc71a966175866eab1b08315f680f5825971 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 16:02:59 +0100 Subject: [PATCH 36/42] Refactor and rename --- src/datasets/features/audio.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index f0e39fec4ee..f9812b4c306 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -69,10 +69,7 @@ def decode_example(self, value): """ path, file = (value["path"], BytesIO(value["bytes"])) if isinstance(value, dict) else (value, None) if path.endswith("mp3"): - if file: - array, sampling_rate = self._decode_mp3(file) - else: - array, sampling_rate = self._decode_mp3(path) + array, sampling_rate = self._decode_mp3(file if file else path) else: if file: array, sampling_rate = self._decode_non_mp3_file_like(file) @@ -80,13 +77,13 @@ def decode_example(self, value): array, sampling_rate = self._decode_non_mp3_path_like(path) return {"path": path, "array": array, "sampling_rate": sampling_rate} - def _decode_non_mp3_path_like(self, value): + def _decode_non_mp3_path_like(self, path): try: import librosa except ImportError as err: raise ImportError("To support decoding audio files, please install 'librosa'.") from err - with xopen(value, "rb") as f: + with xopen(path, "rb") as f: array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono) return array, sampling_rate @@ -106,7 +103,7 @@ def _decode_non_mp3_file_like(self, file): sampling_rate = self.sampling_rate return array, sampling_rate - def _decode_mp3(self, value): + def _decode_mp3(self, path_or_file): try: import torchaudio import torchaudio.transforms as T @@ -117,7 +114,7 @@ def _decode_mp3(self, value): except RuntimeError as err: raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err - array, sampling_rate = torchaudio.load(value, format="mp3") + array, sampling_rate = torchaudio.load(path_or_file, format="mp3") if self.sampling_rate and self.sampling_rate != sampling_rate: if not hasattr(self, "_resampler"): self._resampler = T.Resample(sampling_rate, self.sampling_rate) From 4c4a6873f6bf347a998d99849feaa80048203dce Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 16 Nov 2021 16:04:20 +0100 Subject: [PATCH 37/42] Fix docstring --- src/datasets/features/audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index f9812b4c306..9be32c1682d 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -48,7 +48,7 @@ def encode_example(self, value): value (:obj:`str` or :obj:`dict`): Data passed as input to Audio feature. Returns: - :obj:`dict` + :obj:`str` or :obj:`dict` """ if isinstance(value, dict): self._storage_dtype = "struct" From adbcc25dde5583a9aeb1e5e3792d887a2a3677fd Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 17 Nov 2021 18:23:07 +0100 Subject: [PATCH 38/42] put back the Audio feature --- datasets/common_voice/common_voice.py | 4 ++-- datasets/librispeech_asr/librispeech_asr.py | 4 ++-- datasets/openslr/openslr.py | 23 +++++++-------------- datasets/vivos/vivos.py | 6 ++---- 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py index 32232058a77..7ceab649186 100644 --- a/datasets/common_voice/common_voice.py +++ b/datasets/common_voice/common_voice.py @@ -631,7 +631,7 @@ def _info(self): { "client_id": datasets.Value("string"), "path": datasets.Value("string"), - "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, + "audio": datasets.features.Audio(sampling_rate=48_000), "sentence": datasets.Value("string"), "up_votes": datasets.Value("int64"), "down_votes": datasets.Value("int64"), @@ -739,6 +739,6 @@ def _generate_examples(self, files, filepath, path_to_clips): result = {key: value for key, value in zip(data_fields, field_values)} # set audio feature - result["audio"] = {"path": path, "data": f.read()} + result["audio"] = {"path": path, "bytes": f.read()} yield id_, result diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index 794b8d426b9..86bbee658c2 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -102,7 +102,7 @@ def _info(self): features=datasets.Features( { "file": datasets.Value("string"), - "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, + "audio": datasets.features.Audio(sampling_rate=16_000), "text": datasets.Value("string"), "speaker_id": datasets.Value("int64"), "chapter_id": datasets.Value("int64"), @@ -170,7 +170,7 @@ def _generate_examples(self, files): ) if audio_data and len(audio_data) == len(transcripts): for transcript in transcripts: - audio = {"path": transcript["file"], "data": audio_data[transcript["id"]]} + audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]} yield key, {"audio": audio, **transcript} key += 1 audio_data = {} diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py index 9acb8d90d09..8389d25289a 100644 --- a/datasets/openslr/openslr.py +++ b/datasets/openslr/openslr.py @@ -535,22 +535,13 @@ class OpenSlr(datasets.GeneratorBasedBuilder): ] def _info(self): - if self.config.name in ["SLR32"]: - features = datasets.Features( - { - "path": datasets.Value("string"), - "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, - "sentence": datasets.Value("string"), - } - ) - else: - features = datasets.Features( - { - "path": datasets.Value("string"), - "audio": datasets.features.Audio(sampling_rate=48_000), - "sentence": datasets.Value("string"), - } - ) + features = datasets.Features( + { + "path": datasets.Value("string"), + "audio": datasets.features.Audio(sampling_rate=48_000), + "sentence": datasets.Value("string"), + } + ) return datasets.DatasetInfo( description=_DESCRIPTION, diff --git a/datasets/vivos/vivos.py b/datasets/vivos/vivos.py index c559ce9c4ae..0e596402b77 100644 --- a/datasets/vivos/vivos.py +++ b/datasets/vivos/vivos.py @@ -67,7 +67,7 @@ def _info(self): { "speaker_id": datasets.Value("string"), "path": datasets.Value("string"), - "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")}, + "audio": datasets.features.Audio(sampling_rate=16_000), "sentence": datasets.Value("string"), } ), @@ -112,8 +112,6 @@ def _split_generators(self, dl_manager): def _generate_examples(self, prompts_path, path_to_clips, audio_files): """Yields examples as (key, example) tuples.""" - # TODO(QL): use Audio featrue with data bytes instead of string path - raise Exception("TODO(QL): use Audio featrue with data bytes instead of string path") # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. # The `key` is here for legacy reason (tfds) and is not important in itself. examples = {} @@ -133,7 +131,7 @@ def _generate_examples(self, prompts_path, path_to_clips, audio_files): if path.startswith(path_to_clips): inside_clips_dir = True if path in examples: - audio = {"path": path, "data": f.read()} + audio = {"path": path, "bytes": f.read()} yield id_, {**examples[path], "audio": audio} id_ += 1 elif inside_clips_dir: From 25f18069b930984f80630f2a7bab24c7be78da31 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 17 Nov 2021 18:40:09 +0100 Subject: [PATCH 39/42] fix openslr --- datasets/openslr/openslr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py index 8389d25289a..c76375b22f3 100644 --- a/datasets/openslr/openslr.py +++ b/datasets/openslr/openslr.py @@ -619,7 +619,7 @@ def _generate_examples(self, path_to_indexs, path_to_datas, archive_files): for path, f in files: if path.startswith(path_to_data): counter += 1 - audio = {"path": path, "data": f.read()} + audio = {"path": path, "bytes": f.read()} yield counter, {"path": path, "audio": audio, "sentence": sentences[path]} else: for i, path_to_index in enumerate(path_to_indexs): From 7f674776505b6dabad8faac4abe7a4a781653e7d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 19 Nov 2021 09:56:23 +0000 Subject: [PATCH 40/42] fix common_voice --- datasets/common_voice/common_voice.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py index 7ceab649186..b11a09a759e 100644 --- a/datasets/common_voice/common_voice.py +++ b/datasets/common_voice/common_voice.py @@ -713,8 +713,10 @@ def _generate_examples(self, files, filepath, path_to_clips): path_idx = data_fields.index("path") all_field_values = {} + metadata_found = False for path, f in files: if path == filepath: + metadata_found = True lines = f.readlines() headline = lines[0].decode("utf-8") @@ -722,13 +724,15 @@ def _generate_examples(self, files, filepath, path_to_clips): assert ( column_names == data_fields ), f"The file should have {data_fields} as column names, but has {column_names}" - for id_, line in enumerate(lines[1:]): + for line in lines[1:]: field_values = line.decode("utf-8").strip().split("\t") # set full path for mp3 audio file audio_path = "/".join([path_to_clips, field_values[path_idx]]) all_field_values[audio_path] = field_values elif path.startswith(path_to_clips): - assert all_field_values, "Found audio clips before the metadata TSV file." + assert metadata_found, "Found audio clips before the metadata TSV file." + if not all_field_values: + break if path in all_field_values: field_values = all_field_values[path] @@ -741,4 +745,4 @@ def _generate_examples(self, files, filepath, path_to_clips): # set audio feature result["audio"] = {"path": path, "bytes": f.read()} - yield id_, result + yield path, result From 45ed8cdc42d9a2341afbe6bc74765e394601975a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 19 Nov 2021 09:56:30 +0000 Subject: [PATCH 41/42] update infos --- datasets/common_voice/dataset_infos.json | 2 +- datasets/librispeech_asr/dataset_infos.json | 2 +- datasets/openslr/dataset_infos.json | 2 +- datasets/vivos/dataset_infos.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datasets/common_voice/dataset_infos.json b/datasets/common_voice/dataset_infos.json index f9266ece149..1e79e044ac3 100644 --- a/datasets/common_voice/dataset_infos.json +++ b/datasets/common_voice/dataset_infos.json @@ -1 +1 @@ -{"ab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10802, "num_examples": 22, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4442, "num_examples": 9, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 376182, "num_examples": 752, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 3906, "num_examples": 8, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ab.tar.gz": {"num_bytes": 41038412, "checksum": "801de9c63f740c4d2c821709586921bed216c736e593051306579cf478a54388"}}, "download_size": 41038412, "post_processing_size": null, "dataset_size": 395332, "size_in_bytes": 41433744}, "ar": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ar", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6330858, "num_examples": 14227, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 3306715, "num_examples": 7622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 3330810, "num_examples": 7517, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 7881421, "num_examples": 18283, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2822099, "num_examples": 6333, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ar.tar.gz": {"num_bytes": 1756264615, "checksum": "516b369da8a000c1b98d8f5ee3b90fa12bcc5d5438391fcf01f3d5e78ccdd6fa"}}, "download_size": 1756264615, "post_processing_size": null, "dataset_size": 23671903, "size_in_bytes": 1779936518}, "as": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "as", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 135331, "num_examples": 270, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 54717, "num_examples": 110, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 63580, "num_examples": 124, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 15547, "num_examples": 31, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/as.tar.gz": {"num_bytes": 22226465, "checksum": "d9afd6d28e9c837ff0943a94452fb12ce8a7885b38fdeb25fc2912bbe4977f40"}}, "download_size": 22226465, "post_processing_size": null, "dataset_size": 269175, "size_in_bytes": 22495640}, "br": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "br", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1114817, "num_examples": 2780, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 838823, "num_examples": 2087, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 807978, "num_examples": 1997, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 4446871, "num_examples": 10912, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 260104, "num_examples": 623, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/br.tar.gz": {"num_bytes": 465276982, "checksum": "d323d71337055b794c8fe3dcdf5a0dc03d6bf8f7c8c19f96369884410aef4606"}}, "download_size": 465276982, "post_processing_size": null, "dataset_size": 7468593, "size_in_bytes": 472745575}, "ca": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ca", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 128917601, "num_examples": 285584, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6886168, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6959066, "num_examples": 15724, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 28903919, "num_examples": 64446, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 8504933, "num_examples": 18846, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ca.tar.gz": {"num_bytes": 20743110341, "checksum": "a27bec66c151ddb21c1736781b3bca972047cc20c02488bad94d2311c40bc6da"}}, "download_size": 20743110341, "post_processing_size": null, "dataset_size": 180171687, "size_in_bytes": 20923282028}, "cnh": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cnh", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 330832, "num_examples": 807, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 307840, "num_examples": 752, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 310074, "num_examples": 756, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1208870, "num_examples": 2934, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 177752, "num_examples": 433, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cnh.tar.gz": {"num_bytes": 161331331, "checksum": "9c27ce17ea8db73e7a2c8715bdb3a45a40792d6d64238cfbb467a81c6b71d71f"}}, "download_size": 161331331, "post_processing_size": null, "dataset_size": 2335368, "size_in_bytes": 163666699}, "cs": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cs", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2459092, "num_examples": 5655, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1748420, "num_examples": 4144, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1756122, "num_examples": 4118, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3247839, "num_examples": 7475, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 292158, "num_examples": 685, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cs.tar.gz": {"num_bytes": 1271909933, "checksum": "68a1d6f27eb7161fdf28da889e7d37e8c86b7aff73b0b6df52edc8359e30ac56"}}, "download_size": 1271909933, "post_processing_size": null, "dataset_size": 9503631, "size_in_bytes": 1281413564}, "cv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 436012, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 365363, "num_examples": 788, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 388030, "num_examples": 818, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3263709, "num_examples": 6927, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 607952, "num_examples": 1282, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cv.tar.gz": {"num_bytes": 439329081, "checksum": "c3fb84c28a5718f01b91cf1026985b1dcd83bb312d32620f16b5ed4f12fb8c73"}}, "download_size": 439329081, "post_processing_size": null, "dataset_size": 5061066, "size_in_bytes": 444390147}, "cy": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cy", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3029147, "num_examples": 6839, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2060863, "num_examples": 4820, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2102719, "num_examples": 4776, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 7778447, "num_examples": 17919, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1569654, "num_examples": 3648, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cy.tar.gz": {"num_bytes": 3434474658, "checksum": "269da0cbbb2887d1903c0e17bbb71ea9bcd83506ba928fe75c660cb3e52f9a67"}}, "download_size": 3434474658, "post_processing_size": null, "dataset_size": 16540830, "size_in_bytes": 3451015488}, "de": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "de", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 111735161, "num_examples": 246525, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6785721, "num_examples": 15588, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6850065, "num_examples": 15588, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 4563457, "num_examples": 10095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 14542398, "num_examples": 32789, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/de.tar.gz": {"num_bytes": 23283812097, "checksum": "733e6e367da4b9588b4bb175ac45c6c0ec545e41df5494a7ee4a7e4ff3141ef7"}}, "download_size": 23283812097, "post_processing_size": null, "dataset_size": 144476802, "size_in_bytes": 23428288899}, "dv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "dv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1312675, "num_examples": 2680, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1075889, "num_examples": 2202, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1032265, "num_examples": 2077, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 421053, "num_examples": 840, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/dv.tar.gz": {"num_bytes": 540488041, "checksum": "b2c8617df5e7aebd74d88491913ecc6b94066198e875853b0b3847d13e70f419"}}, "download_size": 540488041, "post_processing_size": null, "dataset_size": 3841882, "size_in_bytes": 544329923}, "el": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "el", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1043636, "num_examples": 2316, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 677742, "num_examples": 1522, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 631379, "num_examples": 1401, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2539987, "num_examples": 5659, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 83583, "num_examples": 185, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/el.tar.gz": {"num_bytes": 381570611, "checksum": "86c67e7bda7658a7087b5a1997d140d57957a05bb413a188610db61807c53ee4"}}, "download_size": 381570611, "post_processing_size": null, "dataset_size": 4976327, "size_in_bytes": 386546938}, "en": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "en", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 250691604, "num_examples": 564337, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6850452, "num_examples": 16164, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6976081, "num_examples": 16164, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 72156747, "num_examples": 169895, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 82557632, "num_examples": 189562, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en.tar.gz": {"num_bytes": 60613063630, "checksum": "0f8fdfc4fe715738be94ee49c4fb63d5f1608d2e6a43a2bed80f6cb871171c36"}}, "download_size": 60613063630, "post_processing_size": null, "dataset_size": 419232516, "size_in_bytes": 61032296146}, "eo": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "eo", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8663844, "num_examples": 19587, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 3843190, "num_examples": 8969, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 3879354, "num_examples": 8987, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1296351, "num_examples": 2946, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2081223, "num_examples": 4736, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eo.tar.gz": {"num_bytes": 2883560869, "checksum": "c19900010aee0f9eb39416406598509b1cdba136a16318e746b1a64f97d7809c"}}, "download_size": 2883560869, "post_processing_size": null, "dataset_size": 19763962, "size_in_bytes": 2903324831}, "es": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "es", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 72689623, "num_examples": 161813, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6544041, "num_examples": 15089, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6567785, "num_examples": 15089, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 62421588, "num_examples": 144791, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 17672664, "num_examples": 40640, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/es.tar.gz": {"num_bytes": 16188844718, "checksum": "276ca393783cd8b208d56b5032b87c13a40fcadde5b3925596e67c15578d0235"}}, "download_size": 16188844718, "post_processing_size": null, "dataset_size": 165895701, "size_in_bytes": 16354740419}, "et": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "et", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1426348, "num_examples": 2966, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1173073, "num_examples": 2509, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1212463, "num_examples": 2507, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 266991, "num_examples": 569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1766673, "num_examples": 3557, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/et.tar.gz": {"num_bytes": 767174465, "checksum": "50a861393e4e7013ab71f1b63bca8c42c26dca1519c15a3b9cdb3cb5b6c561a2"}}, "download_size": 767174465, "post_processing_size": null, "dataset_size": 5845548, "size_in_bytes": 773020013}, "eu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "eu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3389176, "num_examples": 7505, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2247330, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2281644, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 10454269, "num_examples": 23570, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2389658, "num_examples": 5387, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eu.tar.gz": {"num_bytes": 3664586106, "checksum": "55b6eaf7ca7c120faa0b60d71c87189b610412334e6b710fe12c2a79489ab06f"}}, "download_size": 3664586106, "post_processing_size": null, "dataset_size": 20762077, "size_in_bytes": 3685348183}, "fa": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fa", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3246710, "num_examples": 7593, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2271812, "num_examples": 5213, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2263134, "num_examples": 5213, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9773876, "num_examples": 22510, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5329900, "num_examples": 11698, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz": {"num_bytes": 8884585819, "checksum": "5454efe3b2f6d06d51e7177469b7bef9a962adbf7611e3cd21771451112abe6d"}}, "download_size": 8884585819, "post_processing_size": null, "dataset_size": 22885432, "size_in_bytes": 8907471251}, "fi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 199505, "num_examples": 460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 183540, "num_examples": 428, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 179607, "num_examples": 415, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 64358, "num_examples": 149, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 25781, "num_examples": 59, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fi.tar.gz": {"num_bytes": 49882909, "checksum": "eb26d0904beef5ec08cf53267be7e78b8ba5056fd162057d5b085a7cba51f035"}}, "download_size": 49882909, "post_processing_size": null, "dataset_size": 652791, "size_in_bytes": 50535700}, "fr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 133605567, "num_examples": 298982, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6854610, "num_examples": 15763, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6868568, "num_examples": 15763, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1435580, "num_examples": 3222, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 17776024, "num_examples": 40351, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fr.tar.gz": {"num_bytes": 19130141984, "checksum": "719ef964b55d830a095a602aff311db39b77239e9d600b6af646ec2ed57e5e45"}}, "download_size": 19130141984, "post_processing_size": null, "dataset_size": 166540349, "size_in_bytes": 19296682333}, "fy-NL": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fy-NL", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1695909, "num_examples": 3927, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1311327, "num_examples": 3020, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1215844, "num_examples": 2790, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9389087, "num_examples": 21569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 451010, "num_examples": 1031, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fy-NL.tar.gz": {"num_bytes": 1237743070, "checksum": "ddee4fc3ce52df2379fa4069090d8f5c853155dc0462eb645f6111e2da627297"}}, "download_size": 1237743070, "post_processing_size": null, "dataset_size": 14063177, "size_in_bytes": 1251806247}, "ga-IE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ga-IE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 236396, "num_examples": 541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 215599, "num_examples": 506, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 212002, "num_examples": 497, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 917017, "num_examples": 2130, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 176661, "num_examples": 409, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ga-IE.tar.gz": {"num_bytes": 156553447, "checksum": "27223fc99af6a45f81190ecb90034806991ff3b9e3aa38a7e97caaabbb0a4ddc"}}, "download_size": 156553447, "post_processing_size": null, "dataset_size": 1757675, "size_in_bytes": 158311122}, "hi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 73903, "num_examples": 157, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 58773, "num_examples": 127, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 64002, "num_examples": 135, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 67240, "num_examples": 139, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 29139, "num_examples": 60, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hi.tar.gz": {"num_bytes": 21424045, "checksum": "5492393b04dd1307a52d93525a7db08fc392c8ba0df553668945152e434f58c9"}}, "download_size": 21424045, "post_processing_size": null, "dataset_size": 293057, "size_in_bytes": 21717102}, "hsb": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hsb", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 367798, "num_examples": 808, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 173155, "num_examples": 387, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 77478, "num_examples": 172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 28207, "num_examples": 62, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 103211, "num_examples": 227, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hsb.tar.gz": {"num_bytes": 79362060, "checksum": "3dd3d79aaa078ad7955552ebc596e0a8894ffd7a4a88a51b2c8ee80c0e088152"}}, "download_size": 79362060, "post_processing_size": null, "dataset_size": 749849, "size_in_bytes": 80111909}, "hu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1428176, "num_examples": 3348, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 699721, "num_examples": 1649, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 612969, "num_examples": 1434, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 127337, "num_examples": 295, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 72559, "num_examples": 169, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hu.tar.gz": {"num_bytes": 242758708, "checksum": "61f933155cba6c54c0b76d0ddd2caebd62d69228b7c935382112abe172660953"}}, "download_size": 242758708, "post_processing_size": null, "dataset_size": 2940762, "size_in_bytes": 245699470}, "ia": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ia", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1446791, "num_examples": 3477, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 372192, "num_examples": 899, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 664744, "num_examples": 1601, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 452330, "num_examples": 1095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 79695, "num_examples": 192, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ia.tar.gz": {"num_bytes": 226499645, "checksum": "47a137a805ea8ce01f2cf9277739919a824a9fd13468345dfbd84eddb52c02f1"}}, "download_size": 226499645, "post_processing_size": null, "dataset_size": 3015752, "size_in_bytes": 229515397}, "id": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "id", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 889083, "num_examples": 2130, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 766675, "num_examples": 1844, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 766720, "num_examples": 1835, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2831110, "num_examples": 6782, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 196795, "num_examples": 470, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/id.tar.gz": {"num_bytes": 475918233, "checksum": "71177fa9d2fac29f48db5feabc294f1d6bbcaa0c326b0d1099be66c0b804b245"}}, "download_size": 475918233, "post_processing_size": null, "dataset_size": 5450383, "size_in_bytes": 481368616}, "it": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "it", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 25748596, "num_examples": 58015, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 5629778, "num_examples": 12928, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5651445, "num_examples": 12928, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 6438506, "num_examples": 14549, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5425867, "num_examples": 12189, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/it.tar.gz": {"num_bytes": 5585781573, "checksum": "3a75b1631958af1487ee49b13cd27efc951183737ed515832cf714ed20c97808"}}, "download_size": 5585781573, "post_processing_size": null, "dataset_size": 48894192, "size_in_bytes": 5634675765}, "ja": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ja", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 317820, "num_examples": 722, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 278459, "num_examples": 632, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 255038, "num_examples": 586, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 389563, "num_examples": 885, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 222566, "num_examples": 504, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ja.tar.gz": {"num_bytes": 152879796, "checksum": "3614cd0d0abac80794351c78183967c83179fab390d7e19cad97758eb85ae558"}}, "download_size": 152879796, "post_processing_size": null, "dataset_size": 1463446, "size_in_bytes": 154343242}, "ka": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ka", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 581587, "num_examples": 1058, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 358380, "num_examples": 656, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 294673, "num_examples": 527, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 24443, "num_examples": 44, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 78770, "num_examples": 139, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ka.tar.gz": {"num_bytes": 104280554, "checksum": "7677df9d650234306a11bf8518be5807e72e7d5fc440d391304d1b99dd5517f5"}}, "download_size": 104280554, "post_processing_size": null, "dataset_size": 1337853, "size_in_bytes": 105618407}, "kab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "kab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 49343008, "num_examples": 120530, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 5936276, "num_examples": 14622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5928674, "num_examples": 14622, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 36104123, "num_examples": 88021, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 7518840, "num_examples": 18134, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/kab.tar.gz": {"num_bytes": 17171606918, "checksum": "d2089107d4f3a84856c457a436a47a883b872022f2085cfad0501469be91fd95"}}, "download_size": 17171606918, "post_processing_size": null, "dataset_size": 104830921, "size_in_bytes": 17276437839}, "ky": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ky", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 927074, "num_examples": 1955, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 700081, "num_examples": 1503, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 711620, "num_examples": 1511, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3410831, "num_examples": 7223, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 437848, "num_examples": 926, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ky.tar.gz": {"num_bytes": 579440853, "checksum": "6efe0ca5384d0419fcf5fda0e0229a1b5eb80d8eeba2d7528a4c3c9f2593206f"}}, "download_size": 579440853, "post_processing_size": null, "dataset_size": 6187454, "size_in_bytes": 585628307}, "lg": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lg", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 549563, "num_examples": 1250, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 253625, "num_examples": 584, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 168943, "num_examples": 384, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1365647, "num_examples": 3110, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 127043, "num_examples": 290, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lg.tar.gz": {"num_bytes": 208197149, "checksum": "71243c65f638cd7f392fabe22e37cbafbdca4eb5a199210000ae957a88768040"}}, "download_size": 208197149, "post_processing_size": null, "dataset_size": 2464821, "size_in_bytes": 210661970}, "lt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 402862, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 203781, "num_examples": 466, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 106451, "num_examples": 244, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 710428, "num_examples": 1629, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 44360, "num_examples": 102, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lt.tar.gz": {"num_bytes": 135299706, "checksum": "5ad3d93bc308f58a70e6685f71ae035237ef9caa0922232ac76846f7587bb8aa"}}, "download_size": 135299706, "post_processing_size": null, "dataset_size": 1467882, "size_in_bytes": 136767588}, "lv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1051326, "num_examples": 2552, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 767926, "num_examples": 1882, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 819846, "num_examples": 2002, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 641669, "num_examples": 1560, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 58933, "num_examples": 143, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lv.tar.gz": {"num_bytes": 208307691, "checksum": "8a4350ccf24884ee1012032bfd5a87e0de50d780b1f8450d1cb52afe3f69c671"}}, "download_size": 208307691, "post_processing_size": null, "dataset_size": 3339700, "size_in_bytes": 211647391}, "mn": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "mn", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1088733, "num_examples": 2183, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 912144, "num_examples": 1862, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 912414, "num_examples": 1837, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1628610, "num_examples": 3272, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 332643, "num_examples": 667, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mn.tar.gz": {"num_bytes": 486369317, "checksum": "3aebc40d40eb19263576664a981f4bb8b221abeab78c8154adc3d16875c75ec7"}}, "download_size": 486369317, "post_processing_size": null, "dataset_size": 4874544, "size_in_bytes": 491243861}, "mt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "mt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 884543, "num_examples": 2036, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 690486, "num_examples": 1617, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 651610, "num_examples": 1516, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2464327, "num_examples": 5714, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 136773, "num_examples": 314, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mt.tar.gz": {"num_bytes": 425114242, "checksum": "9d53000d7832d130c4d35fb412bfc092ab8de8e763a5d2a528aebf37f052af03"}}, "download_size": 425114242, "post_processing_size": null, "dataset_size": 4827739, "size_in_bytes": 429941981}, "nl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "nl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4219972, "num_examples": 9460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2457725, "num_examples": 5708, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2200827, "num_examples": 4938, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 11420, "num_examples": 27, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1442237, "num_examples": 3308, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/nl.tar.gz": {"num_bytes": 1741827548, "checksum": "048f823408e3bbd16e63111d1b4caecb0102606c440bbdf3e5b6a6bae1e1e3f1"}}, "download_size": 1741827548, "post_processing_size": null, "dataset_size": 10332181, "size_in_bytes": 1752159729}, "or": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "or", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 196790, "num_examples": 388, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 49231, "num_examples": 98, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 65559, "num_examples": 129, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2191159, "num_examples": 4302, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 30974, "num_examples": 62, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/or.tar.gz": {"num_bytes": 199077358, "checksum": "f3edad30166fe454f4d2b14adeece1434dc4b8eb7b0ece37aac8389b7122218a"}}, "download_size": 199077358, "post_processing_size": null, "dataset_size": 2533713, "size_in_bytes": 201611071}, "pa-IN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pa-IN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 100668, "num_examples": 211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 54307, "num_examples": 116, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 20728, "num_examples": 44, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 670272, "num_examples": 1411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 20354, "num_examples": 43, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pa-IN.tar.gz": {"num_bytes": 69748265, "checksum": "d2e30f28a227ecb8209340c4133edf6489f35f8e3d1eb55ff22b96b12f36952c"}}, "download_size": 69748265, "post_processing_size": null, "dataset_size": 866329, "size_in_bytes": 70614594}, "pl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3259050, "num_examples": 7468, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2156262, "num_examples": 5153, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2203857, "num_examples": 5153, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5566818, "num_examples": 12848, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1983448, "num_examples": 4601, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pl.tar.gz": {"num_bytes": 3537012341, "checksum": "acbf77d36e083e2bcb7152ffb52ab7d1e3e64d33a3f51f106cdff7feff6279aa"}}, "download_size": 3537012341, "post_processing_size": null, "dataset_size": 15169435, "size_in_bytes": 3552181776}, "pt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2763497, "num_examples": 6514, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1948500, "num_examples": 4641, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1936082, "num_examples": 4592, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3492648, "num_examples": 8390, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 738577, "num_examples": 1740, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pt.tar.gz": {"num_bytes": 1704252567, "checksum": "6700de499f728e0e3f3ed4d7005e5b7db27ba2ddc872b21b0b404c3b4859d84b"}}, "download_size": 1704252567, "post_processing_size": null, "dataset_size": 10879304, "size_in_bytes": 1715131871}, "rm-sursilv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rm-sursilv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 627518, "num_examples": 1384, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 535630, "num_examples": 1194, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 539772, "num_examples": 1205, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 946574, "num_examples": 2102, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 290484, "num_examples": 639, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-sursilv.tar.gz": {"num_bytes": 275950479, "checksum": "3cfc4971b6ab8958d7c3d784977690fcc04ebd7570ecf788d5948df84a5481a1"}}, "download_size": 275950479, "post_processing_size": null, "dataset_size": 2939978, "size_in_bytes": 278890457}, "rm-vallader": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rm-vallader", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 267837, "num_examples": 574, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 173761, "num_examples": 378, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 163725, "num_examples": 357, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 339277, "num_examples": 727, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 175312, "num_examples": 374, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-vallader.tar.gz": {"num_bytes": 108113989, "checksum": "4fdb7dc5e20862a636ee7975831b39db29012d615f9139edf2d266b878ce43ae"}}, "download_size": 108113989, "post_processing_size": null, "dataset_size": 1119912, "size_in_bytes": 109233901}, "ro": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ro", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1457000, "num_examples": 3399, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 756861, "num_examples": 1778, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 368157, "num_examples": 858, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 827971, "num_examples": 1945, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 207526, "num_examples": 485, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ro.tar.gz": {"num_bytes": 261978702, "checksum": "450b159e936ef6ff136fcdfad193675caec5b2230d1b6ca24c5cde491ff002cd"}}, "download_size": 261978702, "post_processing_size": null, "dataset_size": 3617515, "size_in_bytes": 265596217}, "ru": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ru", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7918252, "num_examples": 15481, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4035778, "num_examples": 8007, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 4017986, "num_examples": 7963, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5123246, "num_examples": 10247, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1567391, "num_examples": 3056, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ru.tar.gz": {"num_bytes": 3655676916, "checksum": "dcbb460e58d4afc78047c3801c9eb56d940b388eb350ee3da3de5bfe5a74a025"}}, "download_size": 3655676916, "post_processing_size": null, "dataset_size": 22662653, "size_in_bytes": 3678339569}, "rw": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rw", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 222435182, "num_examples": 515197, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6836125, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6685632, "num_examples": 15032, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9774022, "num_examples": 22923, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 93086051, "num_examples": 206790, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rw.tar.gz": {"num_bytes": 42545189583, "checksum": "cf8a07059b3713022d487f9a6b8f465271f3457c525a8b350f829f87b0132b41"}}, "download_size": 42545189583, "post_processing_size": null, "dataset_size": 338817012, "size_in_bytes": 42884006595}, "sah": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sah", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 733267, "num_examples": 1442, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 379003, "num_examples": 757, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 204118, "num_examples": 405, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 636097, "num_examples": 1275, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 33499, "num_examples": 66, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sah.tar.gz": {"num_bytes": 181245626, "checksum": "dea1a454813c8f90abcbdf427fa922e1b7a116753deeb410af096ce5f0ae2405"}}, "download_size": 181245626, "post_processing_size": null, "dataset_size": 1985984, "size_in_bytes": 183231610}, "sl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 845619, "num_examples": 2038, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 363066, "num_examples": 881, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 231081, "num_examples": 556, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1033232, "num_examples": 2502, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 37929, "num_examples": 92, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sl.tar.gz": {"num_bytes": 222751292, "checksum": "184cfbfe876a1f1c6317e4e34680c82a940db833afca78203c2929db1768a353"}}, "download_size": 222751292, "post_processing_size": null, "dataset_size": 2510927, "size_in_bytes": 225262219}, "sv-SE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sv-SE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 983262, "num_examples": 2331, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 840358, "num_examples": 2027, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 844026, "num_examples": 2019, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1329608, "num_examples": 3043, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 193364, "num_examples": 462, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sv-SE.tar.gz": {"num_bytes": 421434184, "checksum": "dc8634dafacb33be00f06e376f6c479d53f84f4834952593c8903f1080535213"}}, "download_size": 421434184, "post_processing_size": null, "dataset_size": 4190618, "size_in_bytes": 425624802}, "ta": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ta", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 957720, "num_examples": 2009, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 846103, "num_examples": 1781, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 858400, "num_examples": 1779, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3584809, "num_examples": 7428, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 284039, "num_examples": 594, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ta.tar.gz": {"num_bytes": 679766097, "checksum": "78560d9d608a63ee75c3fdeb7f96f33cf0d85855ba6294b13e945de066eb46d8"}}, "download_size": 679766097, "post_processing_size": null, "dataset_size": 6531071, "size_in_bytes": 686297168}, "th": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "th", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1389723, "num_examples": 2917, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1029454, "num_examples": 2188, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 909292, "num_examples": 1922, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1268833, "num_examples": 2671, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 222666, "num_examples": 467, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/th.tar.gz": {"num_bytes": 341305736, "checksum": "a3d11043c49d3ea8ffb58dfab117cd831dd62a641e0a26ac60eb43e483534f7a"}}, "download_size": 341305736, "post_processing_size": null, "dataset_size": 4819968, "size_in_bytes": 346125704}, "tr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "tr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 778858, "num_examples": 1831, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 689987, "num_examples": 1647, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 694938, "num_examples": 1647, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 137465, "num_examples": 325, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 730583, "num_examples": 1726, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tr.tar.gz": {"num_bytes": 620848700, "checksum": "b3f266c868b1fe9f76270ba76226b1cdc17f33b3e387e6b44a64d5419f8b9768"}}, "download_size": 620848700, "post_processing_size": null, "dataset_size": 3031831, "size_in_bytes": 623880531}, "tt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "tt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 5048627, "num_examples": 11211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1974398, "num_examples": 4485, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 939118, "num_examples": 2127, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 793843, "num_examples": 1798, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 129728, "num_examples": 287, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tt.tar.gz": {"num_bytes": 777153207, "checksum": "89c8d7a49584de720f1790df39e6f07996e2eecb07f6273f4ba2668e9fe4ad46"}}, "download_size": 777153207, "post_processing_size": null, "dataset_size": 8885714, "size_in_bytes": 786038921}, "uk": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "uk", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1888179, "num_examples": 4035, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1511544, "num_examples": 3235, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1521216, "num_examples": 3236, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3830066, "num_examples": 8161, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 598922, "num_examples": 1255, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/uk.tar.gz": {"num_bytes": 1218559031, "checksum": "f3ca0143cd84f5eacb583187052e69efec21c571a426efee91a765a2284519c2"}}, "download_size": 1218559031, "post_processing_size": null, "dataset_size": 9349927, "size_in_bytes": 1227908958}, "vi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "vi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 92564, "num_examples": 221, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 82035, "num_examples": 198, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 84472, "num_examples": 200, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 366671, "num_examples": 870, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 32664, "num_examples": 78, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vi.tar.gz": {"num_bytes": 51929480, "checksum": "704bce8031932377cc21c017923ff1e96ebd2be9bd520adcf839f7a0f5f03b6e"}}, "download_size": 51929480, "post_processing_size": null, "dataset_size": 658406, "size_in_bytes": 52587886}, "vot": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "vot", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1250, "num_examples": 3, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 163377, "num_examples": 411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2419, "num_examples": 6, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vot.tar.gz": {"num_bytes": 7792602, "checksum": "7fb07dd25b0575e8cd811bb8d1e5aebd17fdbca079a4ee50d81e0aaaff50f8b0"}}, "download_size": 7792602, "post_processing_size": null, "dataset_size": 167046, "size_in_bytes": 7959648}, "zh-CN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-CN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8279157, "num_examples": 18541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 3757047, "num_examples": 8760, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 3823707, "num_examples": 8743, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3908115, "num_examples": 8948, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2328784, "num_examples": 5305, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-CN.tar.gz": {"num_bytes": 2184602350, "checksum": "cd8589cac28541f9f996d1954f14c307954f1146ac44a8eadad8e31ebaf1f15e"}}, "download_size": 2184602350, "post_processing_size": null, "dataset_size": 22096810, "size_in_bytes": 2206699160}, "zh-HK": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-HK", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3142432, "num_examples": 7506, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2144145, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2163111, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 16142369, "num_examples": 38830, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1272392, "num_examples": 2999, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-HK.tar.gz": {"num_bytes": 2774145806, "checksum": "8a525ce4664d6647701449d5e72f7d8658cc3a5fabc72e05c6883994fd3c0134"}}, "download_size": 2774145806, "post_processing_size": null, "dataset_size": 24864449, "size_in_bytes": 2799010255}, "zh-TW": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-TW", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1478055, "num_examples": 3507, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1184204, "num_examples": 2895, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1204526, "num_examples": 2895, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9437896, "num_examples": 22477, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1493820, "num_examples": 3584, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-TW.tar.gz": {"num_bytes": 2182836295, "checksum": "67fadf561f8237690d4a4a1d63a9b3ac271b5d05438dc745b7e04282d909460f"}}, "download_size": 2182836295, "post_processing_size": null, "dataset_size": 14798501, "size_in_bytes": 2197634796}} \ No newline at end of file +{"ab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1295622, "num_examples": 22, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 411844, "num_examples": 9, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 40023390, "num_examples": 752, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 361626, "num_examples": 8, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ab.tar.gz": {"num_bytes": 41038412, "checksum": "801de9c63f740c4d2c821709586921bed216c736e593051306579cf478a54388"}}, "download_size": 41038412, "post_processing_size": null, "dataset_size": 42092482, "size_in_bytes": 83130894}, "ar": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ar", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 359335168, "num_examples": 14227, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 237546641, "num_examples": 7622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 209606861, "num_examples": 7517, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 515822404, "num_examples": 18283, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 194805036, "num_examples": 6333, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ar.tar.gz": {"num_bytes": 1756264615, "checksum": "516b369da8a000c1b98d8f5ee3b90fa12bcc5d5438391fcf01f3d5e78ccdd6fa"}}, "download_size": 1756264615, "post_processing_size": null, "dataset_size": 1517116110, "size_in_bytes": 3273380725}, "as": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "as", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11442279, "num_examples": 270, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 5071343, "num_examples": 110, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5480156, "num_examples": 124, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 886145, "num_examples": 31, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/as.tar.gz": {"num_bytes": 22226465, "checksum": "d9afd6d28e9c837ff0943a94452fb12ce8a7885b38fdeb25fc2912bbe4977f40"}}, "download_size": 22226465, "post_processing_size": null, "dataset_size": 22879923, "size_in_bytes": 45106388}, "br": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "br", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62238289, "num_examples": 2780, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 54461339, "num_examples": 2087, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 46995570, "num_examples": 1997, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 269858143, "num_examples": 10912, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 20861017, "num_examples": 623, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/br.tar.gz": {"num_bytes": 465276982, "checksum": "d323d71337055b794c8fe3dcdf5a0dc03d6bf8f7c8c19f96369884410aef4606"}}, "download_size": 465276982, "post_processing_size": null, "dataset_size": 454414358, "size_in_bytes": 919691340}, "ca": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ca", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12966939466, "num_examples": 285584, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 745761890, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 716442038, "num_examples": 15724, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2693542910, "num_examples": 64446, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 850402888, "num_examples": 18846, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ca.tar.gz": {"num_bytes": 20743110341, "checksum": "a27bec66c151ddb21c1736781b3bca972047cc20c02488bad94d2311c40bc6da"}}, "download_size": 20743110341, "post_processing_size": null, "dataset_size": 17973089192, "size_in_bytes": 38716199533}, "cnh": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cnh", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 18866674, "num_examples": 807, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 24675321, "num_examples": 752, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 22162315, "num_examples": 756, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 84878963, "num_examples": 2934, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 13642724, "num_examples": 433, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cnh.tar.gz": {"num_bytes": 161331331, "checksum": "9c27ce17ea8db73e7a2c8715bdb3a45a40792d6d64238cfbb467a81c6b71d71f"}}, "download_size": 161331331, "post_processing_size": null, "dataset_size": 164225997, "size_in_bytes": 325557328}, "cs": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cs", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 215205282, "num_examples": 5655, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 148499476, "num_examples": 4144, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 148312130, "num_examples": 4118, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 282225475, "num_examples": 7475, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 24717823, "num_examples": 685, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cs.tar.gz": {"num_bytes": 1271909933, "checksum": "68a1d6f27eb7161fdf28da889e7d37e8c86b7aff73b0b6df52edc8359e30ac56"}}, "download_size": 1271909933, "post_processing_size": null, "dataset_size": 818960186, "size_in_bytes": 2090870119}, "cv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 31649510, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 32513061, "num_examples": 788, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 28429779, "num_examples": 818, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 288294623, "num_examples": 6927, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 57923138, "num_examples": 1282, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cv.tar.gz": {"num_bytes": 439329081, "checksum": "c3fb84c28a5718f01b91cf1026985b1dcd83bb312d32620f16b5ed4f12fb8c73"}}, "download_size": 439329081, "post_processing_size": null, "dataset_size": 438810111, "size_in_bytes": 878139192}, "cy": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cy", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 271642649, "num_examples": 6839, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 206865596, "num_examples": 4820, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 201813388, "num_examples": 4776, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 688469886, "num_examples": 17919, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 146874576, "num_examples": 3648, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cy.tar.gz": {"num_bytes": 3434474658, "checksum": "269da0cbbb2887d1903c0e17bbb71ea9bcd83506ba928fe75c660cb3e52f9a67"}}, "download_size": 3434474658, "post_processing_size": null, "dataset_size": 1515666095, "size_in_bytes": 4950140753}, "de": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "de", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11463160619, "num_examples": 246525, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 744617681, "num_examples": 15588, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 729559862, "num_examples": 15588, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 464513461, "num_examples": 10095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1440604803, "num_examples": 32789, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/de.tar.gz": {"num_bytes": 23283812097, "checksum": "733e6e367da4b9588b4bb175ac45c6c0ec545e41df5494a7ee4a7e4ff3141ef7"}}, "download_size": 23283812097, "post_processing_size": null, "dataset_size": 14842456426, "size_in_bytes": 38126268523}, "dv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "dv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 118576140, "num_examples": 2680, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 94281409, "num_examples": 2202, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 94117088, "num_examples": 2077, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 37694847, "num_examples": 840, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/dv.tar.gz": {"num_bytes": 540488041, "checksum": "b2c8617df5e7aebd74d88491913ecc6b94066198e875853b0b3847d13e70f419"}}, "download_size": 540488041, "post_processing_size": null, "dataset_size": 344669484, "size_in_bytes": 885157525}, "el": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "el", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 80759076, "num_examples": 2316, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 53820491, "num_examples": 1522, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 44818565, "num_examples": 1401, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 186861175, "num_examples": 5659, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 6023769, "num_examples": 185, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/el.tar.gz": {"num_bytes": 381570611, "checksum": "86c67e7bda7658a7087b5a1997d140d57957a05bb413a188610db61807c53ee4"}}, "download_size": 381570611, "post_processing_size": null, "dataset_size": 372283076, "size_in_bytes": 753853687}, "en": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "en", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 26088826658, "num_examples": 564337, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 758718688, "num_examples": 16164, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 795638801, "num_examples": 16164, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5796244022, "num_examples": 169895, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 9122973965, "num_examples": 189562, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en.tar.gz": {"num_bytes": 60613063630, "checksum": "0f8fdfc4fe715738be94ee49c4fb63d5f1608d2e6a43a2bed80f6cb871171c36"}}, "download_size": 60613063630, "post_processing_size": null, "dataset_size": 42562402134, "size_in_bytes": 103175465764}, "eo": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "eo", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 993655930, "num_examples": 19587, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 420153812, "num_examples": 8969, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 391427586, "num_examples": 8987, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 142476819, "num_examples": 2946, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 238105462, "num_examples": 4736, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eo.tar.gz": {"num_bytes": 2883560869, "checksum": "c19900010aee0f9eb39416406598509b1cdba136a16318e746b1a64f97d7809c"}}, "download_size": 2883560869, "post_processing_size": null, "dataset_size": 2185819609, "size_in_bytes": 5069380478}, "es": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "es", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6918333205, "num_examples": 161813, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 754049291, "num_examples": 15089, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 735558084, "num_examples": 15089, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5528972205, "num_examples": 144791, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1664876264, "num_examples": 40640, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/es.tar.gz": {"num_bytes": 16188844718, "checksum": "276ca393783cd8b208d56b5032b87c13a40fcadde5b3925596e67c15578d0235"}}, "download_size": 16188844718, "post_processing_size": null, "dataset_size": 15601789049, "size_in_bytes": 31790633767}, "et": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "et", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 161124199, "num_examples": 2966, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 133183135, "num_examples": 2509, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 137604813, "num_examples": 2507, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 30339130, "num_examples": 569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 193019544, "num_examples": 3557, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/et.tar.gz": {"num_bytes": 767174465, "checksum": "50a861393e4e7013ab71f1b63bca8c42c26dca1519c15a3b9cdb3cb5b6c561a2"}}, "download_size": 767174465, "post_processing_size": null, "dataset_size": 655270821, "size_in_bytes": 1422445286}, "eu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "eu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 317322801, "num_examples": 7505, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 238866501, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 228150083, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 988079897, "num_examples": 23570, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 208553909, "num_examples": 5387, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eu.tar.gz": {"num_bytes": 3664586106, "checksum": "55b6eaf7ca7c120faa0b60d71c87189b610412334e6b710fe12c2a79489ab06f"}}, "download_size": 3664586106, "post_processing_size": null, "dataset_size": 1980973191, "size_in_bytes": 5645559297}, "fa": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fa", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 239255087, "num_examples": 7593, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 217939210, "num_examples": 5213, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 196558067, "num_examples": 5213, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 737017546, "num_examples": 22510, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 499570226, "num_examples": 11698, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz": {"num_bytes": 8884585819, "checksum": "5454efe3b2f6d06d51e7177469b7bef9a962adbf7611e3cd21771451112abe6d"}}, "download_size": 8884585819, "post_processing_size": null, "dataset_size": 1890340136, "size_in_bytes": 10774925955}, "fi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 16017393, "num_examples": 460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 16117529, "num_examples": 428, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 15471757, "num_examples": 415, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5836400, "num_examples": 149, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2228215, "num_examples": 59, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fi.tar.gz": {"num_bytes": 49882909, "checksum": "eb26d0904beef5ec08cf53267be7e78b8ba5056fd162057d5b085a7cba51f035"}}, "download_size": 49882909, "post_processing_size": null, "dataset_size": 55671294, "size_in_bytes": 105554203}, "fr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12439892070, "num_examples": 298982, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 733943163, "num_examples": 15763, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 703801114, "num_examples": 15763, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 117998889, "num_examples": 3222, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1794149368, "num_examples": 40351, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fr.tar.gz": {"num_bytes": 19130141984, "checksum": "719ef964b55d830a095a602aff311db39b77239e9d600b6af646ec2ed57e5e45"}}, "download_size": 19130141984, "post_processing_size": null, "dataset_size": 15789784604, "size_in_bytes": 34919926588}, "fy-NL": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fy-NL", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 159116360, "num_examples": 3927, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 126913262, "num_examples": 3020, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 112288554, "num_examples": 2790, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 893887467, "num_examples": 21569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 38985422, "num_examples": 1031, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fy-NL.tar.gz": {"num_bytes": 1237743070, "checksum": "ddee4fc3ce52df2379fa4069090d8f5c853155dc0462eb645f6111e2da627297"}}, "download_size": 1237743070, "post_processing_size": null, "dataset_size": 1331191065, "size_in_bytes": 2568934135}, "ga-IE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ga-IE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 15396820, "num_examples": 541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 16611739, "num_examples": 506, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 14897739, "num_examples": 497, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 61948768, "num_examples": 2130, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 10993268, "num_examples": 409, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ga-IE.tar.gz": {"num_bytes": 156553447, "checksum": "27223fc99af6a45f81190ecb90034806991ff3b9e3aa38a7e97caaabbb0a4ddc"}}, "download_size": 156553447, "post_processing_size": null, "dataset_size": 119848334, "size_in_bytes": 276401781}, "hi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4860737, "num_examples": 157, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4728043, "num_examples": 127, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5569352, "num_examples": 135, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 4176110, "num_examples": 139, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2801051, "num_examples": 60, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hi.tar.gz": {"num_bytes": 21424045, "checksum": "5492393b04dd1307a52d93525a7db08fc392c8ba0df553668945152e434f58c9"}}, "download_size": 21424045, "post_processing_size": null, "dataset_size": 22135293, "size_in_bytes": 43559338}, "hsb": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hsb", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43049910, "num_examples": 808, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 20929094, "num_examples": 387, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 8769458, "num_examples": 172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3173841, "num_examples": 62, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5589972, "num_examples": 227, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hsb.tar.gz": {"num_bytes": 79362060, "checksum": "3dd3d79aaa078ad7955552ebc596e0a8894ffd7a4a88a51b2c8ee80c0e088152"}}, "download_size": 79362060, "post_processing_size": null, "dataset_size": 81512275, "size_in_bytes": 160874335}, "hu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 126163153, "num_examples": 3348, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 57056435, "num_examples": 1649, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 50306925, "num_examples": 1434, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 12051094, "num_examples": 295, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5881521, "num_examples": 169, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hu.tar.gz": {"num_bytes": 242758708, "checksum": "61f933155cba6c54c0b76d0ddd2caebd62d69228b7c935382112abe172660953"}}, "download_size": 242758708, "post_processing_size": null, "dataset_size": 251459128, "size_in_bytes": 494217836}, "ia": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ia", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 96577153, "num_examples": 3477, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 33204678, "num_examples": 899, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 67436779, "num_examples": 1601, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 30937041, "num_examples": 1095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 6769573, "num_examples": 192, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ia.tar.gz": {"num_bytes": 226499645, "checksum": "47a137a805ea8ce01f2cf9277739919a824a9fd13468345dfbd84eddb52c02f1"}}, "download_size": 226499645, "post_processing_size": null, "dataset_size": 234925224, "size_in_bytes": 461424869}, "id": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "id", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 63515863, "num_examples": 2130, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 60711104, "num_examples": 1844, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 56963520, "num_examples": 1835, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 206578628, "num_examples": 6782, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 16566129, "num_examples": 470, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/id.tar.gz": {"num_bytes": 475918233, "checksum": "71177fa9d2fac29f48db5feabc294f1d6bbcaa0c326b0d1099be66c0b804b245"}}, "download_size": 475918233, "post_processing_size": null, "dataset_size": 404335244, "size_in_bytes": 880253477}, "it": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "it", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2555546829, "num_examples": 58015, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 656285877, "num_examples": 12928, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 621955330, "num_examples": 12928, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 671213467, "num_examples": 14549, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 564610354, "num_examples": 12189, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/it.tar.gz": {"num_bytes": 5585781573, "checksum": "3a75b1631958af1487ee49b13cd27efc951183737ed515832cf714ed20c97808"}}, "download_size": 5585781573, "post_processing_size": null, "dataset_size": 5069611857, "size_in_bytes": 10655393430}, "ja": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ja", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 27600264, "num_examples": 722, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 26475556, "num_examples": 632, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 22098940, "num_examples": 586, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 34588931, "num_examples": 885, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 17819020, "num_examples": 504, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ja.tar.gz": {"num_bytes": 152879796, "checksum": "3614cd0d0abac80794351c78183967c83179fab390d7e19cad97758eb85ae558"}}, "download_size": 152879796, "post_processing_size": null, "dataset_size": 128582711, "size_in_bytes": 281462507}, "ka": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ka", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 47790695, "num_examples": 1058, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 30301524, "num_examples": 656, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 24951079, "num_examples": 527, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2144603, "num_examples": 44, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 7004160, "num_examples": 139, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ka.tar.gz": {"num_bytes": 104280554, "checksum": "7677df9d650234306a11bf8518be5807e72e7d5fc440d391304d1b99dd5517f5"}}, "download_size": 104280554, "post_processing_size": null, "dataset_size": 112192061, "size_in_bytes": 216472615}, "kab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "kab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3219289101, "num_examples": 120530, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 446453041, "num_examples": 14622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 414159937, "num_examples": 14622, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2282481767, "num_examples": 88021, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 581587104, "num_examples": 18134, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/kab.tar.gz": {"num_bytes": 17171606918, "checksum": "d2089107d4f3a84856c457a436a47a883b872022f2085cfad0501469be91fd95"}}, "download_size": 17171606918, "post_processing_size": null, "dataset_size": 6943970950, "size_in_bytes": 24115577868}, "ky": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ky", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 75460488, "num_examples": 1955, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 57116561, "num_examples": 1503, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 61393867, "num_examples": 1511, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 258081579, "num_examples": 7223, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 41007711, "num_examples": 926, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ky.tar.gz": {"num_bytes": 579440853, "checksum": "6efe0ca5384d0419fcf5fda0e0229a1b5eb80d8eeba2d7528a4c3c9f2593206f"}}, "download_size": 579440853, "post_processing_size": null, "dataset_size": 493060206, "size_in_bytes": 1072501059}, "lg": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lg", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 46910479, "num_examples": 1250, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 26951803, "num_examples": 584, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 16709367, "num_examples": 384, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 111180838, "num_examples": 3110, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 14069959, "num_examples": 290, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lg.tar.gz": {"num_bytes": 208197149, "checksum": "71243c65f638cd7f392fabe22e37cbafbdca4eb5a199210000ae957a88768040"}}, "download_size": 208197149, "post_processing_size": null, "dataset_size": 215822446, "size_in_bytes": 424019595}, "lt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 34605356, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 19940391, "num_examples": 466, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 10462851, "num_examples": 244, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 71150206, "num_examples": 1629, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 4414780, "num_examples": 102, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lt.tar.gz": {"num_bytes": 135299706, "checksum": "5ad3d93bc308f58a70e6685f71ae035237ef9caa0922232ac76846f7587bb8aa"}}, "download_size": 135299706, "post_processing_size": null, "dataset_size": 140573584, "size_in_bytes": 275873290}, "lv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 67269173, "num_examples": 2552, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 56937435, "num_examples": 1882, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 55289058, "num_examples": 2002, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 40259801, "num_examples": 1560, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 4383319, "num_examples": 143, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lv.tar.gz": {"num_bytes": 208307691, "checksum": "8a4350ccf24884ee1012032bfd5a87e0de50d780b1f8450d1cb52afe3f69c671"}}, "download_size": 208307691, "post_processing_size": null, "dataset_size": 224138786, "size_in_bytes": 432446477}, "mn": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "mn", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 89913910, "num_examples": 2183, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 86737041, "num_examples": 1862, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 82343275, "num_examples": 1837, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 146365394, "num_examples": 3272, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 31764232, "num_examples": 667, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mn.tar.gz": {"num_bytes": 486369317, "checksum": "3aebc40d40eb19263576664a981f4bb8b221abeab78c8154adc3d16875c75ec7"}}, "download_size": 486369317, "post_processing_size": null, "dataset_size": 437123852, "size_in_bytes": 923493169}, "mt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "mt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 73850815, "num_examples": 2036, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 66520195, "num_examples": 1617, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 56412066, "num_examples": 1516, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 220666971, "num_examples": 5714, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 12328068, "num_examples": 314, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mt.tar.gz": {"num_bytes": 425114242, "checksum": "9d53000d7832d130c4d35fb412bfc092ab8de8e763a5d2a528aebf37f052af03"}}, "download_size": 425114242, "post_processing_size": null, "dataset_size": 429778115, "size_in_bytes": 854892357}, "nl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "nl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 321946148, "num_examples": 9460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 205287443, "num_examples": 5708, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 186095353, "num_examples": 4938, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 801418, "num_examples": 27, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 115133112, "num_examples": 3308, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/nl.tar.gz": {"num_bytes": 1741827548, "checksum": "048f823408e3bbd16e63111d1b4caecb0102606c440bbdf3e5b6a6bae1e1e3f1"}}, "download_size": 1741827548, "post_processing_size": null, "dataset_size": 829263474, "size_in_bytes": 2571091022}, "or": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "or", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 16067910, "num_examples": 388, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4270651, "num_examples": 98, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5485937, "num_examples": 129, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 177775963, "num_examples": 4302, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2701922, "num_examples": 62, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/or.tar.gz": {"num_bytes": 199077358, "checksum": "f3edad30166fe454f4d2b14adeece1434dc4b8eb7b0ece37aac8389b7122218a"}}, "download_size": 199077358, "post_processing_size": null, "dataset_size": 206302383, "size_in_bytes": 405379741}, "pa-IN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pa-IN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7572499, "num_examples": 211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4375532, "num_examples": 116, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1702492, "num_examples": 44, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 56683312, "num_examples": 1411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1690766, "num_examples": 43, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pa-IN.tar.gz": {"num_bytes": 69748265, "checksum": "d2e30f28a227ecb8209340c4133edf6489f35f8e3d1eb55ff22b96b12f36952c"}}, "download_size": 69748265, "post_processing_size": null, "dataset_size": 72024601, "size_in_bytes": 141772866}, "pl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 273394509, "num_examples": 7468, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 205047541, "num_examples": 5153, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 195917307, "num_examples": 5153, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 442144781, "num_examples": 12848, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 180801918, "num_examples": 4601, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pl.tar.gz": {"num_bytes": 3537012341, "checksum": "acbf77d36e083e2bcb7152ffb52ab7d1e3e64d33a3f51f106cdff7feff6279aa"}}, "download_size": 3537012341, "post_processing_size": null, "dataset_size": 1297306056, "size_in_bytes": 4834318397}, "pt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 231451724, "num_examples": 6514, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 180108694, "num_examples": 4641, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 165966139, "num_examples": 4592, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 283497435, "num_examples": 8390, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 67948392, "num_examples": 1740, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pt.tar.gz": {"num_bytes": 1704252567, "checksum": "6700de499f728e0e3f3ed4d7005e5b7db27ba2ddc872b21b0b404c3b4859d84b"}}, "download_size": 1704252567, "post_processing_size": null, "dataset_size": 928972384, "size_in_bytes": 2633224951}, "rm-sursilv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rm-sursilv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62396326, "num_examples": 1384, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 51707733, "num_examples": 1194, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 52114252, "num_examples": 1205, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 93351293, "num_examples": 2102, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 30593270, "num_examples": 639, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-sursilv.tar.gz": {"num_bytes": 275950479, "checksum": "3cfc4971b6ab8958d7c3d784977690fcc04ebd7570ecf788d5948df84a5481a1"}}, "download_size": 275950479, "post_processing_size": null, "dataset_size": 290162874, "size_in_bytes": 566113353}, "rm-vallader": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rm-vallader", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29528457, "num_examples": 574, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 18805466, "num_examples": 378, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 17012341, "num_examples": 357, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 36890435, "num_examples": 727, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 9356204, "num_examples": 374, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-vallader.tar.gz": {"num_bytes": 108113989, "checksum": "4fdb7dc5e20862a636ee7975831b39db29012d615f9139edf2d266b878ce43ae"}}, "download_size": 108113989, "post_processing_size": null, "dataset_size": 111592903, "size_in_bytes": 219706892}, "ro": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ro", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 107235430, "num_examples": 3399, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 60106568, "num_examples": 1778, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 30358457, "num_examples": 858, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 65805210, "num_examples": 1945, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 11108104, "num_examples": 485, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ro.tar.gz": {"num_bytes": 261978702, "checksum": "450b159e936ef6ff136fcdfad193675caec5b2230d1b6ca24c5cde491ff002cd"}}, "download_size": 261978702, "post_processing_size": null, "dataset_size": 274613769, "size_in_bytes": 536592471}, "ru": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ru", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 686168722, "num_examples": 15481, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 385349488, "num_examples": 8007, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 361164462, "num_examples": 7963, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 450644862, "num_examples": 10247, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 145739451, "num_examples": 3056, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ru.tar.gz": {"num_bytes": 3655676916, "checksum": "dcbb460e58d4afc78047c3801c9eb56d940b388eb350ee3da3de5bfe5a74a025"}}, "download_size": 3655676916, "post_processing_size": null, "dataset_size": 2029066985, "size_in_bytes": 5684743901}, "rw": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rw", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 21645788973, "num_examples": 515197, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 707959382, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 698662384, "num_examples": 15032, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 923146896, "num_examples": 22923, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 7969286423, "num_examples": 206790, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rw.tar.gz": {"num_bytes": 42545189583, "checksum": "cf8a07059b3713022d487f9a6b8f465271f3457c525a8b350f829f87b0132b41"}}, "download_size": 42545189583, "post_processing_size": null, "dataset_size": 31944844058, "size_in_bytes": 74490033641}, "sah": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sah", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 68286985, "num_examples": 1442, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 38534020, "num_examples": 757, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 17900397, "num_examples": 405, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 62594222, "num_examples": 1275, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 3594160, "num_examples": 66, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sah.tar.gz": {"num_bytes": 181245626, "checksum": "dea1a454813c8f90abcbdf427fa922e1b7a116753deeb410af096ce5f0ae2405"}}, "download_size": 181245626, "post_processing_size": null, "dataset_size": 190909784, "size_in_bytes": 372155410}, "sl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 66122967, "num_examples": 2038, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 26872195, "num_examples": 881, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 16353097, "num_examples": 556, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 79268518, "num_examples": 2502, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 3048301, "num_examples": 92, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sl.tar.gz": {"num_bytes": 222751292, "checksum": "184cfbfe876a1f1c6317e4e34680c82a940db833afca78203c2929db1768a353"}}, "download_size": 222751292, "post_processing_size": null, "dataset_size": 191665078, "size_in_bytes": 414416370}, "sv-SE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sv-SE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62727263, "num_examples": 2331, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 59127381, "num_examples": 2027, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 53846355, "num_examples": 2019, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 109970049, "num_examples": 3043, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 13462567, "num_examples": 462, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sv-SE.tar.gz": {"num_bytes": 421434184, "checksum": "dc8634dafacb33be00f06e376f6c479d53f84f4834952593c8903f1080535213"}}, "download_size": 421434184, "post_processing_size": null, "dataset_size": 299133615, "size_in_bytes": 720567799}, "ta": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ta", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 69052658, "num_examples": 2009, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 67616865, "num_examples": 1781, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 63248009, "num_examples": 1779, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 246650792, "num_examples": 7428, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 23587453, "num_examples": 594, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ta.tar.gz": {"num_bytes": 679766097, "checksum": "78560d9d608a63ee75c3fdeb7f96f33cf0d85855ba6294b13e945de066eb46d8"}}, "download_size": 679766097, "post_processing_size": null, "dataset_size": 470155777, "size_in_bytes": 1149921874}, "th": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "th", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 100435725, "num_examples": 2917, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 82030679, "num_examples": 2188, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 63237632, "num_examples": 1922, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 95235301, "num_examples": 2671, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 18247080, "num_examples": 467, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/th.tar.gz": {"num_bytes": 341305736, "checksum": "a3d11043c49d3ea8ffb58dfab117cd831dd62a641e0a26ac60eb43e483534f7a"}}, "download_size": 341305736, "post_processing_size": null, "dataset_size": 359186417, "size_in_bytes": 700492153}, "tr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "tr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 57879052, "num_examples": 1831, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 60268059, "num_examples": 1647, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 54914798, "num_examples": 1647, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 10954154, "num_examples": 325, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 59288266, "num_examples": 1726, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tr.tar.gz": {"num_bytes": 620848700, "checksum": "b3f266c868b1fe9f76270ba76226b1cdc17f33b3e387e6b44a64d5419f8b9768"}}, "download_size": 620848700, "post_processing_size": null, "dataset_size": 243304329, "size_in_bytes": 864153029}, "tt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "tt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 348132697, "num_examples": 11211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 135120057, "num_examples": 4485, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 61690964, "num_examples": 2127, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 62158038, "num_examples": 1798, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 10403128, "num_examples": 287, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tt.tar.gz": {"num_bytes": 777153207, "checksum": "89c8d7a49584de720f1790df39e6f07996e2eecb07f6273f4ba2668e9fe4ad46"}}, "download_size": 777153207, "post_processing_size": null, "dataset_size": 617504884, "size_in_bytes": 1394658091}, "uk": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "uk", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 161925063, "num_examples": 4035, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 138422211, "num_examples": 3235, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 135483169, "num_examples": 3236, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 327979131, "num_examples": 8161, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 55745301, "num_examples": 1255, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/uk.tar.gz": {"num_bytes": 1218559031, "checksum": "f3ca0143cd84f5eacb583187052e69efec21c571a426efee91a765a2284519c2"}}, "download_size": 1218559031, "post_processing_size": null, "dataset_size": 819554875, "size_in_bytes": 2038113906}, "vi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "vi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6244454, "num_examples": 221, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6656365, "num_examples": 198, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6531856, "num_examples": 200, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 31315434, "num_examples": 870, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2981661, "num_examples": 78, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vi.tar.gz": {"num_bytes": 51929480, "checksum": "704bce8031932377cc21c017923ff1e96ebd2be9bd520adcf839f7a0f5f03b6e"}}, "download_size": 51929480, "post_processing_size": null, "dataset_size": 53729770, "size_in_bytes": 105659250}, "vot": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "vot", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 146467, "num_examples": 3, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 7963322, "num_examples": 411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 107949, "num_examples": 6, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vot.tar.gz": {"num_bytes": 7792602, "checksum": "7fb07dd25b0575e8cd811bb8d1e5aebd17fdbca079a4ee50d81e0aaaff50f8b0"}}, "download_size": 7792602, "post_processing_size": null, "dataset_size": 8217738, "size_in_bytes": 16010340}, "zh-CN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-CN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 793667379, "num_examples": 18541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 420202544, "num_examples": 8760, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 396096323, "num_examples": 8743, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 381264783, "num_examples": 8948, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 266234479, "num_examples": 5305, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-CN.tar.gz": {"num_bytes": 2184602350, "checksum": "cd8589cac28541f9f996d1954f14c307954f1146ac44a8eadad8e31ebaf1f15e"}}, "download_size": 2184602350, "post_processing_size": null, "dataset_size": 2257465508, "size_in_bytes": 4442067858}, "zh-HK": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-HK", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 221459521, "num_examples": 7506, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 217627041, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 196071110, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1319233252, "num_examples": 38830, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 124170969, "num_examples": 2999, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-HK.tar.gz": {"num_bytes": 2774145806, "checksum": "8a525ce4664d6647701449d5e72f7d8658cc3a5fabc72e05c6883994fd3c0134"}}, "download_size": 2774145806, "post_processing_size": null, "dataset_size": 2078561893, "size_in_bytes": 4852707699}, "zh-TW": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n title = {Common Voice: A Massively-Multilingual Speech Corpus},\n booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n pages = {4211--4215},\n year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-TW", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 97323787, "num_examples": 3507, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 85512325, "num_examples": 2895, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 80402637, "num_examples": 2895, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 623801957, "num_examples": 22477, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 100241443, "num_examples": 3584, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-TW.tar.gz": {"num_bytes": 2182836295, "checksum": "67fadf561f8237690d4a4a1d63a9b3ac271b5d05438dc745b7e04282d909460f"}}, "download_size": 2182836295, "post_processing_size": null, "dataset_size": 987282149, "size_in_bytes": 3170118444}} \ No newline at end of file diff --git a/datasets/librispeech_asr/dataset_infos.json b/datasets/librispeech_asr/dataset_infos.json index 737f150eba8..3ff72c90ec3 100644 --- a/datasets/librispeech_asr/dataset_infos.json +++ b/datasets/librispeech_asr/dataset_infos.json @@ -1 +1 @@ -{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 11823891, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 43049490, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 894510, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 868614, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 56636505, "size_in_bytes": 30178014159}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 59561081, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 907644, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 934838, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 61403563, "size_in_bytes": 31297968940}} \ No newline at end of file +{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 6619683041, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 23898214592, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 359572231, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 367705423, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 31245175287, "size_in_bytes": 61366552941}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 31810256902, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 337283304, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 352396474, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 32499936680, "size_in_bytes": 63736502057}} \ No newline at end of file diff --git a/datasets/openslr/dataset_infos.json b/datasets/openslr/dataset_infos.json index cd269515fb2..e3de4f34dbb 100644 --- a/datasets/openslr/dataset_infos.json +++ b/datasets/openslr/dataset_infos.json @@ -1 +1 @@ -{"SLR41": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR41", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2363510, "num_examples": 5822, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/41/jv_id_female.zip": {"num_bytes": 967179448, "checksum": "6fd795a441b3ddd62d6131d4bbd9231151af89f5d9ce5ac7d8ecb370a49576c7"}, "https://openslr.org/resources/41/jv_id_male.zip": {"num_bytes": 923612912, "checksum": "6ee23916b7489420a538e7032f58d7be088a615fb67ec3e7043414d436bb5c1a"}}, "download_size": 1890792360, "post_processing_size": null, "dataset_size": 2363510, "size_in_bytes": 1893155870}, "SLR42": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR42", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1397844, "num_examples": 2906, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/42/km_kh_male.zip": {"num_bytes": 866086951, "checksum": "c0ec9c0494c57f04cf1f2d8d2668d517598375f24e34de07272ecd637c332591"}}, "download_size": 866086951, "post_processing_size": null, "dataset_size": 1397844, "size_in_bytes": 867484795}, "SLR43": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR43", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1052597, "num_examples": 2064, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/43/ne_np_female.zip": {"num_bytes": 800375645, "checksum": "3f355b543e1fad7af5e63116db871fac8e0a2d2f1a2c8f6ebc742270819da101"}}, "download_size": 800375645, "post_processing_size": null, "dataset_size": 1052597, "size_in_bytes": 801428242}, "SLR44": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR44", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1733125, "num_examples": 4213, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/44/su_id_female.zip": {"num_bytes": 861425671, "checksum": "aa75bdef23b7bf0b980431d68df6bb32f695f3be365eb379d4c22516d2d11c5a"}, "https://openslr.org/resources/44/su_id_male.zip": {"num_bytes": 610827081, "checksum": "cabed03a45d4ce0f76e2de4d34b82d6876cd00d5ad6a5349629359028460652d"}}, "download_size": 1472252752, "post_processing_size": null, "dataset_size": 1733125, "size_in_bytes": 1473985877}, "SLR63": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR63", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1973791, "num_examples": 4126, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/63/ml_in_female.zip": {"num_bytes": 710218411, "checksum": "e82d70717d20304f20f635d248c8cb1fd0c9c888e35b9105c8306fc76498a67e"}, "https://openslr.org/resources/63/ml_in_male.zip": {"num_bytes": 635657888, "checksum": "d1a6de4f58f53b973596ff1c69a64afea70f899b044397ce37465c626eee2ab9"}}, "download_size": 1345876299, "post_processing_size": null, "dataset_size": 1973791, "size_in_bytes": 1347850090}, "SLR64": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR64", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 794097, "num_examples": 1569, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/64/mr_in_female.zip": {"num_bytes": 712155683, "checksum": "42b770ee87c95379b55e187b17dccb9fbacb05d0e8292430ffe16a7483948fe5"}}, "download_size": 712155683, "post_processing_size": null, "dataset_size": 794097, "size_in_bytes": 712949780}, "SLR65": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR65", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2092011, "num_examples": 4284, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/65/ta_in_female.zip": {"num_bytes": 769504014, "checksum": "fe00da10ae12ecd6dbe1afcc5abe365d44ad9036fb017cbd73bcfed71e0f8c81"}, "https://openslr.org/resources/65/ta_in_male.zip": {"num_bytes": 603800641, "checksum": "80e546e954939c92a0cd732446418b583b61da9f538f83b00cbd445cbebd4395"}}, "download_size": 1373304655, "post_processing_size": null, "dataset_size": 2092011, "size_in_bytes": 1375396666}, "SLR66": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR66", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1852199, "num_examples": 4448, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/66/te_in_female.zip": {"num_bytes": 505680804, "checksum": "3aa3c22d6fad33ed68951f4934ae47349ee76b77220d8261ec3bda8c24bf42b2"}, "https://openslr.org/resources/66/te_in_male.zip": {"num_bytes": 529447066, "checksum": "f8a0f239d39088b6702a2186681e2874328e9fcd9bfa6a0dd9e1dc5695be3185"}}, "download_size": 1035127870, "post_processing_size": null, "dataset_size": 1852199, "size_in_bytes": 1036980069}, "SLR69": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR69", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1603279, "num_examples": 4240, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/69/ca_es_female.zip": {"num_bytes": 1043934596, "checksum": "2ec39de70550a1cdb93aee960967125fb652b8d26b8de4f6e8658c62847c3f11"}, "https://openslr.org/resources/69/ca_es_male.zip": {"num_bytes": 804724947, "checksum": "8b412ffaa65cd85692c6eab038fc085a8ae5613c6eed38c097a65946c2ee9146"}}, "download_size": 1848659543, "post_processing_size": null, "dataset_size": 1603279, "size_in_bytes": 1850262822}, "SLR35": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR35", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 71645434, "num_examples": 185076, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/35/asr_javanese_0.zip": {"num_bytes": 1197540348, "checksum": "a871c8b71ff8fa9d95955447ca0c388e8c6f925aecfce92e1880bda2da113bcb"}, "https://openslr.org/resources/35/asr_javanese_1.zip": {"num_bytes": 1172552957, "checksum": "8024b18acc265bd502f2c36930ab41bd9a8a9cbc67d3db340698df1f6799eeef"}, "https://openslr.org/resources/35/asr_javanese_2.zip": {"num_bytes": 1187099390, "checksum": "c1605da9f74b0951533bcd9bb66a868dc4552929a6e3597d1f6b66c8436cd87e"}, "https://openslr.org/resources/35/asr_javanese_3.zip": {"num_bytes": 1178721705, "checksum": "f813cfa6ea5db1a2c7af65d62dd4d2edc932e67990570f0e5418675c0c9443d3"}, "https://openslr.org/resources/35/asr_javanese_4.zip": {"num_bytes": 1174850803, "checksum": "506af733d9c1f02372e83e997c924fac5a8141a7920d1ab345bd607e26438f0c"}, "https://openslr.org/resources/35/asr_javanese_5.zip": {"num_bytes": 1178642105, "checksum": "5300df2d2fd95033632fe7d3d77042804c92bf4f9983f11e707c20e358e45a91"}, "https://openslr.org/resources/35/asr_javanese_6.zip": {"num_bytes": 1197026293, "checksum": "a487e12f9d3fd1d3e6d8a8c2b58363813d6121e6a84937ec0d27601fea2654db"}, "https://openslr.org/resources/35/asr_javanese_7.zip": {"num_bytes": 1197789186, "checksum": "944ce7e3463f2e0d6024f8a1768e161a64dd4ab7cf8a96b7924fb8666ae2142e"}, "https://openslr.org/resources/35/asr_javanese_8.zip": {"num_bytes": 1185807385, "checksum": "cb598b81bd681dc51965c912bf4aabc4af6eb9b57d5a7cb0998ed121cec63dcd"}, "https://openslr.org/resources/35/asr_javanese_9.zip": {"num_bytes": 1160028499, "checksum": "7ee9de72360a59dc2a3cd3570627565a638d7a47f0f95ce4c14545bc9b6690b2"}, "https://openslr.org/resources/35/asr_javanese_a.zip": {"num_bytes": 1176016135, "checksum": "1fd1e4b06ed5d18614ef7ce414e7e0b6c105d6f5d87b3a6210fcedc4cc6f35cd"}, "https://openslr.org/resources/35/asr_javanese_b.zip": {"num_bytes": 1176960512, "checksum": "036bb70c60e8ba4b9be090dcd717e1da8744dd1cfdfab1eb4a4cd29d7755b938"}, "https://openslr.org/resources/35/asr_javanese_c.zip": {"num_bytes": 1178017086, "checksum": "a46d7b1ad184a4c2ac9099c8399f18fb8b14dd9ab4172a61f8abe3e464f7b2b9"}, "https://openslr.org/resources/35/asr_javanese_d.zip": {"num_bytes": 1199910382, "checksum": "9f3058916fe721f92a4d1a6c2794d82920b7c88ed780ef06fe69f8e448d0ddb6"}, "https://openslr.org/resources/35/asr_javanese_e.zip": {"num_bytes": 1175431904, "checksum": "d9234d3331fb11c082bc17f3b54c13dfa183c4cb13e35c030f7a1dbbe4c819cd"}, "https://openslr.org/resources/35/asr_javanese_f.zip": {"num_bytes": 1163711036, "checksum": "1bedbc295e4d1592e5730da8f0774fe360fe146d193b9c9815a8025072dd0b70"}}, "download_size": 18900105726, "post_processing_size": null, "dataset_size": 71645434, "size_in_bytes": 18971751160}, "SLR36": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR36", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 86668853, "num_examples": 219156, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/36/asr_sundanese_0.zip": {"num_bytes": 1433294860, "checksum": "947a0ac86008b88130f7c8f1b27d4a0f93886f653cf65b5948c0532cd0097c0d"}, "https://openslr.org/resources/36/asr_sundanese_1.zip": {"num_bytes": 1445470477, "checksum": "365f052dd9d977343002289ea1f29dea466f1243e5edf22dfb933e3fa93a6d87"}, "https://openslr.org/resources/36/asr_sundanese_2.zip": {"num_bytes": 1431289018, "checksum": "f9b9ee2a925d4fd934be3ebe09545ffb3f294f1e6d1380e837054fdf4ce8cff2"}, "https://openslr.org/resources/36/asr_sundanese_3.zip": {"num_bytes": 1446805642, "checksum": "ba3cc0e8e351a5456269c72edf7a3b50cf820941f93d7eed0e8f02a3b1b0a89f"}, "https://openslr.org/resources/36/asr_sundanese_4.zip": {"num_bytes": 1449187658, "checksum": "a6ca66e2537bd55dfaea4e716d847c70aead58c217184ab37afbd4065cca9262"}, "https://openslr.org/resources/36/asr_sundanese_5.zip": {"num_bytes": 1425741894, "checksum": "31bb8a9981b45855ab0b7c634c89040fe99b122455750a6ab956393dc9dec0d8"}, "https://openslr.org/resources/36/asr_sundanese_6.zip": {"num_bytes": 1415730042, "checksum": "3f23d6c4c67dc6f39a8ebb2af43e2efedb57028abb85eb519394f2d9ef8b3a21"}, "https://openslr.org/resources/36/asr_sundanese_7.zip": {"num_bytes": 1436967650, "checksum": "bce8f33b6ed62978915dfc601957162e9eece8bc3190cd2d548d7679409a3d77"}, "https://openslr.org/resources/36/asr_sundanese_8.zip": {"num_bytes": 1436421462, "checksum": "755e0af77d0bd6d4aa7895b2ab9fbf792c57efc49c8cec21d3d728fe3374b621"}, "https://openslr.org/resources/36/asr_sundanese_9.zip": {"num_bytes": 1434660332, "checksum": "5d426d2c99eb91ffd3db193d510e288133c426556430fe2e70e08f58815f5a31"}, "https://openslr.org/resources/36/asr_sundanese_a.zip": {"num_bytes": 1436753516, "checksum": "e032537b62aa8a8abe660bca418ac2e26a93bdc7a357b948a301bde286952fa5"}, "https://openslr.org/resources/36/asr_sundanese_b.zip": {"num_bytes": 1435014221, "checksum": "e999e83fde37ec973b1a1822aaa8769488c2a95058a3448661ac94c319881549"}, "https://openslr.org/resources/36/asr_sundanese_c.zip": {"num_bytes": 1429102490, "checksum": "275ac684fe7b8bf012dc251ddb91496e2d95c2c257ec87ab0847efa379e96787"}, "https://openslr.org/resources/36/asr_sundanese_d.zip": {"num_bytes": 1432973082, "checksum": "34ae64f8a29ddef2e05ca5ce8122b461a737d58d796dbe577a4e8a4a05c6b2ce"}, "https://openslr.org/resources/36/asr_sundanese_e.zip": {"num_bytes": 1443609656, "checksum": "25e36087063e0cc5e54cf04e5a4e065b19e0c1bc9cbc07a9f98635941b53bfea"}, "https://openslr.org/resources/36/asr_sundanese_f.zip": {"num_bytes": 1463531929, "checksum": "3d1410c31cc70994f82b9555967fa4c8d682aee288cc85b05b9c4e6352a49f14"}}, "download_size": 22996553929, "post_processing_size": null, "dataset_size": 86668853, "size_in_bytes": 23083222782}, "SLR70": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR70", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1304770, "num_examples": 3359, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/70/en_ng_female.zip": {"num_bytes": 759856787, "checksum": "e840afea824c9075db8c7d574e993837c6a4861fd0ff0275c4cc223aa00a785c"}, "https://openslr.org/resources/70/en_ng_male.zip": {"num_bytes": 454098409, "checksum": "f619d09d5ffdf0d4044ef1d57585eeaa50c0cbf08844782a9dd08f56ea9e567f"}}, "download_size": 1213955196, "post_processing_size": null, "dataset_size": 1304770, "size_in_bytes": 1215259966}, "SLR71": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR71", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1630901, "num_examples": 4374, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/71/es_cl_female.zip": {"num_bytes": 585615697, "checksum": "23593f3dac085d26f99df38159c1ab0ae2c23f5c97ad869292496abc6e171bc6"}, "https://openslr.org/resources/71/es_cl_male.zip": {"num_bytes": 859750206, "checksum": "ace2cbd6df28e94fdd636ba1263b72b557722b0d2abcf4c6e072011ac870cbee"}}, "download_size": 1445365903, "post_processing_size": null, "dataset_size": 1630901, "size_in_bytes": 1446996804}, "SLR72": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR72", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1825435, "num_examples": 4903, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/72/es_co_female.zip": {"num_bytes": 801960444, "checksum": "03721aa7b6b7fe1dd309a0c545cbef4898fac99ed811f4e1769b2fc16bb7eb70"}, "https://openslr.org/resources/72/es_co_male.zip": {"num_bytes": 810070088, "checksum": "2e72abf283adf3f52c28d9f4d59709d4a24fa57243dc696a99dfbc1b8e534c9a"}}, "download_size": 1612030532, "post_processing_size": null, "dataset_size": 1825435, "size_in_bytes": 1613855967}, "SLR73": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR73", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2027542, "num_examples": 5447, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/73/es_pe_female.zip": {"num_bytes": 913983951, "checksum": "0bcb138a6a4657fa52ec6ec129807dc2476d9a89184ea2ab4f588bbbddc12062"}, "https://openslr.org/resources/73/es_pe_male.zip": {"num_bytes": 1026322863, "checksum": "8baf41802bc59f7d170ee091d8676db725903efdcfeda12d699a31a746ae50bf"}}, "download_size": 1940306814, "post_processing_size": null, "dataset_size": 2027542, "size_in_bytes": 1942334356}, "SLR74": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR74", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 230997, "num_examples": 617, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/74/es_pr_female.zip": {"num_bytes": 214181314, "checksum": "0ff2f4ed63fbbc4305140bb88c71ca9a72b18c6686a755534b47ae28dce2861d"}}, "download_size": 214181314, "post_processing_size": null, "dataset_size": 230997, "size_in_bytes": 214412311}, "SLR75": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR75", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1252119, "num_examples": 3357, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/75/es_ve_female.zip": {"num_bytes": 517000277, "checksum": "4600baead7519afaa5f6b33cf3f4b2373e7f1902aa72841fc38582660b07fe31"}, "https://openslr.org/resources/75/es_ve_male.zip": {"num_bytes": 526316727, "checksum": "3cf8703b1b61de1bf964e26f0a2c7f0ec637b1a85eafd982e98de9301558b289"}}, "download_size": 1043317004, "post_processing_size": null, "dataset_size": 1252119, "size_in_bytes": 1044569123}, "SLR76": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR76", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2682483, "num_examples": 7136, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/76/eu_es_female.zip": {"num_bytes": 1622676657, "checksum": "b3eaa91f2be198c8455f46e802f671e33cba5d95909e58e0b59cb6638f5b4947"}, "https://openslr.org/resources/76/eu_es_male.zip": {"num_bytes": 1418448856, "checksum": "787bcb8369d3797a6b34b0e2d420f5255e12e6c6a385cd4e72ddde59c6018227"}}, "download_size": 3041125513, "post_processing_size": null, "dataset_size": 2682483, "size_in_bytes": 3043807996}, "SLR77": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR77", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2159694, "num_examples": 5587, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/77/gl_es_female.zip": {"num_bytes": 1656677564, "checksum": "e2cda7ef8d5f57b5f3086473d5297e6bb73757f0c446409245f407d7612c5060"}, "https://openslr.org/resources/77/gl_es_male.zip": {"num_bytes": 551314211, "checksum": "b768ed0b77fb4e88adf795dedcc872c53a4348ee8d11eb8efb4571fff94688be"}}, "download_size": 2207991775, "post_processing_size": null, "dataset_size": 2159694, "size_in_bytes": 2210151469}, "SLR78": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR78", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2077670, "num_examples": 4272, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/78/gu_in_female.zip": {"num_bytes": 917450036, "checksum": "bbda0815e0d2e01ad9310768e0e2be9efb612a9c56c66c4ab2f32b817da5c786"}, "https://openslr.org/resources/78/gu_in_male.zip": {"num_bytes": 825772066, "checksum": "ce474d1686104b3bd274a2d5192459cb4dee6e0c9bbcf3de1bb3b39c6ab89caf"}}, "download_size": 1743222102, "post_processing_size": null, "dataset_size": 2077670, "size_in_bytes": 1745299772}, "SLR79": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR79", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2130895, "num_examples": 4400, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/79/kn_in_female.zip": {"num_bytes": 980825420, "checksum": "182a147e5747ad4f4ac50a5e7e1ee3683e1c2c1d9105963365d151d664466b62"}, "https://openslr.org/resources/79/kn_in_male.zip": {"num_bytes": 840093695, "checksum": "38e3c0c51f792a3655cc8f4747b339df8ec4b1031a0fff590c1a1af6a8bbbcdf"}}, "download_size": 1820919115, "post_processing_size": null, "dataset_size": 2130895, "size_in_bytes": 1823050010}, "SLR80": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR80", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1282403, "num_examples": 2530, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/80/my_mm_female.zip": {"num_bytes": 948181015, "checksum": "a7cdcaa5e06864e02fa18fc0fe9595feadf332d6a63aadc01ce51a24969a2708"}}, "download_size": 948181015, "post_processing_size": null, "dataset_size": 1282403, "size_in_bytes": 949463418}, "SLR86": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR86", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 1341639, "num_examples": 3583, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/86/yo_ng_female.zip": {"num_bytes": 462033045, "checksum": "8875ebc839e57a3318ba1ce37d98c35da46d4f99f9f777f83fcf074257804060"}, "https://openslr.org/resources/86/yo_ng_male.zip": {"num_bytes": 445032517, "checksum": "58519b27f6954c446d0e7221b227a6f342b9c5ea66bf02af40c1616e086afc4c"}}, "download_size": 907065562, "post_processing_size": null, "dataset_size": 1341639, "size_in_bytes": 908407201}, "SLR32": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR32", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3958024, "num_examples": 9821, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/32/af_za.tar.gz": {"num_bytes": 950827926, "checksum": "b702a68486bf16cbf302d6e0808ea2e966f3dfa720ea0d6ce36d881aa266978f"}, "https://openslr.org/resources/32/st_za.tar.gz": {"num_bytes": 724425648, "checksum": "509202bcf6fae3b24508cfdbc3a6c886b29b4c3d822adbf6c40b21d98ada3fcf"}, "https://openslr.org/resources/32/tn_za.tar.gz": {"num_bytes": 729406193, "checksum": "3e6a522d2fafa071ec1d484cb79336ff36008a5d5d34e1444984e5df8312eb6f"}, "https://openslr.org/resources/32/xh_za.tar.gz": {"num_bytes": 907498093, "checksum": "712336c82637cbfb4304766dd7c0889bac1664945aed08bafb49eac29ae756c3"}}, "download_size": 3312157860, "post_processing_size": null, "dataset_size": 3958024, "size_in_bytes": 3316115884}, "SLR52": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR52", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 75447705, "num_examples": 185293, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/52/asr_sinhala_0.zip": {"num_bytes": 915237858, "checksum": "41bcd4cf6edde39e49bf8ca6b54c32e1403609759ff9edea2a2696ef7aa8fff5"}, "https://openslr.org/resources/52/asr_sinhala_1.zip": {"num_bytes": 908852134, "checksum": "7a4dd3279254f06ba8d1e864d2aa68eec1e6740cfc2b718d2bc060b878871e74"}, "https://openslr.org/resources/52/asr_sinhala_2.zip": {"num_bytes": 913568157, "checksum": "746b5ee016e09868016851ff2148000570b6cb6b9acde5d16527f20053d1cd14"}, "https://openslr.org/resources/52/asr_sinhala_3.zip": {"num_bytes": 901325452, "checksum": "a167e6bd9c0b64e105cc57528a455a4653303336b85731273039487d9f94afda"}, "https://openslr.org/resources/52/asr_sinhala_4.zip": {"num_bytes": 922493671, "checksum": "f17fc798ea085e876500095e8dd357d1088303598d190642978c353d51d2b94b"}, "https://openslr.org/resources/52/asr_sinhala_5.zip": {"num_bytes": 922505332, "checksum": "8285340d15064caa1da0635d50471c8de24d33e3d1ae7af3c63e4a23d3ba25fe"}, "https://openslr.org/resources/52/asr_sinhala_6.zip": {"num_bytes": 914729823, "checksum": "a511dc329dfc493c9e25d1315ab95da93a8a4b751e032c1848eeeb8655608403"}, "https://openslr.org/resources/52/asr_sinhala_7.zip": {"num_bytes": 911992962, "checksum": "8180736327c3147bac912c329fe3a571a61ecb6d4da7d4584acb0d34ab204fa5"}, "https://openslr.org/resources/52/asr_sinhala_8.zip": {"num_bytes": 924344925, "checksum": "fdf333751c254f8dc7b649fd1a48cf47ae8e855e369a182d88bee3325ae8a99d"}, "https://openslr.org/resources/52/asr_sinhala_9.zip": {"num_bytes": 920427318, "checksum": "288f4a7ea055b3963ad7d6a6e6e6189672715a42d0a1b6e99a1a8ba0fe67a9c6"}, "https://openslr.org/resources/52/asr_sinhala_a.zip": {"num_bytes": 901532849, "checksum": "da36de6739ce5b8c835c3c232d5122b883a88442ec3f91a534154b2a9177d0ec"}, "https://openslr.org/resources/52/asr_sinhala_b.zip": {"num_bytes": 924132571, "checksum": "4b5dd26de34b27e9cc88842e992626694fd329f23493f40c748d556c61395d2a"}, "https://openslr.org/resources/52/asr_sinhala_c.zip": {"num_bytes": 938991415, "checksum": "f6db1cece623fafe866a56b9f7100976823b32f968036b72a9a634138e87e92d"}, "https://openslr.org/resources/52/asr_sinhala_d.zip": {"num_bytes": 911368918, "checksum": "8ecc58c745998b05b21c8af05fdc741d437a654a8babba16c4970ad981074e2c"}, "https://openslr.org/resources/52/asr_sinhala_e.zip": {"num_bytes": 927771260, "checksum": "f5cbfd3c8d1c5bf6fe7a1c1ee606101368512a852856fb2d01f4dde7869f605a"}, "https://openslr.org/resources/52/asr_sinhala_f.zip": {"num_bytes": 917209429, "checksum": "65782dee2ba4256bab123835ef2277a3fd1116f20f403a2c4ff5ace3ac45714c"}}, "download_size": 14676484074, "post_processing_size": null, "dataset_size": 75447705, "size_in_bytes": 14751931779}, "SLR53": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR53", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 85804462, "num_examples": 218703, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/53/asr_bengali_0.zip": {"num_bytes": 919838172, "checksum": "c1bbeadbcffae8a40d8e54f25c6c3dea922951a322cc7875a18f52dec127741a"}, "https://openslr.org/resources/53/asr_bengali_1.zip": {"num_bytes": 906161405, "checksum": "b6af5d30439d25a5df20efd85bfa2e900ee962e3afb91fe88a65cdbb0689cf84"}, "https://openslr.org/resources/53/asr_bengali_2.zip": {"num_bytes": 921562897, "checksum": "ac0b50d5ad38d5295c16b7eb62901b273bd6df55dea7b1a8495e69c1a50c0986"}, "https://openslr.org/resources/53/asr_bengali_3.zip": {"num_bytes": 918817316, "checksum": "444760953dc4e006cd6e38ea647b611c7be93a07a78b1a6b83974fe3ebba6b65"}, "https://openslr.org/resources/53/asr_bengali_4.zip": {"num_bytes": 908199672, "checksum": "975a1b690ccfe0609ba50738666758ad92c3683416d1cf7771972496adb4313f"}, "https://openslr.org/resources/53/asr_bengali_5.zip": {"num_bytes": 932042725, "checksum": "21dec790c4f96771a28347ed4430c74d3f3bff046684f4522c2301f7029f632d"}, "https://openslr.org/resources/53/asr_bengali_6.zip": {"num_bytes": 900826997, "checksum": "b0f93fb831bb36c75a6f4c0731bfb991f8b6529bc3b16ee0bede3e7108a7679e"}, "https://openslr.org/resources/53/asr_bengali_7.zip": {"num_bytes": 927750265, "checksum": "647cbcfb9c92930f4625dbc107f4218cdd37f8e3494df23d42917640da22938c"}, "https://openslr.org/resources/53/asr_bengali_8.zip": {"num_bytes": 927268934, "checksum": "73168b982a0665fb4f1104eaafeb3ddc01780b39978649e01ce6ab7850a86de1"}, "https://openslr.org/resources/53/asr_bengali_9.zip": {"num_bytes": 906382286, "checksum": "25f678604ffe93fc986cc402dc4a4329f36eb44ab627c645c4957dbf8e85917c"}, "https://openslr.org/resources/53/asr_bengali_a.zip": {"num_bytes": 900283300, "checksum": "daf0fc69dbd041fd254e96df1732359666ace7c9aea9d5c64c03ab8add3a00c4"}, "https://openslr.org/resources/53/asr_bengali_b.zip": {"num_bytes": 910050386, "checksum": "2d6fc0f464130bc3761546ac0e8b085921d5f1c9afbf886b9c1fa95f9755fd26"}, "https://openslr.org/resources/53/asr_bengali_c.zip": {"num_bytes": 897120616, "checksum": "116e8e63882f548410a3b835d2d3b6a11e6a05969374d173b9c01a8ba7112abd"}, "https://openslr.org/resources/53/asr_bengali_d.zip": {"num_bytes": 914366610, "checksum": "aa155d8e0688d032229ad7a5e4c713e696d1ea531feae83ae3230e526f1db7a6"}, "https://openslr.org/resources/53/asr_bengali_e.zip": {"num_bytes": 922936447, "checksum": "2f6f97591adde2b469f29b601ba33bfc3e8049681594fe31be8a55204c70ae15"}, "https://openslr.org/resources/53/asr_bengali_f.zip": {"num_bytes": 917202893, "checksum": "42542ec7d434bd6a34b30c01fa24de206fb2d2e56afea745a14867a8c0eaa32c"}}, "download_size": 14630810921, "post_processing_size": null, "dataset_size": 85804462, "size_in_bytes": 14716615383}, "SLR54": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR54", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 61097744, "num_examples": 157905, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/54/asr_nepali_0.zip": {"num_bytes": 589002210, "checksum": "6c783a5a731c7a9c2cac678823a2ee7866db1acbad7f9a199bce3bf7a64e22b6"}, "https://openslr.org/resources/54/asr_nepali_1.zip": {"num_bytes": 582088242, "checksum": "661865704f3d9adacd74f8c98cd0f6a6e869902c6441efb96c761573dd1d2f05"}, "https://openslr.org/resources/54/asr_nepali_2.zip": {"num_bytes": 589401540, "checksum": "a2b4c373d7ebe5f2d491bf73c2324e6f5645d724df58fd765c71a3a86e7ab6d4"}, "https://openslr.org/resources/54/asr_nepali_3.zip": {"num_bytes": 574596426, "checksum": "6a925d4448f98694185d50cacfa380fad128b47ebf9d5519526b83dd6586348d"}, "https://openslr.org/resources/54/asr_nepali_4.zip": {"num_bytes": 583746586, "checksum": "7315f69b392690c22db32b3c2f14b82b1f64215c5a21d697c421d6d220a55bf0"}, "https://openslr.org/resources/54/asr_nepali_5.zip": {"num_bytes": 572967016, "checksum": "3891b332a9fc55e4fb0579bf67431989e92ab05b9715c0e9673cf356e878e0df"}, "https://openslr.org/resources/54/asr_nepali_6.zip": {"num_bytes": 588104006, "checksum": "78c321a8f55a5aa0c56feb791826a2751087cc87a36b27bba56ac6b124eac73f"}, "https://openslr.org/resources/54/asr_nepali_7.zip": {"num_bytes": 588410232, "checksum": "8b05b8b4aedfc9829cf33cd65ab3c1474eb8f738078b414d40b61f08782064ec"}, "https://openslr.org/resources/54/asr_nepali_8.zip": {"num_bytes": 585192213, "checksum": "0125cfc7c54e44bd4ac01d5558130a752cad26aa7055df753c65b400ece2c9f8"}, "https://openslr.org/resources/54/asr_nepali_9.zip": {"num_bytes": 578834881, "checksum": "6c68e80fe7c58a33aeb91b5b9bc37a99f9374a8f629e2a109bddba51d1712b12"}, "https://openslr.org/resources/54/asr_nepali_a.zip": {"num_bytes": 587798317, "checksum": "03b7bf7b6ace01a677e2a0dd079053ea29abf45743f197761190f3f52678e6df"}, "https://openslr.org/resources/54/asr_nepali_b.zip": {"num_bytes": 584397714, "checksum": "9a98d93ae91e75c6928d9222b387105e99030b8b81df9ada57c87f6b317c0853"}, "https://openslr.org/resources/54/asr_nepali_c.zip": {"num_bytes": 579440365, "checksum": "8bac1a046a86fc3684bfec2e5af1b1e0916ec5c2f1be5ccb1fb4778ecd7bb357"}, "https://openslr.org/resources/54/asr_nepali_d.zip": {"num_bytes": 588470094, "checksum": "9aad327fd72efcc009d060a8299aa70ca1757f1ec32fe3280d53e449ef75e5c3"}, "https://openslr.org/resources/54/asr_nepali_e.zip": {"num_bytes": 578091869, "checksum": "4ba73ada7cf482611b3ad3e17a77685b1ac872e5840953c07a1c6c2b10a83e4a"}, "https://openslr.org/resources/54/asr_nepali_f.zip": {"num_bytes": 577705651, "checksum": "062f4908802ab0d57362da1dfea4898898f6d21ba09596c1e271c2cda47297c6"}}, "download_size": 9328247362, "post_processing_size": null, "dataset_size": 61097744, "size_in_bytes": 9389345106}, "SLR83": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR83", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7098985, "num_examples": 17877, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/83/irish_english_male.zip": {"num_bytes": 164531638, "checksum": "2e5dbae4cc27e0e24e21f21c8e7464d219feb703f5fee3e567de6561a05024ed"}, "https://openslr.org/resources/83/midlands_english_female.zip": {"num_bytes": 103085118, "checksum": "aa1083a319e52d658b85c162905ec27cdf2ac6d5645b4caeab05a385a2c8a37f"}, "https://openslr.org/resources/83/midlands_english_male.zip": {"num_bytes": 166833961, "checksum": "8192c7a0626eb742f9999e63162289f8f9a86c9cb49ef68298dc7f624acaebcf"}, "https://openslr.org/resources/83/northern_english_female.zip": {"num_bytes": 314983063, "checksum": "22b6229d08481e7605b028185dc55dccd0611db428854f2d485d9ff34395a65c"}, "https://openslr.org/resources/83/northern_english_male.zip": {"num_bytes": 817772034, "checksum": "b627d500d1b2e3c4921fb6d91338ead7b972f67c1c2f0babb300e0ef844c7248"}, "https://openslr.org/resources/83/scottish_english_female.zip": {"num_bytes": 351443880, "checksum": "2dbe5545a7ab87112c7730086586f738ec4f42171f7738628ba084ed4ba15ccb"}, "https://openslr.org/resources/83/scottish_english_male.zip": {"num_bytes": 620254118, "checksum": "c7d2d9cd581c48a8323f6cc3886d879e2e7aca5931d98228e07d07b350d9f9a9"}, "https://openslr.org/resources/83/southern_english_female.zip": {"num_bytes": 1636701939, "checksum": "e0a2e8e64b9efdbd7bae5cdf33ac8b81db495b499c9d40da0a7d7842e42b1e76"}, "https://openslr.org/resources/83/southern_english_male.zip": {"num_bytes": 1700955740, "checksum": "788b1c59fb5713b0e1efebc02b7aa1b55182b21955493b299b9941c70a878cad"}, "https://openslr.org/resources/83/welsh_english_female.zip": {"num_bytes": 595683538, "checksum": "3c2465b9618e33f42c7d2ee753b54ae593714e758e236efcdd56c14c5bd89f1d"}, "https://openslr.org/resources/83/welsh_english_male.zip": {"num_bytes": 757645790, "checksum": "eaf8de0f8872bb647d5c159bb33713cfd58966bd59d733f5f399793778ea5058"}}, "download_size": 7229890819, "post_processing_size": null, "dataset_size": 7098985, "size_in_bytes": 7236989804}} \ No newline at end of file +{"SLR41": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR41", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2423902, "num_examples": 5822, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/41/jv_id_female.zip": {"num_bytes": 967179448, "checksum": "6fd795a441b3ddd62d6131d4bbd9231151af89f5d9ce5ac7d8ecb370a49576c7"}, "https://openslr.org/resources/41/jv_id_male.zip": {"num_bytes": 923612912, "checksum": "6ee23916b7489420a538e7032f58d7be088a615fb67ec3e7043414d436bb5c1a"}}, "download_size": 1890792360, "post_processing_size": null, "dataset_size": 2423902, "size_in_bytes": 1893216262}, "SLR42": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR42", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1427984, "num_examples": 2906, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/42/km_kh_male.zip": {"num_bytes": 866086951, "checksum": "c0ec9c0494c57f04cf1f2d8d2668d517598375f24e34de07272ecd637c332591"}}, "download_size": 866086951, "post_processing_size": null, "dataset_size": 1427984, "size_in_bytes": 867514935}, "SLR43": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR43", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1074005, "num_examples": 2064, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/43/ne_np_female.zip": {"num_bytes": 800375645, "checksum": "3f355b543e1fad7af5e63116db871fac8e0a2d2f1a2c8f6ebc742270819da101"}}, "download_size": 800375645, "post_processing_size": null, "dataset_size": 1074005, "size_in_bytes": 801449650}, "SLR44": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR44", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1776827, "num_examples": 4213, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/44/su_id_female.zip": {"num_bytes": 861425671, "checksum": "aa75bdef23b7bf0b980431d68df6bb32f695f3be365eb379d4c22516d2d11c5a"}, "https://openslr.org/resources/44/su_id_male.zip": {"num_bytes": 610827081, "checksum": "cabed03a45d4ce0f76e2de4d34b82d6876cd00d5ad6a5349629359028460652d"}}, "download_size": 1472252752, "post_processing_size": null, "dataset_size": 1776827, "size_in_bytes": 1474029579}, "SLR63": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR63", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2016587, "num_examples": 4126, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/63/ml_in_female.zip": {"num_bytes": 710218411, "checksum": "e82d70717d20304f20f635d248c8cb1fd0c9c888e35b9105c8306fc76498a67e"}, "https://openslr.org/resources/63/ml_in_male.zip": {"num_bytes": 635657888, "checksum": "d1a6de4f58f53b973596ff1c69a64afea70f899b044397ce37465c626eee2ab9"}}, "download_size": 1345876299, "post_processing_size": null, "dataset_size": 2016587, "size_in_bytes": 1347892886}, "SLR64": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR64", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 810375, "num_examples": 1569, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/64/mr_in_female.zip": {"num_bytes": 712155683, "checksum": "42b770ee87c95379b55e187b17dccb9fbacb05d0e8292430ffe16a7483948fe5"}}, "download_size": 712155683, "post_processing_size": null, "dataset_size": 810375, "size_in_bytes": 712966058}, "SLR65": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR65", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2136447, "num_examples": 4284, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/65/ta_in_female.zip": {"num_bytes": 769504014, "checksum": "fe00da10ae12ecd6dbe1afcc5abe365d44ad9036fb017cbd73bcfed71e0f8c81"}, "https://openslr.org/resources/65/ta_in_male.zip": {"num_bytes": 603800641, "checksum": "80e546e954939c92a0cd732446418b583b61da9f538f83b00cbd445cbebd4395"}}, "download_size": 1373304655, "post_processing_size": null, "dataset_size": 2136447, "size_in_bytes": 1375441102}, "SLR66": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR66", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1898335, "num_examples": 4448, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/66/te_in_female.zip": {"num_bytes": 505680804, "checksum": "3aa3c22d6fad33ed68951f4934ae47349ee76b77220d8261ec3bda8c24bf42b2"}, "https://openslr.org/resources/66/te_in_male.zip": {"num_bytes": 529447066, "checksum": "f8a0f239d39088b6702a2186681e2874328e9fcd9bfa6a0dd9e1dc5695be3185"}}, "download_size": 1035127870, "post_processing_size": null, "dataset_size": 1898335, "size_in_bytes": 1037026205}, "SLR69": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR69", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1647263, "num_examples": 4240, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/69/ca_es_female.zip": {"num_bytes": 1043934596, "checksum": "2ec39de70550a1cdb93aee960967125fb652b8d26b8de4f6e8658c62847c3f11"}, "https://openslr.org/resources/69/ca_es_male.zip": {"num_bytes": 804724947, "checksum": "8b412ffaa65cd85692c6eab038fc085a8ae5613c6eed38c097a65946c2ee9146"}}, "download_size": 1848659543, "post_processing_size": null, "dataset_size": 1647263, "size_in_bytes": 1850306806}, "SLR35": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR35", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 73565374, "num_examples": 185076, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/35/asr_javanese_0.zip": {"num_bytes": 1197540348, "checksum": "a871c8b71ff8fa9d95955447ca0c388e8c6f925aecfce92e1880bda2da113bcb"}, "https://openslr.org/resources/35/asr_javanese_1.zip": {"num_bytes": 1172552957, "checksum": "8024b18acc265bd502f2c36930ab41bd9a8a9cbc67d3db340698df1f6799eeef"}, "https://openslr.org/resources/35/asr_javanese_2.zip": {"num_bytes": 1187099390, "checksum": "c1605da9f74b0951533bcd9bb66a868dc4552929a6e3597d1f6b66c8436cd87e"}, "https://openslr.org/resources/35/asr_javanese_3.zip": {"num_bytes": 1178721705, "checksum": "f813cfa6ea5db1a2c7af65d62dd4d2edc932e67990570f0e5418675c0c9443d3"}, "https://openslr.org/resources/35/asr_javanese_4.zip": {"num_bytes": 1174850803, "checksum": "506af733d9c1f02372e83e997c924fac5a8141a7920d1ab345bd607e26438f0c"}, "https://openslr.org/resources/35/asr_javanese_5.zip": {"num_bytes": 1178642105, "checksum": "5300df2d2fd95033632fe7d3d77042804c92bf4f9983f11e707c20e358e45a91"}, "https://openslr.org/resources/35/asr_javanese_6.zip": {"num_bytes": 1197026293, "checksum": "a487e12f9d3fd1d3e6d8a8c2b58363813d6121e6a84937ec0d27601fea2654db"}, "https://openslr.org/resources/35/asr_javanese_7.zip": {"num_bytes": 1197789186, "checksum": "944ce7e3463f2e0d6024f8a1768e161a64dd4ab7cf8a96b7924fb8666ae2142e"}, "https://openslr.org/resources/35/asr_javanese_8.zip": {"num_bytes": 1185807385, "checksum": "cb598b81bd681dc51965c912bf4aabc4af6eb9b57d5a7cb0998ed121cec63dcd"}, "https://openslr.org/resources/35/asr_javanese_9.zip": {"num_bytes": 1160028499, "checksum": "7ee9de72360a59dc2a3cd3570627565a638d7a47f0f95ce4c14545bc9b6690b2"}, "https://openslr.org/resources/35/asr_javanese_a.zip": {"num_bytes": 1176016135, "checksum": "1fd1e4b06ed5d18614ef7ce414e7e0b6c105d6f5d87b3a6210fcedc4cc6f35cd"}, "https://openslr.org/resources/35/asr_javanese_b.zip": {"num_bytes": 1176960512, "checksum": "036bb70c60e8ba4b9be090dcd717e1da8744dd1cfdfab1eb4a4cd29d7755b938"}, "https://openslr.org/resources/35/asr_javanese_c.zip": {"num_bytes": 1178017086, "checksum": "a46d7b1ad184a4c2ac9099c8399f18fb8b14dd9ab4172a61f8abe3e464f7b2b9"}, "https://openslr.org/resources/35/asr_javanese_d.zip": {"num_bytes": 1199910382, "checksum": "9f3058916fe721f92a4d1a6c2794d82920b7c88ed780ef06fe69f8e448d0ddb6"}, "https://openslr.org/resources/35/asr_javanese_e.zip": {"num_bytes": 1175431904, "checksum": "d9234d3331fb11c082bc17f3b54c13dfa183c4cb13e35c030f7a1dbbe4c819cd"}, "https://openslr.org/resources/35/asr_javanese_f.zip": {"num_bytes": 1163711036, "checksum": "1bedbc295e4d1592e5730da8f0774fe360fe146d193b9c9815a8025072dd0b70"}}, "download_size": 18900105726, "post_processing_size": null, "dataset_size": 73565374, "size_in_bytes": 18973671100}, "SLR36": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR36", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 88942337, "num_examples": 219156, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/36/asr_sundanese_0.zip": {"num_bytes": 1433294860, "checksum": "947a0ac86008b88130f7c8f1b27d4a0f93886f653cf65b5948c0532cd0097c0d"}, "https://openslr.org/resources/36/asr_sundanese_1.zip": {"num_bytes": 1445470477, "checksum": "365f052dd9d977343002289ea1f29dea466f1243e5edf22dfb933e3fa93a6d87"}, "https://openslr.org/resources/36/asr_sundanese_2.zip": {"num_bytes": 1431289018, "checksum": "f9b9ee2a925d4fd934be3ebe09545ffb3f294f1e6d1380e837054fdf4ce8cff2"}, "https://openslr.org/resources/36/asr_sundanese_3.zip": {"num_bytes": 1446805642, "checksum": "ba3cc0e8e351a5456269c72edf7a3b50cf820941f93d7eed0e8f02a3b1b0a89f"}, "https://openslr.org/resources/36/asr_sundanese_4.zip": {"num_bytes": 1449187658, "checksum": "a6ca66e2537bd55dfaea4e716d847c70aead58c217184ab37afbd4065cca9262"}, "https://openslr.org/resources/36/asr_sundanese_5.zip": {"num_bytes": 1425741894, "checksum": "31bb8a9981b45855ab0b7c634c89040fe99b122455750a6ab956393dc9dec0d8"}, "https://openslr.org/resources/36/asr_sundanese_6.zip": {"num_bytes": 1415730042, "checksum": "3f23d6c4c67dc6f39a8ebb2af43e2efedb57028abb85eb519394f2d9ef8b3a21"}, "https://openslr.org/resources/36/asr_sundanese_7.zip": {"num_bytes": 1436967650, "checksum": "bce8f33b6ed62978915dfc601957162e9eece8bc3190cd2d548d7679409a3d77"}, "https://openslr.org/resources/36/asr_sundanese_8.zip": {"num_bytes": 1436421462, "checksum": "755e0af77d0bd6d4aa7895b2ab9fbf792c57efc49c8cec21d3d728fe3374b621"}, "https://openslr.org/resources/36/asr_sundanese_9.zip": {"num_bytes": 1434660332, "checksum": "5d426d2c99eb91ffd3db193d510e288133c426556430fe2e70e08f58815f5a31"}, "https://openslr.org/resources/36/asr_sundanese_a.zip": {"num_bytes": 1436753516, "checksum": "e032537b62aa8a8abe660bca418ac2e26a93bdc7a357b948a301bde286952fa5"}, "https://openslr.org/resources/36/asr_sundanese_b.zip": {"num_bytes": 1435014221, "checksum": "e999e83fde37ec973b1a1822aaa8769488c2a95058a3448661ac94c319881549"}, "https://openslr.org/resources/36/asr_sundanese_c.zip": {"num_bytes": 1429102490, "checksum": "275ac684fe7b8bf012dc251ddb91496e2d95c2c257ec87ab0847efa379e96787"}, "https://openslr.org/resources/36/asr_sundanese_d.zip": {"num_bytes": 1432973082, "checksum": "34ae64f8a29ddef2e05ca5ce8122b461a737d58d796dbe577a4e8a4a05c6b2ce"}, "https://openslr.org/resources/36/asr_sundanese_e.zip": {"num_bytes": 1443609656, "checksum": "25e36087063e0cc5e54cf04e5a4e065b19e0c1bc9cbc07a9f98635941b53bfea"}, "https://openslr.org/resources/36/asr_sundanese_f.zip": {"num_bytes": 1463531929, "checksum": "3d1410c31cc70994f82b9555967fa4c8d682aee288cc85b05b9c4e6352a49f14"}}, "download_size": 22996553929, "post_processing_size": null, "dataset_size": 88942337, "size_in_bytes": 23085496266}, "SLR70": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR70", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1339608, "num_examples": 3359, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/70/en_ng_female.zip": {"num_bytes": 759856787, "checksum": "e840afea824c9075db8c7d574e993837c6a4861fd0ff0275c4cc223aa00a785c"}, "https://openslr.org/resources/70/en_ng_male.zip": {"num_bytes": 454098409, "checksum": "f619d09d5ffdf0d4044ef1d57585eeaa50c0cbf08844782a9dd08f56ea9e567f"}}, "download_size": 1213955196, "post_processing_size": null, "dataset_size": 1339608, "size_in_bytes": 1215294804}, "SLR71": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR71", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1676273, "num_examples": 4374, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/71/es_cl_female.zip": {"num_bytes": 585615697, "checksum": "23593f3dac085d26f99df38159c1ab0ae2c23f5c97ad869292496abc6e171bc6"}, "https://openslr.org/resources/71/es_cl_male.zip": {"num_bytes": 859750206, "checksum": "ace2cbd6df28e94fdd636ba1263b72b557722b0d2abcf4c6e072011ac870cbee"}}, "download_size": 1445365903, "post_processing_size": null, "dataset_size": 1676273, "size_in_bytes": 1447042176}, "SLR72": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR72", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1876301, "num_examples": 4903, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/72/es_co_female.zip": {"num_bytes": 801960444, "checksum": "03721aa7b6b7fe1dd309a0c545cbef4898fac99ed811f4e1769b2fc16bb7eb70"}, "https://openslr.org/resources/72/es_co_male.zip": {"num_bytes": 810070088, "checksum": "2e72abf283adf3f52c28d9f4d59709d4a24fa57243dc696a99dfbc1b8e534c9a"}}, "download_size": 1612030532, "post_processing_size": null, "dataset_size": 1876301, "size_in_bytes": 1613906833}, "SLR73": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR73", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2084052, "num_examples": 5447, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/73/es_pe_female.zip": {"num_bytes": 913983951, "checksum": "0bcb138a6a4657fa52ec6ec129807dc2476d9a89184ea2ab4f588bbbddc12062"}, "https://openslr.org/resources/73/es_pe_male.zip": {"num_bytes": 1026322863, "checksum": "8baf41802bc59f7d170ee091d8676db725903efdcfeda12d699a31a746ae50bf"}}, "download_size": 1940306814, "post_processing_size": null, "dataset_size": 2084052, "size_in_bytes": 1942390866}, "SLR74": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR74", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 237395, "num_examples": 617, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/74/es_pr_female.zip": {"num_bytes": 214181314, "checksum": "0ff2f4ed63fbbc4305140bb88c71ca9a72b18c6686a755534b47ae28dce2861d"}}, "download_size": 214181314, "post_processing_size": null, "dataset_size": 237395, "size_in_bytes": 214418709}, "SLR75": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR75", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1286937, "num_examples": 3357, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/75/es_ve_female.zip": {"num_bytes": 517000277, "checksum": "4600baead7519afaa5f6b33cf3f4b2373e7f1902aa72841fc38582660b07fe31"}, "https://openslr.org/resources/75/es_ve_male.zip": {"num_bytes": 526316727, "checksum": "3cf8703b1b61de1bf964e26f0a2c7f0ec637b1a85eafd982e98de9301558b289"}}, "download_size": 1043317004, "post_processing_size": null, "dataset_size": 1286937, "size_in_bytes": 1044603941}, "SLR76": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR76", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2756507, "num_examples": 7136, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/76/eu_es_female.zip": {"num_bytes": 1622676657, "checksum": "b3eaa91f2be198c8455f46e802f671e33cba5d95909e58e0b59cb6638f5b4947"}, "https://openslr.org/resources/76/eu_es_male.zip": {"num_bytes": 1418448856, "checksum": "787bcb8369d3797a6b34b0e2d420f5255e12e6c6a385cd4e72ddde59c6018227"}}, "download_size": 3041125513, "post_processing_size": null, "dataset_size": 2756507, "size_in_bytes": 3043882020}, "SLR77": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR77", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2217652, "num_examples": 5587, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/77/gl_es_female.zip": {"num_bytes": 1656677564, "checksum": "e2cda7ef8d5f57b5f3086473d5297e6bb73757f0c446409245f407d7612c5060"}, "https://openslr.org/resources/77/gl_es_male.zip": {"num_bytes": 551314211, "checksum": "b768ed0b77fb4e88adf795dedcc872c53a4348ee8d11eb8efb4571fff94688be"}}, "download_size": 2207991775, "post_processing_size": null, "dataset_size": 2217652, "size_in_bytes": 2210209427}, "SLR78": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR78", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2121986, "num_examples": 4272, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/78/gu_in_female.zip": {"num_bytes": 917450036, "checksum": "bbda0815e0d2e01ad9310768e0e2be9efb612a9c56c66c4ab2f32b817da5c786"}, "https://openslr.org/resources/78/gu_in_male.zip": {"num_bytes": 825772066, "checksum": "ce474d1686104b3bd274a2d5192459cb4dee6e0c9bbcf3de1bb3b39c6ab89caf"}}, "download_size": 1743222102, "post_processing_size": null, "dataset_size": 2121986, "size_in_bytes": 1745344088}, "SLR79": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR79", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2176539, "num_examples": 4400, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/79/kn_in_female.zip": {"num_bytes": 980825420, "checksum": "182a147e5747ad4f4ac50a5e7e1ee3683e1c2c1d9105963365d151d664466b62"}, "https://openslr.org/resources/79/kn_in_male.zip": {"num_bytes": 840093695, "checksum": "38e3c0c51f792a3655cc8f4747b339df8ec4b1031a0fff590c1a1af6a8bbbcdf"}}, "download_size": 1820919115, "post_processing_size": null, "dataset_size": 2176539, "size_in_bytes": 1823095654}, "SLR80": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR80", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1308651, "num_examples": 2530, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/80/my_mm_female.zip": {"num_bytes": 948181015, "checksum": "a7cdcaa5e06864e02fa18fc0fe9595feadf332d6a63aadc01ce51a24969a2708"}}, "download_size": 948181015, "post_processing_size": null, "dataset_size": 1308651, "size_in_bytes": 949489666}, "SLR86": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR86", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 1378801, "num_examples": 3583, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/86/yo_ng_female.zip": {"num_bytes": 462033045, "checksum": "8875ebc839e57a3318ba1ce37d98c35da46d4f99f9f777f83fcf074257804060"}, "https://openslr.org/resources/86/yo_ng_male.zip": {"num_bytes": 445032517, "checksum": "58519b27f6954c446d0e7221b227a6f342b9c5ea66bf02af40c1616e086afc4c"}}, "download_size": 907065562, "post_processing_size": null, "dataset_size": 1378801, "size_in_bytes": 908444363}, "SLR32": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR32", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4544052380, "num_examples": 9821, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/32/af_za.tar.gz": {"num_bytes": 950827926, "checksum": "b702a68486bf16cbf302d6e0808ea2e966f3dfa720ea0d6ce36d881aa266978f"}, "https://openslr.org/resources/32/st_za.tar.gz": {"num_bytes": 724425648, "checksum": "509202bcf6fae3b24508cfdbc3a6c886b29b4c3d822adbf6c40b21d98ada3fcf"}, "https://openslr.org/resources/32/tn_za.tar.gz": {"num_bytes": 729406193, "checksum": "3e6a522d2fafa071ec1d484cb79336ff36008a5d5d34e1444984e5df8312eb6f"}, "https://openslr.org/resources/32/xh_za.tar.gz": {"num_bytes": 907498093, "checksum": "712336c82637cbfb4304766dd7c0889bac1664945aed08bafb49eac29ae756c3"}, "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/af_za/line_index.tsv": {"num_bytes": 218947, "checksum": "c4d096cb50a037ce8c3a41a198615083d93c3bbbd6f1cfdb52c3ebfa5de09340"}, "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/st_za/line_index.tsv": {"num_bytes": 154784, "checksum": "04cd7e8db7eae8ad9044fa8ac79f3e48fd3a64d045cd907ff005fd82f1ca6a82"}, "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/tn_za/line_index.tsv": {"num_bytes": 174447, "checksum": "c621270b3ee70d515bbce846e1b64135dc4554f62cf3528d9550a1512f5841f1"}, "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/xh_za/line_index.tsv": {"num_bytes": 178725, "checksum": "6a356aac4e698561302574f62be30029536ac057e009633f0af8de68513d874a"}}, "download_size": 3312884763, "post_processing_size": null, "dataset_size": 4544052380, "size_in_bytes": 7856937143}, "SLR52": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR52", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 77369899, "num_examples": 185293, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/52/asr_sinhala_0.zip": {"num_bytes": 915237858, "checksum": "41bcd4cf6edde39e49bf8ca6b54c32e1403609759ff9edea2a2696ef7aa8fff5"}, "https://openslr.org/resources/52/asr_sinhala_1.zip": {"num_bytes": 908852134, "checksum": "7a4dd3279254f06ba8d1e864d2aa68eec1e6740cfc2b718d2bc060b878871e74"}, "https://openslr.org/resources/52/asr_sinhala_2.zip": {"num_bytes": 913568157, "checksum": "746b5ee016e09868016851ff2148000570b6cb6b9acde5d16527f20053d1cd14"}, "https://openslr.org/resources/52/asr_sinhala_3.zip": {"num_bytes": 901325452, "checksum": "a167e6bd9c0b64e105cc57528a455a4653303336b85731273039487d9f94afda"}, "https://openslr.org/resources/52/asr_sinhala_4.zip": {"num_bytes": 922493671, "checksum": "f17fc798ea085e876500095e8dd357d1088303598d190642978c353d51d2b94b"}, "https://openslr.org/resources/52/asr_sinhala_5.zip": {"num_bytes": 922505332, "checksum": "8285340d15064caa1da0635d50471c8de24d33e3d1ae7af3c63e4a23d3ba25fe"}, "https://openslr.org/resources/52/asr_sinhala_6.zip": {"num_bytes": 914729823, "checksum": "a511dc329dfc493c9e25d1315ab95da93a8a4b751e032c1848eeeb8655608403"}, "https://openslr.org/resources/52/asr_sinhala_7.zip": {"num_bytes": 911992962, "checksum": "8180736327c3147bac912c329fe3a571a61ecb6d4da7d4584acb0d34ab204fa5"}, "https://openslr.org/resources/52/asr_sinhala_8.zip": {"num_bytes": 924344925, "checksum": "fdf333751c254f8dc7b649fd1a48cf47ae8e855e369a182d88bee3325ae8a99d"}, "https://openslr.org/resources/52/asr_sinhala_9.zip": {"num_bytes": 920427318, "checksum": "288f4a7ea055b3963ad7d6a6e6e6189672715a42d0a1b6e99a1a8ba0fe67a9c6"}, "https://openslr.org/resources/52/asr_sinhala_a.zip": {"num_bytes": 901532849, "checksum": "da36de6739ce5b8c835c3c232d5122b883a88442ec3f91a534154b2a9177d0ec"}, "https://openslr.org/resources/52/asr_sinhala_b.zip": {"num_bytes": 924132571, "checksum": "4b5dd26de34b27e9cc88842e992626694fd329f23493f40c748d556c61395d2a"}, "https://openslr.org/resources/52/asr_sinhala_c.zip": {"num_bytes": 938991415, "checksum": "f6db1cece623fafe866a56b9f7100976823b32f968036b72a9a634138e87e92d"}, "https://openslr.org/resources/52/asr_sinhala_d.zip": {"num_bytes": 911368918, "checksum": "8ecc58c745998b05b21c8af05fdc741d437a654a8babba16c4970ad981074e2c"}, "https://openslr.org/resources/52/asr_sinhala_e.zip": {"num_bytes": 927771260, "checksum": "f5cbfd3c8d1c5bf6fe7a1c1ee606101368512a852856fb2d01f4dde7869f605a"}, "https://openslr.org/resources/52/asr_sinhala_f.zip": {"num_bytes": 917209429, "checksum": "65782dee2ba4256bab123835ef2277a3fd1116f20f403a2c4ff5ace3ac45714c"}}, "download_size": 14676484074, "post_processing_size": null, "dataset_size": 77369899, "size_in_bytes": 14753853973}, "SLR53": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR53", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 88073248, "num_examples": 218703, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/53/asr_bengali_0.zip": {"num_bytes": 919838172, "checksum": "c1bbeadbcffae8a40d8e54f25c6c3dea922951a322cc7875a18f52dec127741a"}, "https://openslr.org/resources/53/asr_bengali_1.zip": {"num_bytes": 906161405, "checksum": "b6af5d30439d25a5df20efd85bfa2e900ee962e3afb91fe88a65cdbb0689cf84"}, "https://openslr.org/resources/53/asr_bengali_2.zip": {"num_bytes": 921562897, "checksum": "ac0b50d5ad38d5295c16b7eb62901b273bd6df55dea7b1a8495e69c1a50c0986"}, "https://openslr.org/resources/53/asr_bengali_3.zip": {"num_bytes": 918817316, "checksum": "444760953dc4e006cd6e38ea647b611c7be93a07a78b1a6b83974fe3ebba6b65"}, "https://openslr.org/resources/53/asr_bengali_4.zip": {"num_bytes": 908199672, "checksum": "975a1b690ccfe0609ba50738666758ad92c3683416d1cf7771972496adb4313f"}, "https://openslr.org/resources/53/asr_bengali_5.zip": {"num_bytes": 932042725, "checksum": "21dec790c4f96771a28347ed4430c74d3f3bff046684f4522c2301f7029f632d"}, "https://openslr.org/resources/53/asr_bengali_6.zip": {"num_bytes": 900826997, "checksum": "b0f93fb831bb36c75a6f4c0731bfb991f8b6529bc3b16ee0bede3e7108a7679e"}, "https://openslr.org/resources/53/asr_bengali_7.zip": {"num_bytes": 927750265, "checksum": "647cbcfb9c92930f4625dbc107f4218cdd37f8e3494df23d42917640da22938c"}, "https://openslr.org/resources/53/asr_bengali_8.zip": {"num_bytes": 927268934, "checksum": "73168b982a0665fb4f1104eaafeb3ddc01780b39978649e01ce6ab7850a86de1"}, "https://openslr.org/resources/53/asr_bengali_9.zip": {"num_bytes": 906382286, "checksum": "25f678604ffe93fc986cc402dc4a4329f36eb44ab627c645c4957dbf8e85917c"}, "https://openslr.org/resources/53/asr_bengali_a.zip": {"num_bytes": 900283300, "checksum": "daf0fc69dbd041fd254e96df1732359666ace7c9aea9d5c64c03ab8add3a00c4"}, "https://openslr.org/resources/53/asr_bengali_b.zip": {"num_bytes": 910050386, "checksum": "2d6fc0f464130bc3761546ac0e8b085921d5f1c9afbf886b9c1fa95f9755fd26"}, "https://openslr.org/resources/53/asr_bengali_c.zip": {"num_bytes": 897120616, "checksum": "116e8e63882f548410a3b835d2d3b6a11e6a05969374d173b9c01a8ba7112abd"}, "https://openslr.org/resources/53/asr_bengali_d.zip": {"num_bytes": 914366610, "checksum": "aa155d8e0688d032229ad7a5e4c713e696d1ea531feae83ae3230e526f1db7a6"}, "https://openslr.org/resources/53/asr_bengali_e.zip": {"num_bytes": 922936447, "checksum": "2f6f97591adde2b469f29b601ba33bfc3e8049681594fe31be8a55204c70ae15"}, "https://openslr.org/resources/53/asr_bengali_f.zip": {"num_bytes": 917202893, "checksum": "42542ec7d434bd6a34b30c01fa24de206fb2d2e56afea745a14867a8c0eaa32c"}}, "download_size": 14630810921, "post_processing_size": null, "dataset_size": 88073248, "size_in_bytes": 14718884169}, "SLR54": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR54", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62735822, "num_examples": 157905, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/54/asr_nepali_0.zip": {"num_bytes": 589002210, "checksum": "6c783a5a731c7a9c2cac678823a2ee7866db1acbad7f9a199bce3bf7a64e22b6"}, "https://openslr.org/resources/54/asr_nepali_1.zip": {"num_bytes": 582088242, "checksum": "661865704f3d9adacd74f8c98cd0f6a6e869902c6441efb96c761573dd1d2f05"}, "https://openslr.org/resources/54/asr_nepali_2.zip": {"num_bytes": 589401540, "checksum": "a2b4c373d7ebe5f2d491bf73c2324e6f5645d724df58fd765c71a3a86e7ab6d4"}, "https://openslr.org/resources/54/asr_nepali_3.zip": {"num_bytes": 574596426, "checksum": "6a925d4448f98694185d50cacfa380fad128b47ebf9d5519526b83dd6586348d"}, "https://openslr.org/resources/54/asr_nepali_4.zip": {"num_bytes": 583746586, "checksum": "7315f69b392690c22db32b3c2f14b82b1f64215c5a21d697c421d6d220a55bf0"}, "https://openslr.org/resources/54/asr_nepali_5.zip": {"num_bytes": 572967016, "checksum": "3891b332a9fc55e4fb0579bf67431989e92ab05b9715c0e9673cf356e878e0df"}, "https://openslr.org/resources/54/asr_nepali_6.zip": {"num_bytes": 588104006, "checksum": "78c321a8f55a5aa0c56feb791826a2751087cc87a36b27bba56ac6b124eac73f"}, "https://openslr.org/resources/54/asr_nepali_7.zip": {"num_bytes": 588410232, "checksum": "8b05b8b4aedfc9829cf33cd65ab3c1474eb8f738078b414d40b61f08782064ec"}, "https://openslr.org/resources/54/asr_nepali_8.zip": {"num_bytes": 585192213, "checksum": "0125cfc7c54e44bd4ac01d5558130a752cad26aa7055df753c65b400ece2c9f8"}, "https://openslr.org/resources/54/asr_nepali_9.zip": {"num_bytes": 578834881, "checksum": "6c68e80fe7c58a33aeb91b5b9bc37a99f9374a8f629e2a109bddba51d1712b12"}, "https://openslr.org/resources/54/asr_nepali_a.zip": {"num_bytes": 587798317, "checksum": "03b7bf7b6ace01a677e2a0dd079053ea29abf45743f197761190f3f52678e6df"}, "https://openslr.org/resources/54/asr_nepali_b.zip": {"num_bytes": 584397714, "checksum": "9a98d93ae91e75c6928d9222b387105e99030b8b81df9ada57c87f6b317c0853"}, "https://openslr.org/resources/54/asr_nepali_c.zip": {"num_bytes": 579440365, "checksum": "8bac1a046a86fc3684bfec2e5af1b1e0916ec5c2f1be5ccb1fb4778ecd7bb357"}, "https://openslr.org/resources/54/asr_nepali_d.zip": {"num_bytes": 588470094, "checksum": "9aad327fd72efcc009d060a8299aa70ca1757f1ec32fe3280d53e449ef75e5c3"}, "https://openslr.org/resources/54/asr_nepali_e.zip": {"num_bytes": 578091869, "checksum": "4ba73ada7cf482611b3ad3e17a77685b1ac872e5840953c07a1c6c2b10a83e4a"}, "https://openslr.org/resources/54/asr_nepali_f.zip": {"num_bytes": 577705651, "checksum": "062f4908802ab0d57362da1dfea4898898f6d21ba09596c1e271c2cda47297c6"}}, "download_size": 9328247362, "post_processing_size": null, "dataset_size": 62735822, "size_in_bytes": 9390983184}, "SLR83": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n title = {{Rapid development of TTS corpora for four South African languages}},\n author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n and Martin Jansche and Linne Ha},\n booktitle = {Proc. Interspeech 2017},\n pages = {2178--2182},\n address = {Stockholm, Sweden},\n month = aug,\n year = {2017},\n URL = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese, Sinhala, Nepali, and Bangladeshi Bengali}},\n author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {52--55},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n Khmer, Nepali, Sinhala, and Sundanese}},\n author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n De Silva and Supheakmungkol Sarin},\n booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n year = {2018},\n address = {Gurugram, India},\n month = aug,\n pages = {66--70},\n URL = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n Telugu Speech Synthesis Systems}},\n author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n pages = {6494--6503},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n year = {2020},\n pages = {21--27},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n year = {2020},\n month = may,\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n pages = {6504--6513},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = {6532--6541},\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n to Text-to-Speech}},\n author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n month = may,\n year = {2020},\n pages = \"6328--6339\",\n address = {Marseille, France},\n publisher = {European Language Resources Association (ELRA)},\n url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n booktitle = {Proceedings of Interspeech 2020},\n pages = {404--408},\n month = {October},\n year = {2020},\n address = {Shanghai, China},\n publisher = {International Speech and Communication Association (ISCA)},\n doi = {10.21437/Interspeech.2020-1096},\n url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR83", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7098985, "num_examples": 17877, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/83/irish_english_male.zip": {"num_bytes": 164531638, "checksum": "2e5dbae4cc27e0e24e21f21c8e7464d219feb703f5fee3e567de6561a05024ed"}, "https://openslr.org/resources/83/midlands_english_female.zip": {"num_bytes": 103085118, "checksum": "aa1083a319e52d658b85c162905ec27cdf2ac6d5645b4caeab05a385a2c8a37f"}, "https://openslr.org/resources/83/midlands_english_male.zip": {"num_bytes": 166833961, "checksum": "8192c7a0626eb742f9999e63162289f8f9a86c9cb49ef68298dc7f624acaebcf"}, "https://openslr.org/resources/83/northern_english_female.zip": {"num_bytes": 314983063, "checksum": "22b6229d08481e7605b028185dc55dccd0611db428854f2d485d9ff34395a65c"}, "https://openslr.org/resources/83/northern_english_male.zip": {"num_bytes": 817772034, "checksum": "b627d500d1b2e3c4921fb6d91338ead7b972f67c1c2f0babb300e0ef844c7248"}, "https://openslr.org/resources/83/scottish_english_female.zip": {"num_bytes": 351443880, "checksum": "2dbe5545a7ab87112c7730086586f738ec4f42171f7738628ba084ed4ba15ccb"}, "https://openslr.org/resources/83/scottish_english_male.zip": {"num_bytes": 620254118, "checksum": "c7d2d9cd581c48a8323f6cc3886d879e2e7aca5931d98228e07d07b350d9f9a9"}, "https://openslr.org/resources/83/southern_english_female.zip": {"num_bytes": 1636701939, "checksum": "e0a2e8e64b9efdbd7bae5cdf33ac8b81db495b499c9d40da0a7d7842e42b1e76"}, "https://openslr.org/resources/83/southern_english_male.zip": {"num_bytes": 1700955740, "checksum": "788b1c59fb5713b0e1efebc02b7aa1b55182b21955493b299b9941c70a878cad"}, "https://openslr.org/resources/83/welsh_english_female.zip": {"num_bytes": 595683538, "checksum": "3c2465b9618e33f42c7d2ee753b54ae593714e758e236efcdd56c14c5bd89f1d"}, "https://openslr.org/resources/83/welsh_english_male.zip": {"num_bytes": 757645790, "checksum": "eaf8de0f8872bb647d5c159bb33713cfd58966bd59d733f5f399793778ea5058"}}, "download_size": 7229890819, "post_processing_size": null, "dataset_size": 7098985, "size_in_bytes": 7236989804}} \ No newline at end of file diff --git a/datasets/vivos/dataset_infos.json b/datasets/vivos/dataset_infos.json index 0b87dd10d94..df8ed1e44dd 100644 --- a/datasets/vivos/dataset_infos.json +++ b/datasets/vivos/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "VIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for\nVietnamese Automatic Speech Recognition task.\nThe corpus was prepared by AILAB, a computer science lab of VNUHCM - University of Science, with Prof. Vu Hai Quan is the head of.\nWe publish this corpus in hope to attract more scientists to solve Vietnamese speech recognition problems.\n", "citation": "@InProceedings{vivos:2016,\nAddress = {Ho Chi Minh, Vietnam}\ntitle = {VIVOS: 15 hours of recording speech prepared for Vietnamese Automatic Speech Recognition},\nauthor={Prof. Vu Hai Quan},\nyear={2016}\n}\n", "homepage": "https://ailab.hcmus.edu.vn/vivos", "license": "cc-by-sa-4.0", "features": {"speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "vivos_dataset", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3186233, "num_examples": 11660, "dataset_name": "vivos_dataset"}, "test": {"name": "test", "num_bytes": 193258, "num_examples": 760, "dataset_name": "vivos_dataset"}}, "download_checksums": {"https://ailab.hcmus.edu.vn/assets/vivos.tar.gz": {"num_bytes": 1474408300, "checksum": "147477f7a7702cbafc2ee3808d1c142989d0dbc8d9fce8e07d5f329d5119e4ca"}}, "download_size": 1474408300, "post_processing_size": null, "dataset_size": 3379491, "size_in_bytes": 1477787791}} \ No newline at end of file +{"default": {"description": "VIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for\nVietnamese Automatic Speech Recognition task.\nThe corpus was prepared by AILAB, a computer science lab of VNUHCM - University of Science, with Prof. Vu Hai Quan is the head of.\nWe publish this corpus in hope to attract more scientists to solve Vietnamese speech recognition problems.\n", "citation": "@InProceedings{vivos:2016,\nAddress = {Ho Chi Minh, Vietnam}\ntitle = {VIVOS: 15 hours of recording speech prepared for Vietnamese Automatic Speech Recognition},\nauthor={Prof. Vu Hai Quan},\nyear={2016}\n}\n", "homepage": "https://ailab.hcmus.edu.vn/vivos", "license": "cc-by-sa-4.0", "features": {"speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "vivos_dataset", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1722000675, "num_examples": 11660, "dataset_name": "vivos_dataset"}, "test": {"name": "test", "num_bytes": 86120132, "num_examples": 760, "dataset_name": "vivos_dataset"}}, "download_checksums": {"https://s3.amazonaws.com/datasets.huggingface.co/vivos/train/prompts.txt": {"num_bytes": 1075754, "checksum": "d6c6fcbe258d80d0f63e0f87d414b805f6ae11f41d40cdba5454152c3d6f14c0"}, "https://s3.amazonaws.com/datasets.huggingface.co/vivos/test/prompts.txt": {"num_bytes": 56446, "checksum": "ed27898d081eaa41b1e7e38451eb85f7ca06138896b471691510e7bab1187c2e"}, "https://ailab.hcmus.edu.vn/assets/vivos.tar.gz": {"num_bytes": 1474408300, "checksum": "147477f7a7702cbafc2ee3808d1c142989d0dbc8d9fce8e07d5f329d5119e4ca"}}, "download_size": 1475540500, "post_processing_size": null, "dataset_size": 1808120807, "size_in_bytes": 3283661307}} \ No newline at end of file From 63d0d47bc627abbd20f8c59f117f32682c23ca7d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 19 Nov 2021 15:44:19 +0100 Subject: [PATCH 42/42] fix dummy data --- .../dummy/ab/6.1.0/dummy_data.zip | Bin 14443 -> 11523 bytes .../dummy/ar/6.1.0/dummy_data.zip | Bin 4298 -> 0 bytes .../dummy/as/6.1.0/dummy_data.zip | Bin 4097 -> 0 bytes .../dummy/br/6.1.0/dummy_data.zip | Bin 4057 -> 0 bytes .../dummy/ca/6.1.0/dummy_data.zip | Bin 4163 -> 0 bytes .../dummy/cnh/6.1.0/dummy_data.zip | Bin 3886 -> 0 bytes .../dummy/cs/6.1.0/dummy_data.zip | Bin 3928 -> 0 bytes .../dummy/cv/6.1.0/dummy_data.zip | Bin 4456 -> 0 bytes .../dummy/cy/6.1.0/dummy_data.zip | Bin 4182 -> 0 bytes .../dummy/de/6.1.0/dummy_data.zip | Bin 4117 -> 0 bytes .../dummy/dv/6.1.0/dummy_data.zip | Bin 4109 -> 0 bytes .../dummy/el/6.1.0/dummy_data.zip | Bin 4428 -> 0 bytes .../dummy/en/6.1.0/dummy_data.zip | Bin 4014 -> 0 bytes .../dummy/eo/6.1.0/dummy_data.zip | Bin 3952 -> 0 bytes .../dummy/es/6.1.0/dummy_data.zip | Bin 4007 -> 0 bytes .../dummy/et/6.1.0/dummy_data.zip | Bin 4524 -> 0 bytes .../dummy/eu/6.1.0/dummy_data.zip | Bin 4174 -> 0 bytes .../dummy/fa/6.1.0/dummy_data.zip | Bin 4186 -> 0 bytes .../dummy/fi/6.1.0/dummy_data.zip | Bin 3864 -> 0 bytes .../dummy/fr/6.1.0/dummy_data.zip | Bin 4085 -> 0 bytes .../dummy/fy-NL/6.1.0/dummy_data.zip | Bin 3982 -> 0 bytes .../dummy/ga-IE/6.1.0/dummy_data.zip | Bin 4096 -> 0 bytes .../dummy/hi/6.1.0/dummy_data.zip | Bin 4303 -> 0 bytes .../dummy/hsb/6.1.0/dummy_data.zip | Bin 4363 -> 0 bytes .../dummy/hu/6.1.0/dummy_data.zip | Bin 4258 -> 0 bytes .../dummy/ia/6.1.0/dummy_data.zip | Bin 3767 -> 0 bytes .../dummy/id/6.1.0/dummy_data.zip | Bin 4029 -> 0 bytes .../dummy/it/6.1.0/dummy_data.zip | Bin 3995 -> 0 bytes .../dummy/ja/6.1.0/dummy_data.zip | Bin 4274 -> 0 bytes .../dummy/ka/6.1.0/dummy_data.zip | Bin 4434 -> 0 bytes .../dummy/kab/6.1.0/dummy_data.zip | Bin 3902 -> 0 bytes .../dummy/ky/6.1.0/dummy_data.zip | Bin 4390 -> 0 bytes .../dummy/lg/6.1.0/dummy_data.zip | Bin 4074 -> 0 bytes .../dummy/lt/6.1.0/dummy_data.zip | Bin 3824 -> 0 bytes .../dummy/lv/6.1.0/dummy_data.zip | Bin 3862 -> 0 bytes .../dummy/mn/6.1.0/dummy_data.zip | Bin 4768 -> 0 bytes .../dummy/mt/6.1.0/dummy_data.zip | Bin 4094 -> 0 bytes .../dummy/nl/6.1.0/dummy_data.zip | Bin 4075 -> 0 bytes .../dummy/or/6.1.0/dummy_data.zip | Bin 4316 -> 0 bytes .../dummy/pa-IN/6.1.0/dummy_data.zip | Bin 4416 -> 0 bytes .../dummy/pl/6.1.0/dummy_data.zip | Bin 4103 -> 0 bytes .../dummy/pt/6.1.0/dummy_data.zip | Bin 4130 -> 0 bytes .../dummy/rm-sursilv/6.1.0/dummy_data.zip | Bin 4041 -> 0 bytes .../dummy/rm-vallader/6.1.0/dummy_data.zip | Bin 4384 -> 0 bytes .../dummy/ro/6.1.0/dummy_data.zip | Bin 3976 -> 0 bytes .../dummy/ru/6.1.0/dummy_data.zip | Bin 4811 -> 0 bytes .../dummy/rw/6.1.0/dummy_data.zip | Bin 4150 -> 0 bytes .../dummy/sah/6.1.0/dummy_data.zip | Bin 4569 -> 0 bytes .../dummy/sl/6.1.0/dummy_data.zip | Bin 3947 -> 0 bytes .../dummy/sv-SE/6.1.0/dummy_data.zip | Bin 3993 -> 0 bytes .../dummy/ta/6.1.0/dummy_data.zip | Bin 4385 -> 0 bytes .../dummy/th/6.1.0/dummy_data.zip | Bin 4467 -> 0 bytes .../dummy/tr/6.1.0/dummy_data.zip | Bin 4222 -> 0 bytes .../dummy/tt/6.1.0/dummy_data.zip | Bin 4270 -> 0 bytes .../dummy/uk/6.1.0/dummy_data.zip | Bin 4550 -> 0 bytes .../dummy/vi/6.1.0/dummy_data.zip | Bin 4094 -> 0 bytes .../dummy/vot/6.1.0/dummy_data.zip | Bin 3339 -> 0 bytes .../dummy/zh-CN/6.1.0/dummy_data.zip | Bin 4204 -> 0 bytes .../dummy/zh-HK/6.1.0/dummy_data.zip | Bin 4210 -> 0 bytes .../dummy/zh-TW/6.1.0/dummy_data.zip | Bin 4052 -> 0 bytes .../openslr/dummy/SLR32/0.0.0/dummy_data.zip | Bin 4125 -> 12652 bytes datasets/vivos/dummy/1.1.0/dummy_data.zip | Bin 1884 -> 14710 bytes 62 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 datasets/common_voice/dummy/ar/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/as/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/br/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ca/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/cnh/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/cs/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/cv/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/cy/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/de/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/dv/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/el/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/en/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/eo/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/es/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/et/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/eu/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/fa/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/fi/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/fr/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/fy-NL/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ga-IE/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/hi/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/hsb/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/hu/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ia/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/id/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/it/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ja/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ka/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/kab/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ky/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/lg/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/lt/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/lv/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/mn/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/mt/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/nl/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/or/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/pa-IN/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/pl/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/pt/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/rm-sursilv/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/rm-vallader/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ro/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ru/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/rw/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/sah/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/sl/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/sv-SE/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/ta/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/th/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/tr/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/tt/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/uk/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/vi/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/vot/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/zh-CN/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/zh-HK/6.1.0/dummy_data.zip delete mode 100644 datasets/common_voice/dummy/zh-TW/6.1.0/dummy_data.zip diff --git a/datasets/common_voice/dummy/ab/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ab/6.1.0/dummy_data.zip index 26e2c823b65c8ee650fd1b86a5c67363b0199f94..282b285dd46de9cecfd66000b7dba230dd1f591d 100644 GIT binary patch literal 11523 zcmcgy3pAA5`yQ8EB9|JBLm0QH41+KchSVUEOG#mbBDqV5Tp}tV3MENRh&YtXNhyt7 zP9nKf2ss^5a;scQ^8dEZmoZKKhU&l9+P-fsYwh>h&-?8C?ftxCV#vgdM1b}v(A#Y0 z<-b212r-1iepgq2TL(KIJ9!K9B?#8n#YuKUB}sPRx!;ci!N5GqfI!R)U@|ih8-i`l zK+mAT_@IGoZH@4{o6RicF}AZ~&KYt98kTBEdm^9x9^x+Iu+gTi!`xe=OhOprP)sX$ zgcwFvim{9gDex>3*3}8SuqiG^R{Kq8MmR6JKJ8puPLicbTtaqIO7b}-q{D7YOY(@r z`Gnfhj=Otq_jH;sm8!S#K_E;FS=k&_`5#+D2m#7j{OF)rBA8JK1adABnDu71WvTXRYatx2Wtobx@Z42byK5z$$jA8cDc>t^ zON|S68NH()_nLA=yA3Wq<>#Zc6;s!>s6K9OtY>G^$%8`U4-3yG?YDE4YiE9P;8kJa z^{VmU54VkC&eXlTSR8TuM0{pRN^H`ToPoFIcj7Z|h{n9~A8*Q%Id+ZrAHLMVaaE)q zPP=sUZi8S)0LL9xAGN5X&boVi`sMZsG@zG1!n*cFyOrH=VwV>spN~*Qf`1Av*}moT zl8rW=CDXRkfz#WNJ=_v@nsqBCZu4Ai7$~Yb-To(HwJ5Wtz%|zrz7>Z9z;L;&40^2s zX&@6#N!JBrfRrG_jSLGF$oA=4-bl_o(+ujYUNH61Ny2h@;SScgZG|)S!-ca(l+GVu4UVs zZD3?Pd9dW*RKR55^k3>#udf%3`K#Hg-?ds&p}?t2Jbm+w3iO8I`JlXDLA-TC-sUOvzdKgf>^=$QncIv_Pre(yV}}Yx zVR9UcE~JGK!`~Fyb%}gr2=6#o6&a!v%wOM!;`RtvFIvvbX}|I|0}x_228mkyjSwl5 z00f}+9U*=<-_!~Pc>wDC{R&m0&<{}T3Oe;;x9f{YFF`0tPzO&c<;;pyg(3)8!V2^U znXA(*C<@Ricv3|hNX8p5VUb6=gfe+N5-!*7OjjjDjxd*nJmAcVix2(h5Ld$vKjet! z;IlU6V3&ULdl7}r(}#zb=-b4%O8WEi=AJLT62-?W(Rx-hhG+tbRC1Afm8Ea3iZb@d z3yD;8f|0XS&87E|{ara+<4yiT0#o?sdHZ?$OW6yyKRNrP#A5XGC&kIOOOxpvrxnhI z^60RASf*UNGtE~pJWfe+fSYSAJgmdsfyXrQ`qC+XW^RxFn}qQ>a6EUkEnC&L{J0-aEZOFmUe?1nIn6lP(Kk0D zIp0MJ*0_6gnv`Ux)bJnLUzaFl5_!)}V6@csQGDg+yh)d-=%>-81l0!;iTWx-j~-sx zSJ$@mPqCvlCsstCZqXFqjErX7BOZpmZ)vhvC%%5K+l?zEL%am9<#5(6O{ftZSx zV%&O!mEAP`u?q1m+A>|Ag`tfJO^6p$W04Y50wx`{07i-aMCgW__w;8oZIDC-f zej&}Bpz-NeZ2O3F{J4Kj+J{x5#_5IC)>nGdN9>*nYrQ?t89Q#q5-reEDwg6EH5R^G zGygbW%JqrM#NvccI~CgBiX5Mq2zWgB4n6d_dWsQjv|4K}l#4}j^-MEsRb~@wCpck(xy;V8D1BCfkD&Qu=-)0K9DTnEaCeL* z@hT)f(5W`dx-C20$O=54d^ew5X6}1VZOmD`z12igjLm7XBk^_?^Oigl3-c^)bTfCr z(n*yL6j|^O*^$8qgAR{fSc6b?Zx7+C0(|iq$>lYSH-vk)wQy#Tvhp@on~aCNCz_Xs zJGm6ZbFR!`4Y4jmXYBEm-o(i+5T=ukG&+IZBw3(f;9M`&ba_ppcdpo0^V4M0nq5^P zLXwT9W@oVMRy8^v&W1HvaZ)m?&;@ag;fL#zG2to$)dp4>r!w`VjrN-QxaM@{?;RME z`!^uIn*9XM>`kEB%X?ZME(Z@AmiTs|)@>Z*OAzh)>mlOyq{Y9F55;I3z8QH|9DZkH zf4JR%rq0@NdAH>sDz9Wz9plTsR2+P-uTzybyTvWCnX#!jN8;1K7G`y;Gvy*zKYf&T zx^O)xzir~CF;{R03*o9U5!Kfqs<6)^CC~2g1O{O!neu#3d#fbYj#+dadfL}n-}3CF z&a`*nNL%T^t?r`Y>w{z?k*f3#wO}A>>DHTL-jJw4Qfa3f2m!aCt(jKlf4`)6UI&s+ z^gorwc5&u_q8f&lInm9^MtAUDZ}F)q4!l-{s5%d2 zC$5CuW>)WCIPO9hG=<^GZREM^Xfv-Ao(w zT*I0w34^<$3?&kBi?WtU>AXVjwLg7;S*+1Z)0V3%Rr(L67>j5Lfz!A%e@187zFwZz z(VgKU#8om{JlLgiNd1|{&Jx#yw&T_v+(iM^uQEDI20W{u#TGBWQ0gKon%AG_(*H5j zx}_`7(YIP|+_kBAw0B2(#L}r(@2{p^%hP#>k3k?e>a8Hk$LOhX>$xOotnb&w#vR^p ze9`{;`Xi{zc!TtN!lxRyOKrb6PP8tFFT5Tuk?SpUDi?#i#QitHVNYAnqnbpAz%g{d zN|ZT~3IB4s&MEiO2}O;R51pD?b?c36?jjeFHD^s!LhAu^sx?s;X9VtU5l>zs*CDb$CA8H$ zi%&e1D>{oUojxJxtPdK^qLKuG@F^QvFvYgQqvGu&_dDXMyXZ*PwQ+xR3 z%-8AW$XrkkG3IY|wy0R#`uggpl`Y5yqydK^Df(!Fla1h-$iB1uTSR0rsV4DTvO_rX zu9B8rm%@f#vk?!ThWYYy0^4ntlQMgUgo-i zG}$g!?y@D}HR|++@RU^1JZ8BPyY^kDKFhC5i8oH$R>_BYrQA7aY@C&)@@fTaVK9p5z1wC88t$)0t zip#p`#gwOSLyc{N=FARfb~Pt&3#3_)wC`(nVJ8#Xnf?8u)hGp`9cvqO9lx^1}sIw|YI)-@fDtoAZt?!n#_W83Vt+(^gcIAMWFkh|wsm3!kjzqoBi8M{JvUE8p4qQt^TcYgMOxu1{o!?;J*v^| zNy=rFOqD@x&$(ESr8V}9*z?{j6x4bVaz{pF_<->gE`0)>JHpRh9aO4$NY>(*Zosw& zer8qB$)icS*UR6!SlA<0E7OjARBURycc-c#aR-sNzvjB6#k=f05yIusi4z$)UL8J- zl1iAhhbku@^!Z-&8J`|wIAEq`@866XYFg{s*yCqru&%C7@O5t4+ISE^8jY@tAQyhD&p4Q6x6;l_7nmJ_jv##{Ays3#&B3=oYGfd ze?!3FJ`Z4KzX6!C3JyC9tP=tT_jv$Y{~Le-#%48k7Xk+Nc>pu~O~BTCP31!f7~JUq z#=wYV`hKbNzkL`nfCq0B#icn_N}7ewF9XA+Ps*YqD1E|hAXFcET~Hy^#BeqQs7_K= z%RqG!Zlq9lhB?^hn(IHPRZ>TwtagE_8{9~t3D6ho>m(>>qV|sAU^G7Xa4;|^tpXYx zYzn`IY0h-MTV9#T(GSaMvjl=#4rP@GSj})Fg(lG%aW1n%kP^RBUV|qKH!;KW`!a;Wi zX!)@;oOntSKx|6?6d1v9BZXQmSqKTVVEhjXKrSeaIbhbpjTE|$i*5>NI&d_sTS^W< za7x1rcz2n)1<&_M&G z7XS<&xRF8^LFXkpKL$U#^wY3^sL7zbqyyIhZlq9V{)Le7BRKzq64)7lujb&M3pY~e z5Cn%#N`BrapkxX51!%*H68JT+=yC}tVZmQC+OW{jk3oKdt@(Kbl(1k=fHthsUjU14 zhkz0m>+0ZLe~8$cUY#2-J;7Tw+e zB`nwrpbbkz=;vY4?F>-Df}H@`uu6UbEV_LGN?5QDKpU2c@XxaaVA16Ym>4dCE*ju> Qq@oCB=%n!q3XVYhANM2+D*ylh literal 14443 zcmeHOc|6qX_a9r58oQ7%vQ4r~WJyK{gUFigV#r`@*-27aQkHC`MG_%{s zYp-NWNM#H8ekNtsYVJS3dw=(Z*T;)LoO3?U`#k5I=ly<0Pn(K*6A1G)lpS$3n8(MT9z;PsK?MR)PXa&m){KbXv^6mA+kcmn z69mFM4ERRG1JLf@49TUyk(lR~hJF_f#(?mek&l>YK7Egd75Lwcpj%gL4I0NT9fVSV zK)jS75G#H(-kvsSH*s$--)|vlo`)l~foC#3M{7aPrc zP}fY}{jsh7bt)%&k}YJ_W$MLWU6Tj#cNNphss-$EgIfJ%>MV&u*|j&J^D|iWJ5`QN zmA~y=FsHe|qaPx3Mg#R{uGTBYFkgL>ddf>2QEc#2ht;9&y38K2AZM4@h_P+ zKQKUWh8aUVT`$_Ul)5VADNwldmcPEs+Pg`?$sstSwEVGlj&VQxv1Hk?%cWHzpX01` zYgL1qllUh_?d6LX;@;3H>G|7VyBKvosc7Y$Xl{{&d_H=n`1#!1yE|J=x-;g=wG>)) z1{RqxUTP}*~&niqhN z8-Q;sksW*l#q(^J=5FTOeCHaYg`+NZ!1)icMo~KO2g907^){)*KXY=spPjC)yo1T~ zLzb=d1q25InG6x6JbZ?h&LFi@2Kf=9qQuNvZ7N>wve{ z$Xn5w==6lyEr>A;3Na+?TPly~nMTLY9cxTk5arcPEpD*P9!Z_H>E%}X=+_@JXSgA1 z>(B$fB+s+6q4w}Qk!(pt^BKsJgrz?udp`0+&d&#QPJV(+eQj8##NvioZ(Q8CiPqcK z#sv*j1N2esw!thq+pkc(2WuCiG#0L2#oiflwy?NGjJyU-rkfh3ZmS>M8~Mv`&K z_fQ;*k?%FRSu}r7m!1IMM#sgz#L6sc{Q^B>qbvqUJ3|2TqD&u04JQ|Ldh+R{U8ghc z36AEELCW<3zRH=Gs~airbB~yGY|c#2D$r=qn>+OxX;dBR=v)}Td1o%|Da#5-rh|vj z!Oe7AgH=*lb)sPh1qvm#(9Z#e-Xj^}3n?S39;T0I2d83LuTDvhT zR!~F~QW)DB8q$1OJXB_)LCZYz!Ywr+9VY{C*WA~4oF-<)-Uh@s&_&A|z7Lceu2ovd zIQc=l)OUbGQDc%Vfp_4~=b*|(8omx9YiWpWEP^|5Y-v`gz$DSLiYPEN98GB4(6c9(Vp zC)gXBe;!1$i;fp{wfu*WtN>M;zQ2Qs;Qg2KW0byUtn8`O)m($y z>JXE*XSH`E3f~& zybWABQ!+WQFG!(x-yfx}$F1iq`xx#8G>m2Tmri&z^v0BIzwyACm$zWNzHGj@^EmPL77(u@bzrAbNNwO8r?KjaTI;z?5h4XC|_z zz|lv75|64jeEn7(RuQC{9_34o~!H0ufQA4>qkyxanPpbP|(Y{y5v z9Z{Zm82}>AfaX{o-A9s)VK0YS6+t;m3Ej`o{3}sC2pxla6eSy3>9YR-o!c(Tj)>cN zBbdDad5QkDKsBWY=#|qpt{elSl84yEj1*OKMQ%u+(q->L8$a6A^)7E|X9sHw zs}{X>depfDM=Q=f;iFgC^?9J;$$IhnIj86g^3qwr(izOu7ehq{H$G$uyOPi(Z=aoB zZ5hlWc@SBn9O4ly+zx81Wgh2yZD4prq)1Ds#wK4Z!xZt3JKbb)G3|lJzz3E_oyCh**3WB;oH(^-2$h;7HZ5EjiHeXJ`Nz1 z1|YPZXp5rVh)XqQr_KWp;ouvqK%slCXr^7d1+^y>`VRt99HBNqOD6;#k zNZ53@Nk42WE43i!v}KR)P+4^N6(1?9m*$&{Q;J13KPdJO$wu{DlCG$usyosB3QQZ3 z(mFJKgsH5UQ)%#2l?cxVKiy^d)Okq$G&@7Xi3ji?sBwg9fXP#T!+M~Vwh4`R9c?^z%!ZtTwMi0WbS&NjuZ}Hm6_`<9O9`^3wCS?b3 zrN0}*KW)@hVYf@ZF5Rx$#7!(5-r~U&CdJt$G6!a%P9fENX_RKP;E36tiLbVPLRPOm zXu9^FHMc&>aCqjE+|0%o7*E$mM)Zo#n_D;~Q~lTSK;3$lQ~2*nZ1VEW&L60Zt9K$A}Rp@7LTQ zt!&M`p)Lb|%62ZktvKc*afWv><*OGL!(gS7CEuh#5Et)aOm-FGOK$p?+tZSdN3v5E zr@s2dj~_od_S|6Kq_EA3=R>%4`pcSv4o@lGOJ<2>6}vwAfLFd9k5_wWHvyb1_;BJi z&|CmT%(ITk4l&-NBZV50(^^`v2CDE+F zsyrdCm z>YOvU_yF_m&V-k_VwRD21B#Z8he6Ll#pqqB`8;o3zRfsXYuHB95b4BEVdTJFyNfNZ zVDuraVZvRgu{+(hD5RD+O*$PH8#@YBSef>0H=>cz?vA>}8_O4R6OY-qCzSAIXsSD# zMbiBtq3=+erDqGdl*)cOx6XD?klz{U)6EA2S@Pq)coxc}ZC~h}v#jF$V!7Bkb;kEq z@zJVo!w_KO;nz69gq)vz-k~;SaH?0M2fwTBbCObHq2xr9If?m z2qB{$ondRA_G+J9bKQXhT_Z+bn+hEj!Lfdbhlf9Ou}XJp)f*dy%sw{g>@*kD-23KQ z1ocN5F0+apcy_wr4H+#3+7-w!3UXCyTB=QPw#S#_i^7?{(!rLX*=<604|peJk0?dI zj9KYlwAc`|?^24{^F^h^KYf%UIJ#98Jfq9tbKAFZMMqol`VZHhzrg4qg;R%X0Jt4& zs~9-}xH$p1x$rkGskV9?037oWfZp7qjTwJ7yqa;3xOOAL+9Tj<NoA$<3`tzh1PjPfm3qOb33sl?IiLPAr=&n%R= z-l&g`HciYN_S@Qb_#yv5M$FQM^JB3ok%1{-wSdQE-deZ4_6~&yi)8&5VnojvMn-x_B$*;MuR~l}V#WtI{3ap~Yy1yj8pHpP41U!#F%^MpI*R5e!T%s800$Ui z;uj8xPr0}b;%NyTU|aU@*9Bu6{{V$+JAsbSHck^E&p7_*LI56-{3*_?>3+68@NlPfx2;g_3Bfersj(WQLc!|k~!^Cz= z?v@mT?G}T<$kcC^2nw$-nbv=B`sFv^KNE;Zmo`cA9pF?sK#IcDAgo-oCPg)`ll{LV z`$-pRaq@v!tJeSwn0j3cfWBH%u`sOj_5U8eUYdMp392>#Fs5HNtu^H}fY*uq{|u2Y zL4JB*+wS)Yyq+2VHHimzK>Zud8E0<#d1|av|Nm#yzYO`CnrJZnC4lRiz(1!6tdb*y z2lzeYLrr*=|Bf{FJipf8u@VH~XoP$m_MVp zaiy85ZDhaV+A#+3N#Y2D`=*5Aq%_`}Hf<-{Cqab)JY!W$l5b0>T1wz~9^)n3Gj^Q# z{Sp((nfSm`@ssToXO<<;E3VEdgZHYHJIMA*bdLFz@1LE#f9xB~w$BzM+c)7k{8z3? z=d&o@YjX&Z=lX9XGx6~dcLRM8CeIh zgG9-5iMMyKnlwp|#Iqo-M;kFRJz_t=24oQSq6$fu#N!UG%V}})ToRWitgMgYc2&b> Q0x^LCfh!YTl9=TCpf3k)+A)=4 z0ATvdl^Em7`|P2-!k_^2 zd%)mq$=#_Pf%kPnN!RRD5%4yJ(R6iyd_z-JjCIdwI>vVap`@4EzRRM>DM{+ zp>Di?(Uv($%p@EhrH8pwj2Z<+5>2ey7*m8U2xy+ML_m6tW*&Un?(hy8Y=h-W7~tW! zXNnY#v;h0!E~C3^a9SlA3|`Oc#%}RHXVdU-3(Kpi>(no@cp-S|n(EY*n#SQ#&JFIlyW16|WEU$~f7x%yVFxKdHQ6gyk>cz*oW-ltYWdGmEBji-hq zix4t9iiL6dX$3Nq3A*IDz(mWaHA!R!Yv=r^l02bGqCmeSE*Ro(1u?P1w?IM5#N+XN z=EQWhIjkgU&{#@hyE|n#xuF1PQf6#nUce3RR`Mp`o^k{BR{vGOt0N|{>*15^QH)Q&x!<&y~QkN?CZh#Zay*w9IQLS zbh7ykV_^nT6;KrRk#tXx zxTg`}T~`z$;N*6cZGO^?qcVmbrvA9%v2qXcocwrDLOW}$y4m|+wTV`(#k|nJ^=pVD zLWlIG1ro(a?mPxGP%Os#LoOcHe?RlQI-+rg5aCSL)HyJ(h~Kq%zc9ZoTA=89b!hA4 z3ss(?KK!{}#-8fp-OFSXpoZ<$Cb5#`B^mdu%G2dT3$;dEp~KsdB}Ps{lilJ_FYmN6 z=kNsufWB1Ps2gce3g!$HKLlPSVvkv;Qgl}Rf@g;Ak?Y28S65EA7>c!J537Z)NsG)t zo02LIK~7Vp<)ccwpMGhH&R$q*S;Dyd5SZq+UpNjEZ-`HVaYPMFK^pkXeOCNZ<6o)u z98c9U)N&ewfpgFCmrtLh3GOMznq9F;U9x7w!374{I?hR)nV4h+$Nd26}e7 zJgCue!Lskmx#874Wj3X!tQHI6kbC=FkKb`NDXrmk!0Pn(tNTWV#a>mOG|=4XC|mQs z+ix%EiA}d=y}{G7KV8uB0X!8^{td_2YCqd_8`=?K0PoEBN}IXl8R?clf7+;-h?myfnN ztShXC??k@w@E;QepbC#2i0~S@quNgyuRldlj1s40uK0;?ziQC3mzb6c5fP_cDr`h9 z73VJ9oPl<}Da@pNSP4jY_ebc|W7C7vdz@E&n>8JC2U^Pde3ivhY?5p1_kARAt$sZh z)?B<^YDM~`_Q&}7|4@00y83^W|7TQo1;+?xQw?qN?~c!t{_Ysx-^q-Nv%@FCK#0s? zi)Fcj;KmXekxT8^+~-N#0u^!#4|_4rg5!)YOM+$PDgt^IS$P#Z6j$1mcV~uw5|*z? zwBhl5=J2N{SCuaB&HluJVtM+DjISgDIq&RF69Wjm-bK8$en__600Th zjv+j4Vp#v9XyVpdV2lQ*WDA}xzK>Q335eQ%wo6xE1d;LX|SwfaLudkkhz^QF89RAswcE! z2QIC?pxA8-Ihc}Z^O&NQe21VFEi`mQ!#B28bAH!Ok=R%V@xY1Ji%B3iWyQ7HPV1!H z(2XErYAOg*QvpuD;j#G5)qM-!cXjI<{RIVW8=duzPS1*V^a)Dsn7cqi8k$A|{*YyE zhFK@I7wPD{HGEHkLc9hmE53NPB{#CBB`+0Xd8+ZYe@U~vWoquLzAGkk4=9RX8o6%A zQ&ONO(j3`ztz0;a1RAj=h$s9C% z;=2sklN+%Q!zf14`)ML;RQTEH*bM>huihY92N zq|Nii8i}lZoler%X@d%*O&h;l*4mM{an|?-qZ$T~dlpE&t=p$9mqmrGn>?u0E!A10 zZfvL5Q&Xr;rIY%jx&A$KbDy`K=?kQr`BPn>l7G?h8SB$>ac#95+m7{Q`)zcS*P4*u zFPG-Hji=yx=#lMoL)V>&^iek!-1Sr=syFErc2oKNC-yhXZ>}-l*wjp>rt$zg-Rz$$ W&Q@qti5nW*xHJ+f zEpu;llS*lliJO>e%98!sANy13IjUDkZgc;8{rXddE-*kAa1HeKKJMsB3Lt3`ZC3z73?+io>=}3r1_%fb2myfbAb7D_vWW7+ zA}&EF7Yfxk&>y9(fk9!>STqWQMPV>}R?k20yx>r&fS;VZy$ z(49G!$@UB}WGb&u0qpeiA$i$WI>a&k?N=luQ*G3VZNx5uR2@DJc66dOC*x!7i_t7} zZb9c^Y@&1LR?CN(*(_0$9ggNdk7cpeUW+K-kr|%(Knvv(tP%=MRqF~H9^FlBfRu}e z9_e7qY#gx9{QajUvtCzkJYKi+w+4cPUDkygCZ=iG6g&p`X8P51$H_~?99vr^-PN!` zrG<59tlixfd%fwp%|7FzsGG$$-s`Gw5ucs$=4~e*Qr}f+Kh{}(BUvR}kDU3cKJBl^ z&Kn_uPB_D<=wTSiGAL%)Wvarc%&6?%`{L+U`8KOH3M-*px}(x-(u_$StoA^3UDU8t zn_eG}dOu0Vrp9hC;zQvG!|>F-%C=hnD<7?|88&6O??&~EdXBVMKi@mTtaU)AlQE7f znRhVt;;uPxRl$8?58PP^k1F2kXr9U*>hDiXZt)95gkoBCQE4%)^*4HM(p!9aC{3E> zHdVyvnn&Ka|JrpKuT2WchxFI9%ctj;ym{Qx5g|0&!E-IZ>Q_-S0$adC5CAmzZvj`g zpdU6svaUsyzC>(ar=*U+tNtf_4fgM-S0#C?w3#u;T~NXXWb&>Qn$m+mPA z%C9MQ+d>O7YShd_vFS+m{XPL}UB<)h2{D`N^u{IBb*mVoID`y1@S5^_8L}AP_M*vt@`8C8T>RQm3YC zeUSRIj&aWe6YbZ3?wu&JC*C;M?&2>M>64gQ`0nA2Hw5tugAB@0!RE%zfd|IpQ>^g? z4nql-0s~N6m2MZ~E=I|&!BFD~8$y;iY{TWDioOQFlODJ)gdqs z&jg~Ws7IrEAuW#P$AWSt)NB3gdGu{j32si}du7dEdTal2{0;HX7}>S0@%2W*td}7T zJnx;k!6958CvfM-o&ifrOKjQTfHy@U_}z}@+{6$57tdAobZ0d;@3g(slj|cUr*WI zm-^t=Y!06PU^Au!xVuq*BsMD1K;#5kRJg8B)of4t{x~wZB)moTHs>xa*rorOsNqC> z+I_3H@}0d|u%?zN0+JwIy@J4jHxsjuH>!_MeM&uZdq!R&e4`6!Lytfk(&oP_$v%sX zXo=?Jq*Pc`p(SIEj7o+HO7eco*CFl*zOEOBovSLg-B~cJB<*+wg;UOQnRgz`p$HHohOlAeYeEwE;BWb+bPCL zk?3+WN2iFKp7!WC;_AA7NsooQ3w8I*a~$qMBM7DK&;rv6tl=qID(4 znb)oV9cAU5f-pV{tqXmeaSPKiqD_eRia@TE`)H-`@_4Gfe|8v2?G2=Aa+7FW4|BNC z+|#XiYHTL@T&KMb&hK#N>aK!S9)%%Ifl0_%l;j}s@m+23Ju<#Gf&u&QO3$m!@y<|J ztp4n7n9io;jPiELf>q~8xGU|eDTKpv$0Z!4&fS41TD*K2twE8}#o9LXozyFQq6tk- zv5KL2j&QmYjiXH(tOD5>n!9O}SKPm-t+f7-3QpSS<7u4Q{VPc=qHyYmqkksv=~WvY z-{EMFx;q)c7;N^=#w)>RDjhlf7UBUb_KLsX*_W(=mx?6LnD0LyUO8(sBgU(lg`_JP zQ@Y*&5w>v9ht ztxNGvz4^r4K@+QFoDn~A4rhqjA7@`{BlJt0LB2@Kw%oL1dImTY3F_O5SGW|;jx`|F zd+k4=t7QFCHD|y%d#wu-H3g4vY8Gd-D-!JQorPU!4*yS`x{Y{m%x$-Q6`w-AK{nC^4{Ph7Lh}g4tVslJ~X)>ppWFqV)jqo8mxtaT6a9uhxd{cx8!|l^zbS5 z!ICcZU?UUh_ra`jEw%E53i;?&&PjEgV-q**^MVI&T#i6qHENw=k9Ep)p9!)YncJnB ziz$xsRAiOnIraXSQFyA?Z8wKkw9;m$;QExp5=>CCL!%BUpn>tIwm827S0F-clqtEZ zGD7T!J87DIa6dX#a;BTQR^~)jc1r#Z#_3CKWu{EH*6c<6DoNq^O8>4b`s(_Ex5}sO zAE2W-VdbyKhi2Vbsx$AR{Z3Lg#*8^%D;2&yS`+VwtSN!Ya@nU8xrf+B7KV}Cp=f=a z`-ID$iC^#^_$eTxtWkd=tBh^2g`c=VY#?k*3{GTZJ0-n3x0cgVa!1{p`C4{=$ zKkR!FQj)d?ui!P|7_SL;VPQIUKWmvjkQRg4{=Ry+C5N$5>_Ab5rgB2Yi!}o=Hm=|U zM&3ai9hGU@(O%sYiJf)bBNNmYGF4_hz3rA-*-e=g!gKWmC?nmpuM6|K8u}rpK~6IY zuYWV5S#%98@y$9L?}Ol#Rwr_O2*FnC$s>wa#YlxtCH;oRvb{r?!@dK7WtSftvNFQj zLQ@_|yXz9N_1e5{qCP%7dYaaf|w$v~aB^SA)Ct%T6C3s;@DG(!tji+|em9)F77 z?CdDNvU2 z#ES|10El!m28X*q*#I_dfMGsWCLG zl|uFC*%=ZfA3nwA-qx68Q4g-2v%8g6)tSdMR~Sxz%arOV_V#4;qUAwrT!iDK-L&C& zeC=h`1Qx#NcYWBa_3dbwJ>eQT;IhZ2&Ey#8nZdMN6U-9~0z^==BOOT*iVJg+zAm4d zlUCqr7pHSKEwh+Yedu9kZF6<@7F&Ip+zb}tW@GFFc1$VR}GA-FN#90VW*m;-Bo2ZLj z?mEGR{UR$lgx}`zv|Zwd!9p6s%h_s`FMK%=+lqI~XR<}-P4*82)CMgCoVBvy5BLv; z`}?G&c6))uw#pZf^x1p^FrNAT=C&4GAiwpVhXD-;x^P4U_*j&uWw!;u78!AXx~w`2 zRDSQdn7R+>d;zIno#^+OOTF1*W)tv#NbsYZ*xESgxE--Y_c_6nfc5$*Q@jWpYuEC{D0=AOw8;6?en<8-}39r z_Xh_c2Dkcl~~Ro-?}Z9bW*3+>iC+tqmaFUZ?zryHg=$Yj`ZX$Z#(FFYXrZ9fom6!qOly&m zVP`j)-Hv;!c6+84p$!^Z)uWd+awg|*>GZLW2?l4WVt76wn~nnNnJDSLy9-({>zyZc z@8Z~H`DEqE?hFa`a&c_V;&WQbTk{C1540a^b@`$Or|Y>!2;IfJR!o2-g5DnE@6j=vIi z7kx`kRXRpQdYGv5{*r;KmQ=>#?WxrFn`ynZEY5wd3@QQ?#|IQgk$!R9u*Ba=W2xuY zD9)>MAAQC?m@LUjRB0(Un`WxS+R7TiHIGo{gElQPAm83fj2v)qXR((ohBu4DJvA&u z%=P&`!jS=m^7`RtM`8BH_Hqz;M&1*0$CEDddN`otaF-fw(+Z#;x~~i0{iVtwWZ<<&DJ?A`~87-;q~8ihJplYFpw1xu}%S*wQhmbV=(|bQ z>JEkjce4L1Dlf1mn0>$?^c6SWPt4e3vtW1UW!Tsw&zp{RP?n*aO=BqSbdp&Ae0Yh3 z$(O5Nh8vQblHcl=06XRc4jS5!&4%GAxE=wjYDj~q!JeY>UOkxvMK=e$IzmLy@_e5t zyK0kog(8F#LAkO2(XJVXLs}G=ASq?>TDqkQOI~tdDXBDbWXcn3y&=hLdMhwm*2VE+ zuDC@^rO6*~0_eQX>&M+*&sWk6@S1oJ;Nw|8jsGrjY!&sJox#QhlmzLkHhy z_0Ui~zaUgcR#1K{Umfwx^6G^ zNA8n5nHAj`CKkibzftsIr))-eN-^w>PM_t{n%;OhWJ4F zD;Ig>NP9l4P@J|eS1PVVzv2Wep72BLb=Kwo#r|Itd$Xylv>!O$3jgkM#1TBP0l!t- zLOXqv1ZUEKm46g$A)Z6?(Zd-93G&mjlbsbO4%c}Ej9I;CuCyII5sBhbBUE_hx*>7= zr&w~g-^)ks=X)JGG;-OByK%7P!{w&B?7GUWb$XIFyq+ZcZ^RA@#VWP4v>fAau~zXw zBZ5s%xMm)?u}f&zY>#(o!qv=NefHofBZj7Dj3gu7l$@bb>tcBS&@?9dIv^=!bpP@} zKkmq~42&~rMi{{sWH3ne$r>V0?IPg?55#P^a91jOp1SuawrbRWfY^YZd!@`bL(}j2>#S(|*G$Kzp37>mV+0)B*g%5SwUAT%V2IJz9y2;dH$-62R z8eDM|U1GfxBgwNHS^QvP`n_w__cf=JW^U-^mvC_RYI^ zAYNpv$%Z7jClIOgoii05J{^82qSQv-zdiUxz^B5~=$hks0iR}C-sgX$oV{V+ijkYw zjHsQ~SYF$-A->!_RR0vTKujiC!KJ@th4Yx*_fhZh;B;jno@!&nQU zZmwfj)twKB$oiruhb9cU(}O)E?YPdmIc}lIs3vv1jWvaO$2uk7f1q$Og~IzNBV^Po z;V;RZGg@yZl4PZ9`9o%v6Q$ZZVka17W}_36A=S+Sy~h@-1kB)!7hYelA30aaH0eZ1 zEyIuNjOy6$HABLY(F`mfYyCoeOW#9Cm#X}mA;ROMs;^ER+8$nSb}es5@d!^?Xw;Iy z8;)fwOe^b3KdnxGS-%(63%A-((ki>gc{4En$+Vt<7yi!Y1!nbVudD1lJKmbd*)h0wvwsOOgHnVDF?~Fux93_v|e1!cC|5B zNmkoHH+k76{61aU#I3%dRzf9M>4vVjuIQt#UY{$e@7d_4u34u4#Qrw&Yj?~yHrUBv eE1PrB&Hj1gtcQkx$^ig7`1J$3=OzdMfd2xnX2FsG diff --git a/datasets/common_voice/dummy/cnh/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cnh/6.1.0/dummy_data.zip deleted file mode 100644 index 9c0004c00ee9597beaf91b600870f6e7ad2fd337..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3886 zcmeH~c{r47AIG1;*kza9si+Vc=dv6e%VggbLLAGO6Iq8b$#Oz=g)y2g<&{vjh!~ur zn4~1x8)Zqh91`!?lJGv0u9zdIasGa1u4nmUuHWarzrW@Fj-?4b11muJj1~JI|MK$n z#s-K2?m<32=bYVL(XK~rtXTm1tOHgqlq)Eh4S+CA(*pp*2k>UOV-QK|AP9^c!Y{x- zC{RvW9ww)#pr{}RQqfQ2!!$-m_?BcR-;h`8|S_?590g_~`bB&I

EQuv6Kb=lKQRhO%(`G>Tu~n!}XvxaR09@ll z%iL&G?9lX0as68N{ufoo!jOvh=SD@6^*z%p-Ehv=C7{(w?eK)GmXa%xP7?8Mj=bx8 z+gAZi($6YkmkH>EyCsgX=_R8~cb_k$BaX~ut{?o&NGX_2!L|9OZhr*je-Da&bQglV zBQZ3{?E}iK+h)pNV2w7i`;<*dExUD&{ade zSEu%tm~+Q;#VC83^$TVrui|HFF4yG$IX`>`9zWpYz|nuD<8i%bzZ7d$>uyH(PW?l1BU+?-bH?peRtm$TBmj^HEdqd~EDX?Ld+i?rk z03jJr+)nUP+J(RvS8q?S`H}A5WTsT~cdFVa#l-iK0y~s6DIFFS#j`N7@3kxgpV4D| zH+7GZ#ZNX-zO2^81`*xYuv2b*tCa=!{~M0EjwmyGE>_(|`lH$+e`RS=WZC_2`KAcb3g1zme@sQ!{l)c{-aqNbg2QW46tDbm4@8K%(K8_u~;g{96gvGvhY zL;mt6ft)=^ksN}pjB31$wyY7q$?VGvb&mPt%^{_oCnUYn6m?mU^O3}LI;G2GzTi;p zwid}S$E>$RocNq`oWy2gD5GX;Vib0f7&I~v{VgXuOq&1ayw^I2S@yry;g`M{Ti)flr9x~8}{Cab>93}?Ub%-)_d+w=8QZ<~(*&T+<7y1@{?>QUG)P@-8 zs17GX$0zD}CxO_NA+NY|M=y}tsirbIlXZnuuEtg&J6pm_}ASNDTXYvcqxcS>^rfYt2{i|qeftsZ?f>Qu|P;+X=7*{W; z)BuW8cJWZJ)Au)=^|W0*b`Lmt!k(&F2=@dg&quL z$ubJvo2hfaJnSV)U8`DLa;E;JW|xbQCjx_|1!64NiBu->sA=ZN>Y#xKsMGi?ufJf8 z0>iC{0ghT(`2aq>$-GlN3E~miH_R%C>o?D8&9?>MLX1l0Zxj?jJodX&uTdm1Bi&IM!VR zlms>duIlJf2mA;3`eV{|@488v*2$%jwBcky8pX+ecVC;firU9+(7?h6@*<(&V~bEb z?u$a(;%Y(cj``WdQoFgWSPVE(X~ll?c0Y!0cQ{+2B=G+UtYxWf}hi5-4I7 diff --git a/datasets/common_voice/dummy/cs/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cs/6.1.0/dummy_data.zip deleted file mode 100644 index 555ddd9291d931584bc740caddd92b54eac0ca31..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3928 zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3O)7eg%I zUtovJ`657Hb1+<8loa%Arfm}=BLl;CAT}W0f|8=d%sjo4;xceRO`KF11k<-Ew8%Dx&Ak1J+J7YfMjh|OIdHeX}n z*K_5|zBg|YDM>fk>iEG=efE@Ra_lT=$?SK9T_z_f@vdy{xTjViHRr11&4yb%6H1;7 zCFeLwO3MgE7`f@=RQ$^>PHo?a z;;COhKh?j?4@xPBpl(_pwuA#1%zuGckNBWYNi8ELpqUi~SRc%q(UO@+DNudPtzIh-;@xGafnZt%t8>?J7@F(G0XxR%RqFz}685UNw~ct_}+ zq6u7ECa7GS^ZM~!k;0zW8Z$PnDmSZ~_T=Y3&gZ|`bGK`MRlV3W{nOWZh0ZLAOXluU zC{3;F@Q69N|LG3-bCq#_K9=p>dog^8=(VaH2@6-g>3caTD)wB7`}I9rzNmkYMufM= zyriJRY4f*)0>k4*}p`pnfU)R zOqj}DzO#jm#nJO|?7CySC-OhmV!t35q{+_xc*lnx>xMrL``BlmzxDm_&RN{=)i?LL z?OyS-(lX|Hta*bME`??vHy-cW2%Ak#0=PO)=*83zjYj3_dHuTCH8AucU^1#A5a&kZrS>1WVR*@KhhN6d-pyxgf8&{e?@P9@O?EGu^}BdogvW{LU!o83 zo#YEX(Aw}_*Ynt$#3S3b&W`$>Bzv+F>ehH~m1FdiQ>~7gw-*ZR5KY(;)Mu%< zW24Z{V{2X2VwQ(qIFs}sBWCwO6R!(B{IMAi%feV#^jRl68l5XyUiyK}dU@|wzP836 zHoiZ9=6$*G`%-b5j(Jedvd9V7ZRTb;&N5h2Zl%BT%JJT~gSxM6kM2_JDOnOHrDFEx zON(IqiT$A`<*)v#JM&oDc1q2n^r{0BuJKnbeOZ|Oxq>CCuu1A~;oncyv+i;%T{nNW zTWWpcen!*`l(o|}Ob8eStiUKRBR&IV=3$nCIP%fiiKlac5z%)3jmSKO3x}t@QQg(# zI5|Y=hoW-W-NdR>8`~F2f0>lFewFWT=gMF9O)NheO`1$FEzJ+Tpb$ouS+_(LE)=|~B7E812p8x$7 z_0(f|2?H;i*G0Df*G}G@Td;p$;MV(k7Ga;i2BpqS>FE}kxbepW|HXU0^+j)tp1$R2 z1q181^B=iEF#u^~f{HNYMy4t8F#rw!%zV^V7AO+Xs-XaHMkYCCT&-FOsQm&AZyiBQ zlr}Caq>W30fhr6L1OGa@q8f;&y-V08%+@Z#rnioZiL?pa_61r7YWrgK7NR+Z+VX{1 z2Fxi8OBz{#6jC_h^cK)Eq;@dSR-Cbeuoblth-_;3f!!G* z?L;dOfHq@k5<$`;!ndgPH?qx(Sc$e7QJWL?E-1&LRy)X+Ca@E2DN-dwoUN!OEwZhT wIf%9uquj-3H?A@UVmB})1Cw$pP!VYn3@>qTS-du7F(u0NlM=^#A|> diff --git a/datasets/common_voice/dummy/cv/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cv/6.1.0/dummy_data.zip deleted file mode 100644 index 037451f05570f46163441670e1517e57a90ae04e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4456 zcmeH~X*iT^8^`Y%V>cN~S&~8W$c$x>r4W+ryF^(BWgTW1R1`8%>Oe}iQ3%PwljA>pU%tnIIg%@-+*h>L`+R=-s@Z_9Y&WY=MD#^} z5CB{R1AqeKfFP`^m%m&P4*zAN5{X(aA1`VBxKU3Mbj%tRV0(K;`PD&Kw_SKMYxP%w z*C{F+lgs?h!jNwYGe9aJes<}NcBBU}f&!ZYm};gUf7*?1i=Jl7e^=jnGc_}dq+}Gp z(fago^j@VFrO~vrZ_JOJZY8J%hdx9C>i?+%?b06;Hj~y=tZkJYE5lzw48_zH1`szZ z-y#tOZYhf;m+Fxb*E5*Cih1%ns9`r^STj3;zqCiLSyA8YoU=Lz*%n+{)qb+J`JsWe z%nc(@p0xMKmVLI}F?uXd+&gk<=K)rd;IKoi(Zf%nihRZQp$pGE1BLEXaloe)H3*KO z?3hJ@x4eA~JI02Z*c%gGv%yin@58&Rg9ObOcmG#omGcx#r?T+#MCvv>W20kJH5Wu3 zyt7ryrjC-NI^~2GCQt~$!$B_fpJmm;fw?gAV;bS|5hs&w$M-BsafY07Gu22jjPtqk z9C;i@>JOf(W4g-5JLwFe40>H=6Ujw~yN=_M?KLFq+<^q9Ub`VlwT--+ zH`IB1etz8ZSmS^kskl5w9O3Ak6f8}GNcI%@T6a}R<=xqS!b)?WVoS%+#&*Cp35j@S z%vBo3^N{nBk0z65c|jEzYN8PP%3u%DG+SArZ742eW`hA?0)|sQ6_l_EmD!n zW+gCnVT4D#NkLPeUVb&lhV$SiWKxDbd7C7MLPw(hGm|({w6-mOSLh^IA=9|f8w~H@ihKq{H>;W94Cy$!e2R4Wlo^)~@9OJC z_b(6k?;NH|+E?5a{eLHcGkIpv`_N z*K5-R=o3nyu=Z-Ym3$MzSpIcb80tpRM357OaGOTSQ4S{ChWDC;y!8vW@mLI;wd*J@ zw0FEX@5EE?TS!jxvLx>~Xml)})4{r~i?H)9Th}I^8uJ9>2dKZ(%I%fw~Q z_Wzp~la1|C^+U00pN$d| zxfVs%1?jf$sDH5SMn-W1ZfC%u|y-Ey#gj>a97_t$&Oc8&&ScX?7dZ3 zP7X|b_y|*FUn*Ps6@6LXsqQeZaEgT9)u@nGN2{H0xQ2AdBGJ%>N4{Z6PU9i62<-s^$3R3+i*bEn(1^C< zqsZZDUScKfmU|Gz(^~cKi5-0L<>W2x!hG#s<~JOvFv|mFh^%XDuYFC1gomtLw_1;i zhTDkagmzp-m0ChE6^5V+Zl`~mr<2RAg)*+N7)J7R3lb0U+uuFuQo#p#3L1?)*E5jj zTMU->P*x9ARxvsUpUJuKF=p_&{865J!)P1LXPkMZ<45lXMbBgc$4h*oy1P!7(Mq&% z+10eAA?_U-PHeQdohL3OPdsg-MGGMcH+J#~o+r-UFg8gSV$DGeTH`NQ{fwy-L0P=brbG1sNLJ49!)_?|<= zbBhqfrLXmqh6tP&j9V_>im|>Va~;Pj$E|3PS8UlA;F00pjdDG=!RDCwJtV-zW~Ww! zRI0I{;I>wL;1HRmj8F=|1&?oqQ1}JGf_^b>M81Ne!pZvwbp`9)nkrzr)+z;~QB&jd zsoggpb;wLwZ4Q~vAfvr8&qEtR>n^KQ&Q&d`o665!?yE4v+-&s1VLYd!((&5TFLLcA zLgM@j2ln)vJcA-z@9(ImNtMFcPQ*EKm+yNiw})zHa!(B6M8;{jr%8Bgd~n!*FEdov zy5H~E+{qF?E$qX9g^g{M6N6f{pk?vC4js!Dzk2A{!MJ$er1Qe8?Yfus`)ND@fkm0t zn{oud@nOJ(&hqP;nTPr7%<~5UC4rTIi#mJP2K<-#=l7(ar=Jzlgia2Fq%X4&9kx6R z{c>3=9rM~r=qrp~WI%2tg#Nc`pVnO#9kyyNqEpvZXN9`9PhU-?(yN<6>i4ew_spN0 z%hk+u2*b=Dyo65vMVo11yqt?Gt6kgQtR^S1Fic+Qc7DIy<%(N-#IA;(Vq+M(>cC}; zy4IvuQ>*B;$)K=5S@yrN*YlXKY=~v50N}rr Cg>EeX diff --git a/datasets/common_voice/dummy/cy/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cy/6.1.0/dummy_data.zip deleted file mode 100644 index b9cf883295201cb5075150393a9ac7d2e16854df..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4182 zcmeH~dpy(oAIFC+%@)dCG97aqzYKHhxJDuxayv-wn{pd+Sz$sonrfQ4cI70WH0r76n*+ zzWI8Afh0h<5IjEA1?Lv*ruc^q6vUM!ZRxu72{{G>ae!G|AP{&8xLB-P1i88hmVm!J`G&kurCzeN=OkQsOv*dCq;btOF0vMea9lj z?^ku}-I^D;Jkx!LZk6;n2Kl}=)UHHqP2RitkF9eci2i}A0n13c2=(JmQ+SQjf=StW6Ydt09Vu&Sl z1LryO36%ALc7*@*rlp;hn%62TQ6vWE9vclI&($jGvw}M(J%2q!nY{6+;jS2; z(A}g+q?x4|I@VqYech}|_=;NfgG#jp?h-6^;?lM)?|x5U(Nz3!Tk8-A&ji>59nC&j zm4L$mqE)QDlEldf(HPH73GbX--$H5XhXTYA`Q6Ood(2G9v%HIx zx}qI;OAch~1*FS8;3E}UbhK*F=EIoR*6aND)G|hLiZMllg_i zn&dTL60fr!7gs7mW}DFR79*s^+4Kd(vJ>MhcdH(!ohT>32`}J8m3=1!H(xJcj(Fg{ znOSO&AGC-E)r69#2ein&tEUAL-6fMFNDNTmF;EXQFUq%>rBzW#U?nwt(9sq{sFaN> z(ZZTE3UXo2)ZXVBABN0z1@V3Jw5C??v7}>mQT>^d<=eGuj-JD$G?>yQT4OVmj{U|P zXGrz-b_-x$)`+<)1fPEaxg8jyoHH(YUe0f; zvb&4j?w#u@5P9~BK~~Sv-UwO z{rctGgJt7~BFZV8*DAeRLyb>}oq~|@?Gh)=1by<+7mRuZ-LMftv;mXow}{*~YIBYa zVv)J^N9&GUy;(I)q1D^fwashR%b7o$v1niLDGSj}joI}^^L{n4agl3T<X2Sf z_T}D-c1#JWi^g}+t{D9%J=3hBi>y+ISJeH;UH2Kr4;5iWTc_-1M0bmm!-f?a<`wvE zFM38AWEghbx1Zu6$5@wS1|+@m8IB;zPj8N4L8%LR_iIN>tC2XN1B_>iC zFbiIe^FGs1d`DG|-Vw2K+tvi@|Ckfo#2l-PHqzfXsw4}_rd5E2rMAXGFvfB7)iZqz zSR1JmhBQ7N_bTY7#ta=se-Y-QoA%5gt9a{V7zNi|HhA{=e9yxHdqu_!Be)~Cuz?yM z-ksl&U1Y?VjZHS}x^wMiHcz7E=)HpBVx)g9WY0B1g#t!=Fzbv=HvZNvQ<`ksh?$&# z9;HzODd~t75)hDaJL?|wC)7wl{8)5U>zg+eDm*3iHyH1}*SYS#4!H&-}o47afEbb~QWlU}6jLVbAtq?+*Z+i3*UFIb0M3v?( zNNud2u67ZzI@<3)DHpX9H^Xp=jL*!DYbjsEyB$({WAwoy>NnF~?%jt6((IxUIA55j z4rG5Xn4)pk+ReB?`Ob;BE8Ec; z*#`1|iR4?37kY5}2HIGSadup6DlZ*H9o4UY5~|jgoPhsOJ)Ak;)-^$#`ftrUsAM;M zuwh3@vG?dqzsQlA`k~rtkHd_Q4%MkGh0`_&nb2a0Z~iZL;)v;TF&jALF414#=)-NP znIsCFvnW8@w4FG(S&LeOJcFw(OHz_HLhU@g&#%PWI!)Fj?|?EKN>6E44n5baCjSi4 zl0h(uRm6_A#E&8wrn|yiDD7bGI!V>BOS3_=wvl(ix|+rVU86+Gobt?Zp1F__)_xw$ zg8#bXP`fHTIo;HR|QRI%6(rT@%JRI3>B(%k-jO`TQW3{?hHLgb(?W@c%_Y zBxt$dqQ2qkg8z{Ae@vT-I{fT%G&Bz<}NZbiE8sjDOIN@_gN^lVbU1>T=C*G9ONOldIN%k6Pv%oCv#pTtm zP8=)AQxLYv%lYHy%U$ZY)d%ECXeBq>&=toed(_pldnGj<$~JY)B>yM&mzG~UW4^F~ goeXScER1dT_Y-G5G-Sh4X&&J12kf5v000E~8>eUM)&Kwi diff --git a/datasets/common_voice/dummy/de/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/de/6.1.0/dummy_data.zip deleted file mode 100644 index 1f7d059d1837552904bee2d91a141c639c47e535..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4117 zcmeH}dpy(oAICShTnnv4E;IN0HidY&{@mJ#gd(|Ylcrc^oH*p*C%3S5p^l7WX~f*( z=MtSrF1h9INa#coPDQIk{JyImAvx{*@%!($J@(!AulMu&`944I_vdAg=H%iBu|5;| zryV}se7OjKBtf_!Kfe$UoEOPU%gK=s#7UJq=*jv8oe=NxMufNfbun>{`RST!%GFB{_^ z#gQ2c1%p7!93T*!eE^B*h4t`mDr`R8uA*o1hMPz5 zk_v1BG{xZV*P+yT@T?RKE9th}H_I-Y>S?#u&JP(XR#5dri4+bx=RUUQ7>gcNybBdmE1MWUC_oLm&*)>wxqE`PCgnOd zkCFtW#9Ix_qasksdMmPFg$4h)6o@@@j!Cg7xXhfLxtxqOc4+tUR){G_*fHm6Z5qtj z(rUud=`qm;k1-F{Mp`uizDBuV4wx_pOlY!i0_Pq4V-KdueQ?2Z15Y4TAey|NXf>DY zAXGF&X7t{P&MhR4HRPDI3zoSx#Ze+c1 z)9hTO^9-!eZi291t|3megb(9?5#xUKco`!AKW9sV{>1|eR(0vD%^A-?9AI`>RgMIA zEV$n))p`4%zWeqG4k44?gR)$H(J&N$a#V%2k;^PIi!&W8Qg?zLmEg>cEsMXYlw-)G zo+4&!uhDG%K=eQE2Ga~Jc2?;#RIfM?ra;9tz(!-;QfwUt2Nn?6d~zmvK{YdlI1)0H zZ>~u`tlv!>LAMy#kv-skXM`=4yI^KZ6*V8#UN#9Xh~teSlCF$CpqVcu(dlAklhJfl zE1StqMjg+kaKAV08L;VhkJJbhnTK;Bc6(E-@F|2@+vVjF_kAnadoru8tf?4>k{y}F zD8SVV;J1f;SHWJV@W6QS#(g!$>YDF#jyI*icL*3rE=)=G5Ar20IPK&SNWs4!(5YqrULJ3STt2hOQxcyd&hqN-i4IxfdykS?TOmXL~fm7YwNS4zET%Y8R` zpUq*qyOp?(hJjFZ@*~Ax36&Ay4+=sc;n%%n%%9$ohA7iQi*uDY$zdMR-L6E8!o-E~ z0M1AO@NQu%z7dH4NbUY%*;AJB$@w{=Xyq6rra9F$u|AyYr<4LYmRA?z%QL7;EWgp~ zVn)(=n~iz%N+wUf`Hy9&VbKT1V*2u+Qn+O5;?tv{*O;xHW_yRSN6YA#>lIDNdnqT_n#e51kwp_;*XpKQCgEFWd$t;+77=0GellC zAs1(Eof601BvuHfTsE>ZuTWY&t>a6%%_CSYPT2w%sVlI5Qi~jU}iC&Z{maZ1* z>js6g>biIw@)ARtRy^I zcCCOt)_&|QY=);BSJ00eKbg5(G_S|=+BrZf>_+5#qn4hY)cgzGt?~Nw*S>v?(?bj3 zR>>=UNj`cb9d7X}QRPLBTZ)?ACDf;YYZiXQ+3{9p`VE zQV!zqc2jpS7j8j$NJgDI5&rM^e>o6uLTK8TJZtu1d>#AoHXSZX-NXI2cOkFhPg@8m zOp2MDZ&5^QHb5`;8l}!GZ)v*oWa&gzPn#ekGiFGQ?CEX1_`8V!Q)p($biBHUA9F&s4A#-tOC= z7*}PRP>S49#=zs}>c~&(UUa>u!gI%%Sy^V!AxaPL(olulGWc}fH0FS$#7odNuNvQlC@hFg}NwF!AI`+B~}!4uX;M3$iqSsKhC&AC}5-}Qwr44O4%-L4Dh z+@AybKPV^|8k?j~!5x6Hg8O%uwHg-J$M(-Kpp8J+ zBe{UTHScNDWdX1?As(P^YG;+YKBBIrCIkJ;CiQDz{V{W6)LP5b=VF`r&9nf>-w9pz zXx3g_UG4g?u$DZ>%{F;8I{bLKtd3iM+^vP?Y+)O^=I~>Wx_%6=rAG6yP2Dh*|Bn5+ nc5f9=lqYivp&WBqDTgGs z?haTaDGEJdib^7Pjk|8NNZr?}S0lMwe?G6*bM19qyZ-vVuix+U`F+11jJ+^K9N=Bg z%fp>Ny?l8{0BZq1QfMgE)6bjet%GwF1B44UV7+)Zi6Q|AK{&zy02u=xm_>`I&M)Fi zM)(pU!by<`18o#S7paRxpmY%^lz`QXOMK>iqMCr!{m7pID+>7k2M{B?@nPD;Nb;wRN-Hq;;%1qDyHV|_4vtMoXk5>*teH^6`{OrC zX?d^d)sg2lbt|*ZytwTNz10EU@7i+>FOD2`r;&0qE!(0FC~IG4 zDPXil*k$Hlp)u(o-HKgfYWdK&7P%Ip?jKPzjNqr`DMAcxl2|)g&6f=4J1b)l981 zqvkUhtAt{A?Y{JEIk)8y8#mg=1gBP+`pH{^tUXgn5jVP5&>efHBoFt@9=J;2nT@X& zdmM3A4^#^as--QsT7LfIpVR6USFQA1-s|5%t;DUaB5d@Q<1`&DHIG2ma%zz04#5t{ zr483*2Rx5LR$Ul9gOg3{=#Mv*>je^Knu%;OMa}1GP<;y$jcVF7Sh);12p>iqF^&k@ zRdel4U+~gyh|cEZ7dyo)%O7dvQ;$=$T)qDaTrV2$A1-Ww~A*oeFsbeZan6a z^=4&8h5>u6vT*Y4Os_2so;&>W5b5o@)){+qv@#V$YDxVm4SR$fpGD+Q%dcH(Ce?Ms ziki4MRn?Xn_H~-pb&L$@u6?0bWH)dDZD&w0YMtBpDALcXx}*jaA*|U<^IvHflyVb) zm;K0K9&cc%P28vy9g7;unhl%?sd)D=3CesWr8T`yWwqXhIDNv+wk-1(F{5iKtxNyi z#9hMIW?O-eb9p+$ya?20Kd6nK;M$PAL-1hB@%Q_x5bq>?=QM?49)`)Fpi$%op9Ub= zjM?)cCotp~n*PWdO$mi}xTZnZQ=X^Ml?xB1HN>%3Wlm$Nc9}mj4$vDJdY5z}x3%J| ze(2uvU^pe)z0SjlS-#w}{Nnyg?^ob&2i?BYNRWN5wNu~IKd0vG83}lm9%b8r$yx1{ z3K!UR9u_WRp?+<0lHJ9m!Jz#x6h>clW!D>;w&ggqM0gt%9iJntXq=fIFM||S6<4{? zn6$}#x&GwIlGZ#2S1QA;;yryT$a8KI>*fYWtq+s z*o4TWgk#+#z#Ss?zON46+I&CRjXMyXm~8h>$LHfC2BC`rz2DRn<~2YY?caQVL-1DH z2nDzPUiPu(?H5gsNh_Jgs7bGDAre~fN6*hf!+Sp5&Uk9R#UV=R=~liSc#U@kEf5eZhfJn3yKZKNo+Ajm9ZJYtY;{N7B-WX zMz`5Lv7~<$ZBK8Y6TK#5^OUY)%=jp_VG)b=43OdeQ_uu4Ydu7TWwjVSMd1ssX zG%QN~e(ZzTp17@L)~eeX3hI?^Ou4#Sn(a67@r5xSsFd~I8x^&7*#=;Jn=Ry|BllI4 zQ!aYHG=m^fHj0NwgP!b0C$~u*x-|lot{fg@^-30;8x~hWIBOL8?mV5iO5fEnzqn}p z=u|z49b#b1`57!jpR>l;0_fB$SltsM2i&p0>&1VfsvIV3}I!OIgTdl1QSb9h&A-E%IC~* z>QY9goLo*n-fS$zDfQy&ClEMQ6DGASQyMUx(=Cnne6H2@uxsBM$+x84+elUBj!j2f zO&esNx;LJGlM8Xo_nuwK54F7J1v>TXr7yS&&!URaQ!g_bA@FBZkue1Ss& zjwSDPUN9H=9L#^Bpen#GIBRJ$zu-TF^dFNJMsz-jV_6~~=~G|_VZ6Zp-DUAl==`|; z83rC$fF1(@|K{w|qRRqdbHY1FT~r-Db$;xeOT7qo3IVBKL+Fp03**#WrXxgP<~Mx_ zB!4GZVeENc;UXYIGhK{&W30sZHiKlDNH@IsSpg9$7QlO&UXF&%5 z4)g#ZLpy-zYU6~LBD#^k1S&jH{q#(X`j3IC4J$XK%CJ}eImzwelkJLqcJv8VD|9(Z z_vrDWy|;6L0wP)yZI8Wdq@7Wi;Y_$7B6TsEQnU2<;RJEI(cb2nffO6OXkq=ICR9f6es9HV$ zV;ghpg`uR=*L?B1JSk6_C!LP(YRrDGgFU(=`RG5)T@r%rn#wFaOwtD`J?aG>=(w_= zS^2%-?u)Is?iB&9W>7BfI+X>?;0$<4(Vjs&d-8AdS9idW0lo>+)1Dx>Cvb34r4~aT zWtA8cg7P`~0^9n8bsgED8ex1dt%HJ+GcxCMyJHrfK2CGNs|Mg=>v~hSC>-%yEWV%M z@Bm22tM2AeV&sE`liAU?XTVN)BsNHMQb|{z=i)v6mPEF+ zQjH_3x=&49Pw=3ot$)%xb3cqmA+=P4J%zIW5&@x}c^8=fjx8tYVsHxhdj?@d zehwJ;B>tPC`xzFeehb2Dn4=s;L$lPaqf=Z&azbG`jGx`lEs9Hok^ zaJT-mg@0_G4nuN4qH2*1`?*ojK{4%5Oc%+H+Pz&fseUr*w)v-21kf%}gp&GwKe5~% zV*+Hv#-gPEZyz`7R~bDKXEN_x4na@-spdYV!=HJ14tcE+{~+i6kd<1?rqerjw&5xihLVuD?p&$IGcDkp4=N|KJl_Rg<_BsAK zwvqjeh%^JU?E&cAh+YP6JyoYgrl+4ZGSy#ap@!RJ;V4d7aWwwy`4rQ+yJJI`&10~W z6^qQPT3K<|wOWy;T`NR;H{y4hY+$bcm&yN*OiB`L$0?wt=>5Ax*qP{P@A`Y47~rF1 zH(hES+N(~YLPGjf$}@RIW)p1EXSw*F$Cz9|W=R-U&|NVSf!u1(Rwm>j>k9KRO0P!R z%o3HeuI|q*Zk)ZP66*4lN^B$KNe&aPJy+piKJQ`Dw=K*g} zw7PQ8)iN?yy}2RSS;R>JvYF|)oP{$78uCHc(Ur(qMupVeXoC)*twY`vyEMJ__lgI*l!6S~%gSZ3N4;6HlFIPtEtal4%_wEi$ zPzE?q21>NMg%kegvt<|%BIUG)1hT~``+PVl!aWwSeSmkOOeZfYg0Mqg!1m<`+44a zY6QC;%t*BspMPXJHdu-uzgZkx=bGh)i9#H-@5+9H*nM2mHReXJr@)?q5Ys$K^D0kq z&d36NOh&0q#DQ?}IMrIqJ|^?~`j1KkZMzUpvvjla6-Za0(g82ZYFsI4yDhUbqGYFn zxf+qCjps{b@+O|Ew`S$Fvp8^Qg{35SfES%>twbn6IkRw8+Rq? zE^}ssSGh!$DyO10i1^IHnMm&EhI@LP@<;XotL~B`)daaMcNbjEsajj>pSa}T>dnI(9!Q?*l3dB`Nn4l#^5Wj>*gA!uXD}6SWp;P zU2qYuy?()em~nnj`gy`xC5^$;Xe51^cR<+6yz|R#t@g_6r=71bu;74nUq<#6mN4&3Q^2b-iJ)rG|mENn>JvvhROlZ{#sw*`Sg^ eDO+x$nf;^VY=mY4GY0@r@Q(-8^A`vJfd2xdt!`le diff --git a/datasets/common_voice/dummy/en/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/en/6.1.0/dummy_data.zip deleted file mode 100644 index 8c7b5942542954ae8fa5c1a7f14054658e351eb5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4014 zcmeH}dpy(oAIHC#49lI6Tq05p!!$W2omL64q$4xhGQu{_%q2qT$R)QzC8=;yq>OP| zj+~+rk^3drjxKIH3Q^?r`z}3V$!X{D`}b^*&v)NF9^c3N`Te|K@Av!jvb7X|hya|= zi+rEGpKd;1L;)qh)sI3s;^Io66LjtO2?GL|>ez#v#vFWg860oT{l*MlSU;Rpne(X+CNT(79bV{{UA9w2@Rz1-`|fV+kQty#pq zg@wfsX+8ju;|Bme-T`zP!Go$p_x1mjsN_ua!FOrsuM<`G#GG2C3KMKVnhI*kcdFU- zuPA82i%TewrE(xy$A{?DKG&As%g*mD`%bxJ6sH)u?%!Bq!QPo9h3RQ~mT%N4+aCNL z?;(TSP3V-lA86R>q+UmJim#P)Ap*1uvTvYxi0A{K6T&R<@Y8JgCARAJSI+S1Wc77p z^879ss5-;eyL=~2e$%?-V?T{HvUaV(uQVB689Q=xt$5PWGZ8{U`+*Rc(!T4pd!GdO zrkK|p>s6%p>(+XR@xLzEbhTRdqWk>AHP1WOCgX+H- z#f9o-SorR-D8=U%=mcFJZz!=Y8d`fKV_P}JuHg5JkDoa0a2(Ne4>TH^?6BGVH!JK@ z?JOi8d#nnM1e3oTOui29$#*6Be>e5X2x_Gs4CDS*W_y~BIpIJWrRS1wm+7VLHF8AD zsCAp!?0lI6L6J1ocQH1{66u!x4m#mgQ6xE*NJ7GW1|DtqY}5LlKeMZ1?pGxTt{Qam zi(O~zbZd{bSE^06SG;Sphj%b?n87}Bc=oPBu4&@a_G_gICcmCzp3Zxd&Ya3@EF62p zwCX4<>6M6YJH>V!c~m1-aQcGVqr6mSHL-Zboc&hamZK&}`Csx*m-~3&n?ri~-`#t& zHomQ>ahTMTf(q$>>mb?7$aw{m(Wl}>7{^h3Gxvh6wC}A+>X)m5k=z2UlM9V<&LB$24tu-XEZZtc)YoFijzyV)AOLo32YWxXrZpXvT=A(4d>BUYb8uiH`? z#p+k+R_ab2P{@;}S^w7WRv(u5T7c8o^WyBvB<{WVU~82wD9&AQ8S*ZUKY{E4ju4XT z7l}BX@r~{vC0kQ*2-xxV8tn-3ZBY$uPBrC5gMhpg=BapySrl8-I5efSj8>_T1VO|6 z@Z_+JG|3z6pM|GA$Uliz+{~RGAWZ8eB+gwVyNkMao#@$Sci}w2lv!YOv%ZUIgV2n& z#VHEf^NC;*3{#7*or6J|A|Oq;#$Ho3SsO)!afHcLRNRrN7SpueD0TnCof`;^)z78E zQ6*2c<80UZY30V=d#OCxQ97_X0Bh12x;8nzX~h4R*W(jJv$nUaF5mO)(hS|i_A^O8 ziWHN@`zA4raDsJ5p&s#pV?JH3u6)%qtmmk)x^(>Y1Gl%?>txD$oo_}#nTVuLAKxK{ zIYH~xJ|(i=+ubETQ0fkNW@`PX%m1Ovm1El)9l(YW_;<&NH~lb)_I+XQCagw4w?65; zSvb{%b0tTz2b5=JB_E&^kOtCqR*EmiM##Y%p%AEsay6yt1=?~{V z1e?+~Q$ln;Zo6}pHP}Qq*rdOujv{+!=T>`v4!;_h=%^-&@Vh>`Aw@3f$J174 zA+4;I^5?LOtQ~yU5GHv|*}M50lr|is)Uo7)BgA?IqFqz>uh)DGc|?F{{lQn9qIVQg zR6jA}93W*nLR*m~F=W^m`ToQE0srC?V$+T)Q%6EN2l8~guDIn6OnUr*YR(O{ie!#_ zco{UhW7Py!zC0ico|2Kb)4%Z$9fF8v$|nUyN2{_@o%LOz1Hj6w8ZkQW_Fq%gJ}9#F z6hr9M){@!=M+6pYZF@GUGj)@*=w%VhNW;e;6H`}|&pwzbGSKyR+|*@#zUN}i39A3a zgq|B?!z4RAd5ul^-;}b6sgX%*ljrf%LfoA-UXXFp2vo2cTt>WimIrm|;9I7_@QIex z)d*Nr`?&T?AY2Wb66NUDX(um(c^dh+a+l`HpxGi!S&VA5`0G^_E^V0UiPDp=pHE1l ze{aAW9z}bbCEA(YL*1DQ)9yod`>D(*J5*^^20?EQ=gq!+fdL?*J~alO{FLtV`t~u! z2Q12E&LV;i+nYzbV@nAs!GAS*dX}rZ71u*sldk7qsYc=y-HbI8vZZ|sZrqT&PxBfU zUO)JQYBhuMP^B9)d1bbszMXxpGS{H1Xap)eM|_B}dvJN7=oPiZFWVT;0*|2!@mku#mc2(bkeLroyKoMPaZ?4>gI^ zk6$=5s5>@-)uInj+wQre%QsFOV8XG=Pa;Q*tl(`uho0Ob#mt4 zV(W_JU8T<&d3bYsH;zzCOfDb4vcPg)S^hI$0$`hSZn>PZ;JkDG7XdYZg@AMB818`o z;G(}zTI!(}NNn?L9!Z}ZH3;K4>Thmq!P0U2>3JB~NkC3G1pHZ)r)9SV!WOw}kh-io z3si0+xtJOb4q_guUu@;~nM?iAVx}^LXXaN!10;XLDcM?bYH?w=+=u^Sa+x5{vXpg>ZDx|P)EQ)dZBw>vF*9WyV=O5qNeK}VCn@`qB^6Q0 zeuR@XvNRQQq7ufTqEb2UGwRAra?JVT{qLQ*o@bsv=DI)6@BZG)*UF3y#tqOvFG`4* zrH5}f9$*WA3-a|n>WXs%en{Q(W1>0}+s$D}fxX@G+(tLv9QH{NqCyp9 zyK<|ZxpXX4TPK`INTypm|?17TY92aWl2hf=_Z!cv>8Tvc)@l_DE_efn=>v>6zOd?IZ-d8ed zKGJPP_EPsbt!IIDbzAJy7@?QcvL;Hgr$c!_O+-LVR9V*qhYwy;k)BT9-x0j*)a}}* z*C|53m`O^%CmihA_M*5lzadyKOFJlT+J>E0J2?CS5Dby(!`zX?ZoNaD9dm+`t6wJmYrq3qH1?i5^OCGE~3KOmQ!w= z;f`^yO_sI>Q@#LR>a3?c*v-cioELc9_r%lN;3uts?=dHg(i2~KjQdJfmh8k|+bZ}J zV>?1xs!k4hs%;NQmn`L`I3EcEq#6=xRLe7e%TE`-Qs0S;oKk-2f3GHf-}=7RPm8;I zZ8~h`RgdU*1oV|gdZs+CCTy=mE8a!pT6HmM4y4{x_4mVGjhrX)MMYGiP3sJ=9zQfy zHhs%^n)&yd+*)-Q8=2X}NYI@`pwz)V$udPS#u!c^RuTHnKy(&so z)sFx1-9eg2M{E84#q3T=g=epC+@kp2EPs+R74-Di#RR+2`0cMghXpAg`Q4Jbye$$+?bGE}l|>RrAamb8Zlp z$ewGicZxWf*dt18} zI>#x!aZ%ySZOY%ou(4!-zmoQ$!S4QCNq=o!QBwcM`soWb#V5^5y_;(PT;O=Mles@K zBTs3{fEkttH;M-9{ozSiIp9`llg`9O>t+P6L3aVSXv5U5Dy%k{*xi3se+io@;7k1oHBYn1ylKHkPJuA%NxLlmFp-c2x zpt%}e5Qp$Lj}NHiH@j<#>&qK*IzN3Dad@^>$n3K>)YxW zeB+YwVsM{3SeN9RAfA5A?u)L<3Y!XnO0uoyb~k;)3jj1-`XiI~W&i4Z|3yGqU^(Cy zT?6KT|KNkyCarYA%Osj^E{mikPYlB7p7^K7T6SN|j`%AKY#ksk0tWstrrj(OEELQiq93}sk6tWlXqFC%q=lV$3PLHtkbuPwiF#(ZUil?;|L8T=EA ZMfKyvSq;s;jvksD{1U+GnOb^_{{m()ev1GA diff --git a/datasets/common_voice/dummy/es/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/es/6.1.0/dummy_data.zip deleted file mode 100644 index 8a396c6648cad9c611653cfed50631647b84f5c7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4007 zcmeH}c|6qlAIHZGatv}Gi!~@mjxnOeBv*|XL#V8K%qWJLa)yK=7G)!Hj;v`$MB~a* z$&sVoT(h9LjC|#6(-${>{WX=Bb`)g+&^PTTJ9`krVzn}N(eY|WeA?!RL#%to* zaobNfpD(;1X%Ie$NDOtuV*|082kp5*kX(fW?u<`R2rr0@eFg#ou|EecHX8;}WDbH0 zR>vLlKOPjIuB)M?j({WJ>RJeOEiD$K=TkhG9#M(K=$-*90D(NijUHbFI5iijFWU_kJvL?!1My1%_-_;sS%lkXFo;83$p3(+EX*!{)V z7fkix$_XQ)a!#%Tn5X;M^oZOfwHNaD^je3Hyk-00=M3TYp$qa(?zhTrNMCa8yHc;Fp8 z^dM-t0!2mwYfwP`yJT1GR`3~9;?-d&3mkVXKIMuT-}@See& z3u8=zz@U5k7et)%1hpkN>6D`E5iGu5(H?#dagmn(R3)q|MRkdE>*S-wOYg^R5SmLl z6bJ7;i6vWH!HLuQk(K0Kf^pZm{Tcr-H$;iw_3BqjGu3LC%qS zHNYl_DVrU*^BA`Cr~+T`oThu}?CQk&xGOs>K*?-4v?2Wig zlXB{x#~#X`=n4C^=zS(c-D;ts?qGIApKAiLYFQ!_>Gq=PsH1G}g9@ZrQ77G=hZsp% z{?x}S**4Ghu7$$GYfvA*8G#PGy_$vLADzR?WCKdh1-%0aix>FnN% zfrHy7`EF(_AO~9TwzVMiiAUhs`!?rq0O;3nq^@hu%6;(oN`jH+vUUp`+x_HtPw z)6ngO^kgsIv9Hq1_XXla)Yts(;`^^&SeewI|-!Yrsl z(%qEi9JYAZb5M_Vsgq+$@Al80#Ad4WVUv9CE;d>Sr-IAw>)w81ake?!s!j>hwA~^( zI2KlET{RxXtJ7bp`wsrvSY^nYxcG-4Oi!rMV&RIx4ijiM!QQ!YbkN+VxC@z7%iogZ z8b(1iL;O?EB#xxm$(|pNb?QyI=Qq=Lbx9*cNCZ7e3C=p<_^RcXX=&b2mzTSrOMB%Z zxF(A)V95Pe<#g+x#<`JcW2ZDLx2kdNpg}|GazwYtSMztB&h(SDdr zop)~gPcEBU@V2S@#O{hQDYd)T<9Anh=iyk(Gu%G0Pw1_v)51+PzT&E{T+JG&kNLMP z9cl?1?l|}HVAC$<{>soTR^tLxQ3O=cWxc<=N$UsZ25r{fX-R65gL3|*vztRn;_P9$ z6H_wsI5cNGD)mC)U{AXJZliP;Ny?*3d)nd&Zg%bl3rYqkQ_*;rr9}c6BQ}WoaH?^i zFGXs~J?e)PwOui29AO9Llr~l@U}2n!U~eRtU3->ao0yOvm>bw-ayNf(SNA}(Bn%vi zqbRa#?~hV$ydAYqP@HIUO!1B$_ib!j7C{UJCY6&Q3Yf|(PY}F``E|J zLS)@NW<@!Op9^2W2{i;SQ-f6Me6+4a0tN|*hnSynoKR z_c3PE#z_4-cgqHrg=}P!B|h`Ff@X{vw@j9qW5qK6i-3xt)qo$2_c90k2m8D^X}x(~ zCCwO9SR{S2(g2KMrN4QsRky}$q*q`-e}Jx>W(U63KDhjIdi>JTFc~PXPNoc7y!uM@Ju$Aj9OehE%Q=eOCIK6nY_BNoQ%*s Oz?%eAj{+~_*Z%-(SclI5 diff --git a/datasets/common_voice/dummy/et/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/et/6.1.0/dummy_data.zip deleted file mode 100644 index 1e24b21b907d10b87b9cb3928e994045e5a33cd4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4524 zcmeH~c{tQ-8^_0*t&C-w5(jaNEsP?O@RDVw#AJ)gGRDqe2Ibgy$2L>6$i57TL$*`O znjzT^NA_iGk&>iR4BnaQ3YF8Gf8Xn!x#l;&nLqB&?|HuWeLwft0M5Y32B5yC@3yL$LFQ2ng#GQ%LZ{z z*IfAw9ROfQ4*ch1Y-)>YpUd3%#4jAFQ!h1ZpKC&^l zpe~4AL?W&yrprBN`uBuz-6GjUslq$hDl>5r*A?uBm0i%sntGDN(=wlz%wK9CFypQ=U&d@$aef zCVjS{Zn-N0`ZkS*=a6}O!Z8qG-OMJvx22&M&)OGeUyROtp_i4aZ)XsCM+6H|#c|HM4@HJ^*Q+@pE*yxep#wwEz8cmpHggd>9|3J)Rga;wW9ON3c z-lUN12F%3j%oT9In_n={&(H36RueV8+i1lJjgI`%jlB;x8rJE;H~a#}9XF<(Go+N( z+2DWF)y}{r&8$xy+!7T**tTZPG#}+>?4KJ zi3w!W2G9TqQx_M!tlEND#FbcHxb9XfEQ!p znIUbqB2f-)vn#QNaMX-Y3N9msbyWH2xI=cdQz@x2UknLiDQ?KY@|X9@H(JeDD-t4a z%C;hYS-8^7UfF7uRAN5RhcbQ%o-$bmu@}!k#4gICv6}09$U0~bVDWAtgO)^G@yyyo z)ARWU(eB~)WO8BR1GU6rH#rYX*H?2^r3>j0_mFCxL_(#&d6}o}HG;(Te+EA-T(j5I=h#?(d_?c7uL# z0q2`DZy`1)zQ!{kxGAA-nhk=*hfn$`MZ$85ghyzX(A(O@k$DSv;OuHGA z``(&Dvb3iV7KOon&7K{)_y4o!e?EKMhfkjiq%@%Re|Pw!u?{HDpQTSbPBnf=4jAtI zj28nyYUnBcdJu&`t-8i>9lK&r6x0;00KWnG(O-}@D%CA`OLuGe@zI3PP9*}a*o7#t zbUMdLd>{98z=#b&)P>BF&TY?qmo=&dD^zJMqO4Fe(yy zJWss4OZz-u#-`-VgKWmk5HMDr>V)Phms!}y!6zE95e3=x(deTO@vTHu29SmOT-fYB zZr@Y5maY&wfjr-{VAXmAci}2Bur1|&Rnp=1@}xw0JN=vBQ~PU1`bqqM$Kc*gC9s7^ zNe2dTB@#FEavg`(!AX4$Bu*eN$v<$fX(eo&FEwm6H;6mS_Es#! zgz|>pB&61Ey*;zNP3Q?qCs6S*`Zss<1{=+gn(Mqp+U%77^o{< z_PFB#KdBE8x^AYM&qzKmZu*$j@2*hufw8(mf^}6)zzMEb@3eyj3>FHp!4GtLwI=ug?F{fDZtgooV_#msmTZNGyp7I;A87hI?S(~&fC0p z3B;pj&hCHPLekY}biLbCZ#F?ym@BgU*{Xt(l3)+g{E}UWUN5J~dt(o?#rR3^v+kI} zXot|eJK$HBWEdgp#TJgP?5{WcqTrc~ohlmx9Vwh7SuHSsA?PDRH!5h5eyn7#*?6k# z8=mXefSoadZ}eH=wQ5|~wbIY6F;M0FY+R^af2(m(pk2-{VmhL?PwrGjzq2@W^uI9d zgS~I9n>aiUL%`E(kDMffl>X` z50|w$ly7%apJ9};1LOoVQogqA(~ircz_xfR3UxvpEBO&K&E(Ai=;zC&R^0ZJax0XMm1gLcbCWje z_O`v1I!CEZ8ioC?iT@LOrylc}P03_RD&Oaznf-0Y*$K@=Q4RoLqrBZH*(1K2`s=@m C%$?}~ diff --git a/datasets/common_voice/dummy/eu/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/eu/6.1.0/dummy_data.zip deleted file mode 100644 index f04c5f3ced9218f62754ce6b851b3a5808ec6c20..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4174 zcmeH}c|25mAIA@4%bx7JB5{o|wwf%FAzR3jZrS&BraOirlwHZbKDw5yVM=xtw`^Hc z$`Y<6GGfSpAl}XU<>W_xXK4zwhVwG0=ff&;rD3qUfUG=a;V! zI)E3j^K^IjwzRXxTEk3?_W_XfLuagrx94R#fQ(`m0ss{6!H2<~MTB=3vGs!5qA?de zJ)rWEaHzDDv=kIB4TZx=te%x^v+EN@Nvv+~`31m~sl3rZ5 zuf2|UE1;oU?yd3KI9y)kBgNQ~%AB}5a7sjLYCl=XBKv_rez7BQdWiYf`0`2sOfo;^7Aw2BJ{(5Gfqte1?b2OXQ^RkW)w* zz7%!C&!Angcs`M?JifVX!F5O`KZJbNyh9{Q$sW$MjVzYgofwAL zJfILo8Q&Wl=0=9zscJVBuck}>YaE@!W}}X62~6P;TWKuMcE5Yd)iR;Vr#El)fY1G^ zc#-O^)VlM81c#xLFfMOtM(!y67OtUNda-9K$G!h3Z0Jxza8jLaxVc}ZJX9btB5N>$ZAk3oJOxXw$lSL9_TZLzbz2fRl|1_s?5wG&JyzW?5d$mYf!x& z%WCBN^22drkF*!An~5c#;!ztOSSnE@pT#uE$a`J6*NLha-oo5wq6gFF% zn6{PCOt&$7*sgik3RH#>R7RF`WxTB2oI&TXxBI3KaUgy&6z(x~Q9MHQq0eWGIu77~ zpt^kRE9VPLHfUnWWcWn5vRZY2o0#hBudKq9Ufq)aWCn}CZ8$sQpKj=T^dWo2@zrVm zk>>C7B1Z@#F^=#u+-doSio#cB8WRyteZD;(nvRzmBek;zy5iGo~eR= z9T>qN)YPU&V;maDu3noMpm@9;NIS26+kuZ&?D)G%t-5m`mcl>a9EAo;)k>aGh37>P zJQe%Q3;%q3>;Wz^y`Hd4g*WG)v>g)gO-J4O`_gdqRLM*jE;V{FAz4Acbg9uLe#t;f z-nqGG;VG9NCWV7eEZ*jJ!_1?^90kk^r5rkHnp_oim{8CAKVeC6^@5ESwoe1i?yL>N z^EIJr1^pA<9%W8ER`XiXrY@LVd8N?%u0ryTLQYlgd{d8OBmuvZ{sM$6PBmeqtBJMuzj8&*Lj#)c(UA(^U~ z!ZZfAD+~fIJnOX|!_%LR)P_=YiZVePX&7n~=M4`AEK6p?p}{c-)*J9X-58boF28vd zh8lW{vLDp#*$jjEOr`s;DO1lyyj!byH}S66gSUpjHtE`D%W=^n80R`YqH_Y>M%%Y8 zGVt&Y-=rE2g?^x}ApyC1ZHs*OcrGh2u{eXON`R`$lRlo#s2}ZiuPJ?@6Zf|uJ=Lh3 z2d$-`ks<~Ah8u=9nK>)Wc@W!=br8>IO4k;@g}a;(yykCFG~-wByV9M<1s=S1j%S`X zFP3>COi%(F8PFl*kll045gYzQ7kAyFoA(CgqhMvRTo8nsSV`EJU?PowFQ?8yAlL~m zNNXLy!V;fu-Fajnnr=np^;-$DY~+bYX1vr*`nR&fQVj`zGaWjnU9X@&0;CY{5jvI^7x%QP?lhn&`VbqqQ9A z!hV(28NEAO61Dvygp-l;L-xw@oL~9Oz^p3qbyu>d{F3bdML}U;yWpm(`tE}Nkn(>` z`Z42glV(-3NhEzv{2+{&_r?cljP?|*@T0|?{s0VkhC>bLOwYvzwZZYPtCf@J1*J_C|}66p+dhi+_^6TXi zCvNuvxf6<`CKKv*DXEC1 PX~7=~+&yg|0090AFxU5D diff --git a/datasets/common_voice/dummy/fa/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/fa/6.1.0/dummy_data.zip deleted file mode 100644 index e77110ad47766ca8d889731ebd7c3b369c4012da..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4186 zcmeH~c|4SBAI8Tn6p6|boudUA%wR}Pk=>YSOtQ-|rWz;9%rHWcEgD;dH)0TBBuADa zk|iODvM*W2TAew1(aT5VeZ(gtInDXw{qLRm+|N9J{l52oU-$L9uR+1t*m*!JpRs%H z=&vtp4_=Ts$jQ^i#oNxw!NWn}jOjKI8%4^*er0%G;svp=)7d~E_P>A!YRe)AHWtB< zz!*1zyC)G0lZSwzicm!`1PX>gn5=#$!*RnD4>MW)+}iZ{(~fU7eIa1g+@Ov-j%d4+ zR~?O6Kp+QJ5J-`EfCs?==PK_(Bz@he7|L<``Gn)&Z&dRQ1hQ69K(A|Ds8Z9!@$ds; zQp1T6L8_sw=%lDlubfXb=p@-_XV~_Mf+p;(R@a`Q2bB-Tixw%Cai~YSxJGn|xl;&R zJ7+_4VX2Fi#m>IG4@6Vqeqs*TW*3&vHC7kX&RU@IhkJ#6zYeIM#XPA@*~li(mfcv> zal`;#R~pzJAaL?f;6WL<_=MT5V0djXyqFUlSNO&#hON|?CNhSX*FAS8mzZLmf`vr~ zpQeUeK8I4An=upU{Oh>f+{8+2QR!q^TJl|J>~x_Ofu_GKvM^=ay)f*nU;D&Al~r#! z?2%X7P<_GO^z1KEv%}T=4nbbWd@l3Y&ez;+E{jm`EPai@Pw}VE)#7J;)o;J1j@xPL z`+Ivq8v zYS-VMy#~QHQR0^dDq_a3_*rT9MZYJ%wDH%MKxuvm(0HYH%9fu$^Zht|sta+|IPLxr zMQsXVn3dvny6V1R(Db2J@{wIa7RfXAiJV;dcaUeI#OU#SFBPS9-#L9mMNvI$jw;6| zXx$!41jEg)*@;U{eXRPfB92-V=jtaxZsR+t-J?9`-NTVYsvF_}wwU|JtAJe_ zt+3j_Pw{C)D6Qhuk(zA>m&D7v8nPSZ4BSTt16$1n*mfTut+DGK5BpWDenz74**;`v znGCGuqF;`UaAA`%G^S-fKx4P=KrKh+^d7R?N`qL{%7EgCrc{V=n*v(>2@GZCwIVrO z!~v}c>-0^r6^H%6W!M(Ds~CijbEt&Cle&LDwd;ppOCLtr)ih-zWzYwp<2UB&z zEVE3=ES{!PI&QNG&T!{g^8Ud1yt2cej%r_*Mp=sscpR6QNF988&n5ICo*aj1d{j^5 zYswy*Irqfw#QYVDL4}rqQorR=Z|~`$v2)&+y{#(q=K@$cPiWU{cM|1rFYe8{w(llF z*jw-g3ND)jF+cel_C{Jo__+hVG%;5v0iE~&N4cEGs_xeiM?{9Gp)B5SBUpO?Izyo-4KsbO@J=aufoo8T_H-1FIagBq91Jb4#O7Qb^Fqk;;|E9ikG zdVCXaix&BR@&4E1Rnu7ZS_9Vm1dM-h!MS-{z!H9@p37@S>!s;LN@PQ%$sU{^FGV=T zCRZVMS86BF4{?jmDk^=#=Hhrf+9LN0E^zn>PWc`+?yF3aLU@g*3lr=i(e?B;wZ9&%WVKwxieE1t8Trw_2w{s?{m+7 z3ie)cNl)F4-}-mxD>vc3BtLDQYQhyIPoy?|_yqOe`+JqK*V0nEYWj-;pNQD9#t5gN z$4N#nE4PQ=s`4?C$?!0?z(}_{a78ungf>79Iyhib^JOc@!ogu^{3%p{Hh9N$AGGwM~vRNL`CYKR(UQX z9n@~`vQspu%seum_g#?bkSzBZO-w}*N~jS1MrKsPD%{i740h>^Grc1C@$`$pXEvCe z-_)dthtu{&_RhX#vHt`AqT!hxA}&onpC%Vhe1AdGT;xfe@V?3TPnnd?`iau8 zCb^P_vxxE{1IhC-XJZ#%?V`|<=Qs|2j(9vuf2FO|`#RuuctrMJG_-*U_u0r@%P^5E z$V2InaeRGZ>mIsNuU*x=+I^+o0T5kl<#j{K=U7eo|DfOj(0aioE!~X; z|0ePOoU}RbuaoFn8BCJCrhfpolKy|Vto0PRG5fE=0F!|-1K5EdhJD&{SpbaTp8!DJ zQk`|`#_Y;SeGIspN$R&G`*Y^z493Vj$IdkKyIum2e~`vd@RhT;zS@o1fsriD$uxOA zN&I}dD~{WE=P^RZxR{1A+%jDrAW9mJwPZCRw>d# z5hCmYL0QA4XsDqqD9hpkYmkc+l_qV;xfAdh5LvSS-^`gg^GDA6o9}tw@A)1}b1n!k z!2G=~^S9o3`S{=i4g+q%K0YCiZq7l@8kcN%0InREl@s$0zQG4@KxViA0P+@mSngUx zVS5o*teUG|fPZkHnzlMZ?F9S;Tn%wT4S`^@dVYb+wog3HW_5SZO@Lso-fo{S2CkYL zc%JWK{dzLNS&9Py6gdF^&ORV0z}eGRJtz>nAyi6^p3`!=-seKmk~@5w;ry}PlVbPe zde*K@Lb4lk-IT?>%?(tGCnyDOX{xqG6E#a{6Gj+LcHaJ{#L}*5PI~r?LM6tn1yi?( z`zCYk+?v3j3DXHaRF!{cm8Ox9&9^m!hfV4w4bPk5ER?>Fu{HH7fC|e?IvDZ{8pd(z zt|b4VipV!=4^_L3FEz=q`Lzjx%*di-RP`Qdw7vYWKxc9A`DA+4! z5eRM~wR-aAxi)@O4BRby;WU95hc0=M|Ihg(e)m*AIG`&8x`iiP$8+#HWcQJd6?j{tYAnf{^GQ;d%P~u z@Tj?7I4oxVtOa2{MTiHs5MmZ&2-i8?lN2dDX-otx1cC|Q&E0GM6>LRQx zoUHOKvaUtCkXxQ^=h8Tpc$L3QrA2D(Y8h3lA4}}7U;U~p_4I`i>$dy-ek|=xa$Kz8 zUvq>Bg4#nt?KRog9_#Gw3Hk)a?NbMsgEf6B`9BCmJjhbFRb$qkpa6DY5O#F$ld#8GrS zssv&$^~1=S?;pyXTA}as2Btb`Cl60waAmrgvU0a`6 zAHLvqCqH~ zqGMeeAXB2EYqd`%afITde2tzHP{zF2oo6e zN%!5X=l{CzuDhwhQZxh{`UUX*2cGX2Fr*ix;;J_>Y*60)w)~?jjKPoKe zE#j@qalC5%5p&?9uBXZz`AUJ3GCUEe^vb?vq752+j41Hvw8ENp#Q5!)<-ckeWVrlY z|HWm4hR0-6Yix2Q>VmoSeIObbG}SyW{Wgk&Zl|B;+(W^T%o<4t%BxZZ?Kv-ZDr3Ym zDB`v%vILn&1f=VsQ11w1EhCuy%r-nM(q0UPuB5wo1}qk#Ebb{hoskcgVY?NO&BWAoMgl**3fUb6pHPRppG0n z|3+86^a@e)!6Mp|a({&8idz!#6Tq)?Fg5}L%ieA4@DIKNm7WG~ZT6?!(|4!I>@w^= zDW<*-2>#fCcBNq+ezSZiI&Vm(Ay={8Y7PoZmmbrrzolVIocWnAiZ1vbhMB3u2Ud)? z)1SX|&;O<5v19jtlJaye;=9)!TP3t4{wRx(Ja{+F9T|N&`$30xpM=fxSBKZE&HN8# z3E*_Ou2Fk(F$TAJvx>S!$}atyTa}|*ZAcunyD}R&IB+aAl{A6#_Ej&HpdEE87drV{ zssBNfs6xf8vm9qN1H6Z3DZUcO+6Zh?K$Cv)%p^?^OB-HB7aSOv2BG z>B=5=O*~1jxtX9BQtPMPR~O>BPZ@HVXj$$-A=;!JG}lU3EH!q7X%AG6-$V6{7Y^oY z|6mhcBz`MvjMvDhRm9y*(Jk|8tfV4OL3c8|jIaIZ^c85jX9@e%}iE)owqbw~{0Gu}x;3*LyFQx#PAUPg|kOP`0644lDMk+iiL) z)t#Gd>JHQXJND+5-{~=%Y;cmnQ9cU(iOr$<+;Mh8?*l~y0ABF&1*eCw@fH6Atq)s& diff --git a/datasets/common_voice/dummy/fr/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/fr/6.1.0/dummy_data.zip deleted file mode 100644 index c8f82000515c785b28536e3aa24b65d126d53687..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4085 zcmeH}dpuNmAIHy(k2h~(NMX_#Tn(i-KG#rKZz80+|8NltY%})5Ab9t9Cli~{R0F52yC1O0ALf~1HWbwMb09w zG^DE!<&?i45~GSns-x6VNVGZvcaR;o(MMwi2boGhk$&h}Q9TIM|+?b!S_L0THG??SrPUO4v<(jn$hV{*%`0tDe4lUJ=ktkEGiK3i^=YzuAsI8Gz*hwDNv zNmom^v|o>Bkg0X{%^Im4-OaC#ybGz<_DWBdJNaf9h~QIxJ`(lxibvR!UYNXR6UxUS zr2M|T`3#c*?{tt&;c>4k1t;pnzMlt+Oq zdSxmr1fzW*Ka9fnD2&y2VF-sE>%OjbY4k9Lsb}~65VmFk;oD%%+No6B|4wNC`gY`xR1k5J8es;61$t|DL9bzq!c$C|4ejmix*DJ-^|#e{>sUl z0~P#rN5&1kbABE6^UH1sa?JjoJlk4vF?j16*dW_x1hJP?P!(M?h~6>B!wQ^yYb2do zC5nq7@XgSA?Di9xG#qTcg=`mk_vOU^(J-wP7cyKUF~32*Q#>aVW99~xpji;J82W-vc?Cuet(`yF<@qBV>u<*>_>?6b*Ak%Y8%jkdpERX(3A%J zUNr7Km`1&GYtW;1Ose5!@YMi@tjt^!Mkew8>+unh&D!_gs?SnB`CpywxtTE7x;2%c zTd`3gMZ)HLb|#hTQR=_V^+!i}JN`@8JqevjI~` z4j=aN2=^p4qkD=A=HHi0 z^@r)=gB&(ivxj4Z6@v?kgu-Fvx)Tlk4=h|Ok4~H&Gbm{|-F`a2V(#g0GrgQ{Tq(2~ z_d-+M{it6xUSl%;U*rD~jjamRZwi8DhW)eS&xd-_owA)acHIw#1qh_0x;ns}XTD2&K4E|=LtMWCdSG1*`Rb&L=XU{x)iRy%Lz*@8PS9~sp zW~7O1&z!!?*J-hZ=!La*xa^eqT$6Pn#p8sSV@H<$^(zs+H)uzn+C^(#VnW77>MQU9 zS6IFwQTbMv;0x|}VA(+^C{DD1$8w`TZo>a&d7ks>>_;zCWFcF!41LYYqX z>^*TO%-c)0%d)=9@ozX#F_FZIQ2!Ih%QhJLm=NqTgse9$M@c)1=kw5zcQ8&@ZYP%OU2YAZV|T70bDhhIrVX*Zn! ziPcda`Vl31%R=Cuko0Cr9U3F2a!fB!iqD-9eoXnXZM3W8&n4(GIeWNj*z~-Z!IZH> z)`kRCdsEe&ll1t_7}>WLQd7^ExvUW@ya%*aC`tcK`l!D6$76>8=-8cSU8gl#QdiJ3t!Ki(TZ0{5#LKfbjfy`xQo0B(h3SE>KOjy=p2_PL8v4Wxmst+4i|KD=0@mbS zhc7A+eK;fFq71scWjUOy$g< zE2(tQs9aLNrqT78tK-y4CI-ee^P8~%k{83(Iz8q*-&c}lHgHW|KJC|E?xN#351f@y z9yr&~6~_;ERL*(3k{ZO%HFeeW{X6!O7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7Llf>RqFcr zK;K2z`(}Dz+g<09VJDHnR$98#bw~&nmErc;pa3v z^x!(%@9oE;DBy6tf{Sl~u$?cn`-~$yN^dqU$~b9qAyeY)owzqUK5$%H75D0Qf_KEP zNlZ6m|10D&h&dG9pML(<#)$#@PvyS~Q_-H*z+T9(cyf;Downqxv&{3?%(C5aLVf+k zC~=iyx!eFZuj9ND1;zz@Zjq~91bo~U);zH)NbqA5Hmxvw_+-oV8WY9Gfm|<_K3vf| z&Fzd+%gtY$TX;6}J#GpOlzjI^L!oh2@!v{wM!k<8KYiQ9VHSOQ$*w1{ld_lpy!T;V zX4aPFSD2Q4^QqR=mQxSV*|M|mI45Vhj<^zsvhumsC-%|jmOuU7oHl)1!KyoadrG4W zE{0^-Y?wB|PxYVsOKDJoK?L_>2g`?|z~Gh#2Dc$e!JU#?Mli4^8s;4~;Big=DzeLg z*?XE!<@<<|myA4JON5lC$=oRFjaz>G#fE$Q?6^S z7PyJs&iyfidwtV0+y7n`n=Ojf;#d@}`5K<(5WarJGW55IC;Q#Z%ZVG7tTa@+#`z%X zac#a-7@LL3iVHXSm|WgFl;d*;ca6}57 zW~%gg`69sYOhx4BzuNylHp(Ny-*aA4P+k0D(_~=y^8>@*oTTtCOU%gxmJX>Y@K^vP zcPvRhcY<%9h#^nw^_jsgn-hsko-SNzgDS+&3SLNb#O+typpKN>pC&hjj!pHmTiG2m*z4%1q)-I)dH?j zYc__wn6N@z{ohCC>opa3%1ieBxOAyrQ#Rd!9$3&f<= zNcklhsYQh1^Xx=hP$Ah0EF_(#`SGUw{msGH^=>ZDtR+IM+j5R%HOH*Y)(=yCaYc9v z_s@>+40Q^AGreVvIz?Z3=J*ykipjtJuruTyi|+ML)`un@oyh&sl_9CPE7Nhtj-`FY zll@;Tcy74J@KnQF-lGR{s*AqAJ6yZk+2h!ei5hO^Sr^n=?`3sxHec0~O6lj?)v>0t zW9#K>Ro4|aFfX@h-PY5nWT=tNTTnc?@y5X!EP9{!+&O>tg!8Vwd-n20Rj$vy^1IUX zQNYu(xDO$z%@cN>_!zd``1IOmo^|zC_r7QmS}Pa%(Z8|j)45&j7vAkXkv>iH^HMXO zJbkTCRg-s~n)-0lNf!RYoyH&QzO%n)N6kXtWtf#2fZ_cch%HIVLYaA(1tX4}bhiI& zE-2*BzY&RA(70CKTW(>-MNweD%b0IVIa1V|6R-MWq4JdJ$G)nJnWp00%+x_Oi&uMkhb6C_ob;2DMS#f%+#VpD5&EBg2WW#jB35LCnx25^wj(mF~ zck8G7s=fWkS|(bum}gy4`*kmiXX2i#dJ>cRq;i-$)6y>bMc>{zEkI`1vgG)~4=gt% zB(HT`c{#^mi-p^QJ?nmceCK0yN3jYrzMN$hrHUt@diVuJWpA_jKVc z3I87aL25dJN-gB}rv*tt4h`_keALz#DAX|;tBg!?%(&XT5>O`yFuZjHF;QB;tdJHk z2?nY#APoHL=!$9}p4Kp7n=sqL2%FwIE+*0@Sj!k_AE;%F)n|x?8fv>3VjnPbFf3^V zl}$)Nh0|vc`;c16K#Ot45yE2B79z66WxzHI@fO3|i-fJkY#1S1`wiIZA>LZF(g0{V zmL^hwHz-{q{ES+IBU^rol@!Yn^*LcrgYqD1RfBBrOm3kcg-7#Kc*9RdK8$%*Cw diff --git a/datasets/common_voice/dummy/ga-IE/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ga-IE/6.1.0/dummy_data.zip deleted file mode 100644 index 6c207721ef8b523d1f70931a55c4249cb2b4c4a4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4096 zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7Llf>C+d2- zLhRsQV28^GB0!IGFkD@f6m%$brmzGf1A{UX1A{S1c9aw)X6EUY6qkX6YvMe+grC#w z(1Yvj1j9TQMIP7eBCajIt6Z3Qc&1D=bM5ljdGM5ku>#}fTQkqjQ)fTHdgRAhwS+wCngZ}{k5=wwV%kBk)H`nq!G zw+i)Ik-QmeHhF2wACs87>BZD}TQ&%ovL!DOF-wriyzr#O{fm@)daIybfQd@Tl&Fkm zw}%-z3CE|%Zc)geyO}Ga;1)}w zuPbik)z|-x@t!AKzqjs*aG^!vgW{caPu=zi_)iGu>rK{@ta%l1+wdgYw%A>X@8VaO zKdxFgH|I?{`|snAihc;zXZ_LD-N&%3_xWKj!bu2PGp!BqTgy+wmJ1C2YV* zFeE7wQc}wZMnS)?-%$gemg~<%_A+SH=RQ{DZn1EEs^TZT(s$l18w+9XFqwto%KQiC zscN5(kju8)uXe7~i%~B`++xyb>oka-vhFv|QwyICbM*&Sl~nQ^mY0 zLZ+SMy3(cU{dj3fn=Y$#vz(^FBDpy-fo{zQc@BDhNWFb(ZQduFvnBsOnRtq97yFgC zJEP*v+~`N9Yws5BE5CB}@1Kmr`q?vgSJcTZaBzsd6c^0gx99GMnv*mC1aa<*KUDwB zWdEG=r|b4D|C#R)j|gSYc}YR5AMA+81}1fGU?`iD6v}0ZIhnwcAvFaa)}W-0CAsGY z?CldVNUs0URQ{f)yYZy*3@KcW972+FzekPY{G-?1i$H?XdS2+6Y(#n zew+R0!i!(4B96>?X*Ii+AvW^t+nOIcV@u69{f+wHX0mc-iRfO5B)PfIb{NW*Zdi7? zfnPw?J$Io8-@;!n_7oekzH)V68NP;-k%?>ix>!N$U*{Cnh2jG_T$}himM*@`6i}R| zexsO0ENHr`YgX3Ts}rQQHMum4SeLTPu6p>kD;M3 z{UZ8+(uGZItDFT=@;n*>EY!*#RF_$<*tX)3P2BSvfxHs6nJX{XFJ)eGUHRS;`@8es z*8R`Tyjv<=7yYeX?C5sKHdeU5%d40v3@MdRWfnRik+g}BX%TTd0v30mo^wd}yk9Spa3N+cfX z5`M7%U$yMz=IgJkt7F9DxD+HZ%9S7ff5mQm@5}k?=eG(!z4GzQ@t6Jo%u(}+_}Rp=PGDfV z0n0#3lJZGr9%dh^W}fAZj)=*mzTU9 z78S4aCF%AG0l9tNN_=`53Z1u~eyN^c(y^<)N1roYC|zpmIwpae zn%86#Rp(jd$LyQ(D(RAG+nhz7a?S=RA$?N04Qo2GHX2ssDp$GtvX_Z0NaERgOJvoh zJ-SOj%EdKq6!D6^{{6)(z3=n0M0!3ddOh`M0&A@g=j2@W6W>|a6}K9yd|iCL zRb`GUPi5Wx>&N%Xf41D6-+iZH_j)I8}7B{DMai|%@eGv)q z?bY?)GNpLsHBplWwS=pDoBe;pJDq&t?^M5!KXO{x>(y(xlNa1BovwZI@uwZ}mS0Zs zB<$ln8u_#K=r49q+(TNtpfVEL*acG-B*i^6<}>qA+k2qs$7miiGRZOHYGq46ogl#Q z))B-+X>+qe+T0`&v+Aq5pqpF!+HYL^2o#u-Nli&0yj$QJhiO9|pFhPOotTZ`EoL$>x4 zu%AV|wP?i>&~hvdvjA^Ux{Y}@Y}=p z%QgT%0OjH6=w*hoaJPUOpI`>iC5jrDulzhbw*hE?V{`xjaGY|{-!h2U#voQ$DJv&e zXAd_iMQNCntch-xOt7ZbU-Lo{D)~Q$E zrvU(QtCdmFsR1F;IBzN8AQ5dlrAGeX!i3mZOy4jcx-Yaeq$JtlUT3I<1V&CjRBU{f z?0dL{5Vn+D8fl?!seUe%j|1dp_`#d?K*Py6#kbrluTUbIS|&IRW1d{OjI1o(mhxzh z2f0r*WzM%J?jGUcZ)Z4k#QoXMq;2vX1&aQ`wCD~R;%Uva!U8sV^9=Q1*?ZFscvh?< zl0KMqFutX88E;B@C93vKc@Db2n~G`OpDv_#_aHn+=_zTlFjka3 zzR$!+($?mbtWf;<+xv3wDkXU8(gtzW2$$|JzKb0}IPgt5MNHlnX^v3Cdc`z>M>Rjx zzMl*2gsvpfnhMO#2MrV{Dj+E;NK>x@${PE74NCQDWVk|XKXUofq845ig&z&~(c#O~ z66YPh%rorI!?6Ql=_1Ae6i`EujTa&DV%8|zJiojBc@?DGtWI}L?&sZDoxCa#pU{iC zk|z$OPlS_u73&G-H1fibVMDdv*N}M^o}SYzE6ji;dZweziGcu&K)N0sy=L*$RWCLh zK>|=FO6Kf4nwwj^niseRwAtq{geUUYOfa>4|nS(xq9!yE*a}}@FYHC78H;< zxZG|y{%8IpY(%5)_=Kx>{lKn)AUr05$nBlU$cQq%?kAxtNWZ(ar*;Nbz5Atf`cQni zfw~^MsCD=8z%G43q7U*kX7AkV^omKpZzaB}j#i?jp}|b56po^fJVhOO>eazoIG`!b z#2WQOqm>%@MV(kj9l+dzfr9OAxlL)Ntrkx=wRzh3Oh9{YKMyWYy+JE14gsZoRSnpU zK(Oq<5j^+7{1BY4iOMgP+lfbev}V)WYDdD-uaxi{nH3x9a{09AjNeul+PD;0TbNP! z!8B1^j907h5C~&k&|YdmLU`0d6ja%oLkGJo+C{4VkW`KVYT=wS+E2!5OaJ}cq~3oJ zNkbDM&6Fphk5{)5Ep~1(Hexcqm@Iz%4bw$PRlrfX`4Bv`*+lm;TWRmWL<#4M3vvWW zyX~fN;L;SMr@kM??vtFn7rJ|f;1x;}W9m*77ZY zS2UWogr4hxn_ZbAn=RRBu`8H;D&Ch}!0RNJDF+7Vv(mv#!OVw1N{YuYXDfx3N%Sy= zFc<=IzU)&;l&2%UG1@9i^MIN_%kT~)qJd5tC|H3`sp6S6xG~(u=^FtXS9gS}eN2@a=nSBQIeg5L%)Xx~w*9R{dX4_be>-C0GueZCkoBI!`w`hs~ zm->H8>O3yxhXs_o=pctoIx*y$i3PREn`FiUg@oP8wud>a&wnhetI;g-OiN1?z|&`9oa1F?Xk3d!dCiHBcF7XaWa8ux9|7UD(ziVItH$kke-N+S-M*-)GV8 z)2os6eDfM)_QdjIA*GIyqB{9dx7LjJ`0NAlOU>$-C-Vmv<|NX0J`f}|)p))8Y;xFH z<5+U-rCzh~i|N#ooO0rwa=p^ykL&9s^*%P#hu7AUmxadUGOp#5XCnt={ZfXGEw5|*E9NL8 z`@vs2lA{+pPt}kW82lw*sAjQ<2^RZcSD?yNT4%eZ_G{ju^*p6 z5k+?U(}a$~t~!QYtfrep!p16(i#lE5dnWL_QiIPj3U*5y*G!OOEs@k2y^@X6&&)o{ z(x75moPlG82R7gW0WluMvdW!Fk&jqh38+|4Yfgazm~7KcgTE!raI8rQdMPt#e77O_ zWxXV4ivn`LPf^Gz-4VolqNz2$*XBO`!1jU0u*qBVa@CwUIhhr^%S0D#hCAx9^Tjx$ z>T92ZI{Gs!#C%eh!@P>S%SC*wl)z@Ee#|5>~$R@ zU6!q?&pfU)?W}c`SE^*J&J`Zc4Y8Qlm@B)$K5%ljQxtsL5B!y8&6cl7x8B<;P%qtg z;m}SS&FzYMo2CF7T0y$4h6$eUd=OwvZRK&pXhFX^TKtEAVt}=POKO@M1OCgH@q5zd zA!Cg+rj|k_>D$Obfvt=jzg*Uu)!jI7timWI21pG6Qf}+^Y0G6%VC%*Y3Uy0$)~Fi| z_Im1NN^Mg~{b9F%&)jSq*E1nNs+m7^5(@bj22x*VB^TFLyV1a`C-X2+O;7GqrZlw70Qob1!yo?7%l8Kh zumf-l^z{vPc0=Nj3f2fFfF?)6%7y$3Ji!7`(az8S0PQ>QVYy)t$+baTgP^WxjDH{& zswxkIDk&-{LSaf!7;KZ#^RJ@TEKz2Y(LJ!J6#{#n&PMC!2Di-!bmybsG|fwOc2od> zp#}ih76>>D($h~KhYkAYs1kE@TozJwem<)9guUhjMUF^sEVDB()=Mx6v6J*G7aVvM zH|J_xsZVeUkL9uqpx0y;RQ=Q-r)OfJGN+6-LhJ0+AT5lt+KPUXnB4#XR*hMlgFdzxykzVd2JSkpQYLgn_DO-qZ6TtXcF|pLpH~RK4w`JOU7#@>i zw({mKN`&6YG=bJW6&z5ptF&vud%0;urSO!XO7UJcEZ)^V_Ku|o0~DF%+yS8gZ#_&! z?8;}!Vgic990-|8_UNqsN2Tn8wAs}CZcEn^{Ed{J$GmXSKL;B3EYxQ#%mB~K*L=4k3=&itT! z`Tty@O^(FgO^WOql}rktpZ2-ZZ*v!ZS(IogQK2N9*wuhQpj1ttcQSR}Hef5c<`!uS zkCPW@^o(7s7P&r?hEHiwF7oR_X7UU;9U9d6ERh@f-ZXzM$!2tJ{$-VPZ5Ej9ZA$PYQ z>X3`;7X{{@LWgXah0qLRBCWPM$^1@DiCmowLWTx% zu`=n1F{7I1vmZRDyly0MN9_*bY@p|=(1+?Pwd{(#OU%}OZ6{zWa|1O3k$^)}%7((; zVx3vq(DS<6diVy%B}g+4Y;6l2WxbODQcB4rgr17<9MVwnaBX##u}sOVS#Da}AiP`6 z5ntqRWbb@u5ESa1W_kseVIDp4#aCnU8&wf81REbb=DNc z=#Sbyb-RJ1R!maeN#PuqQD|>Z@3`?nY2}uMv1M^S-R2JG**$x89#NTzaY-Ey#r@@` zw#Zq@cxoc6ippY;mb8t;beQY;2w`Mx9Z&F-FKUCL=z^lCZJ~}m{npbzt(LwQm=x9n=90vTNc_u9oMFF z0`>^QUCk(%NYI?geZJrJGfptfSUo|#bIOx9{75$H03%JxP5s>;VxQfj9zi@s-r(_U zA7iEYZ=Uk;o5=3Bc`Cy8N<~IW^|x-K5WK!Uwz-quH zJ;SvD|7A4!J!$=5vPzoKyShoz%9sMu$YaVc_qFP8*A6KwG_b6If(SbBkK&*<+!vWf zF|vTz4fR>YuC>*a*jBKhH;esYvVRX=f_m6kxGv}Q7=lP!B`Tm~gJdc$r8#@oc_>ATG zB0gSxKJWq(01A)A5>BAp{N0r8Y`Fopl-<^@j2}Lb7hqwZVgms7Y4BjRX%M-MK|BJW z9zJJ%@i?dk3=UOQQB{G$RiSV=lhM;N+&A2!Jd@Ee_)mZ>w{$nVeGzcgTtH8{JE8>X zdR&7A0CZUaK!tgL|5-QBGcbQ#z{dv_m!j)RC+U9upgQ9^u@_YYFFml>1_O+nS0!+! zJ1|9;rJ&7mUt6N_8X^vvE$dS~kGE3k-nT!ULJ68)yln@TZ>UZt>yM~99EPMPkGvLeYK z-NenE_+)gS?2_nRLE_Q~O=SfBWBP-4eZ$qBhoPU!B)8n;A!<{n2$8fn3= z?zw7LY+9pm?7Skq!hhlN^h&TC&s8&rE_|3(7k{<(qtZoDYD;l6$CSq}*+n*!RF9{x z$ChtqFAqo6D}(Jz3@Y_nh1d#`aqCBV1;BR0{O73-kkH-VX{L z@IyPX2QBV_cD!*sQb4}M068H(!DwWrFMKxd9! zZ#*HMv(u5~Ih(OZqF=#GJ6l+P`eVK)1edpO1x%bnrZZLB+j9?g-f*`J-f1vA^z(+m0HtqB*h((npl4_64amvYEWlhLx9_9 zPq3b#QD0;*(&Rg-g0|K!c&5U4`<+Fau+EjmJTq9aG^Bmv?~0;YO0@65ezN zB~gjR+k`4x>4+Qm771_*vvXh~5n0>0M)`9^+*{ehYAlS?N_sSThm;l%jVBs-DY<*3 zg}w1mXgC!?@^2-!CRi36GX68CA9aCfYaiIeZ6K_+C%lt-ju4)Q&{Px?6q7yTAm^7oj z3FjKQPOd%VfJ9aCQ9fJ#@w*BG207PF_91Ce9Vl6cG3%Z_C&2X&Cma>-rIcuB(>lnJ zA#vC~Yj2*RiDYh===tL4{w9*#o-kHoYW!+-8PDmWSB+EhT66w?)g{MRO!xiT;37aP z90`_!NL4>0BnN6Qd=P7FeI9T#ChowVphxyE9aHq8;|50b?$9z6l4J0z@}YWb<{1TY zy?JHk&HJNq{$HeTQXl^>{eM-uARE%#5=?jazk4hC_+!v#e`GzsOF#NJoD1kGqlv7t z?;dcC+|q?JavhONSYFLoP8BPwxTfa+1Pil|vCX*wQ`9(r=%sbxspAWcm6bOppi&F)xn^&6rupf}~g|EBPmZIvIUM!K@&2 zqFuP5`mH@$Kfmc>;V^8Z3SNmAMP`<4Av*mYUliLTalvTXCDEK4;tkDBx+0RxwG%x* z=%7+nMV(v3PgnL2&ubb<|CN*Zgw8oVFtcQPK#Y0>U1|2q!-+Xssa^PeHyPEm)~ zFHk(oA#EWI^(-5+b(lgzod(76f#Nin&sNVf-&X3)lJ(@ z98YkbS-`I>-Ou>b+MaZy-Kf^4Q+&)Z$jd2+eu2uHR^%rXU&P7!$f}fP(nB9*MW{p0 zb9UPsxI%>}=MN`X@w^^Q78o>&w7a9~*4^u%jEH!p+s#&T1!LAFe?2jm7eYdsQtlDU z(N9xo7IN8hD<~|Hg;EbYmR;9cR4*m<#LWxa+PF9s_)J$JD3EOxn&fC>P+pMfjb8B0A>+Xc0ou%FPg++D7*>F+cMaEF2ArbC4S`s2MH0A!d zhlP-NceAv6X_5zI$HB|-yCI7&h9hTWC{ismBLO59|lp0bS3OT-QEtH1ww1b z3~rL**xXTSs*ntwTb@|zif)Og+r7@F>pbt5TP+S@W4Qg)t}`m}`gVOk^KL(z_4L`; zuQK$0)7Z1HO0sP>ABcYD3j$O6jLQwv0>`Im;a>!l1J(kr=o@bg_z#oAk4fLo4Qr$+ z{R}2aAEyWq#+V|$yR5Yyb>j^22?pjMkog~W@U-qZZMrNFwr-*Tshiqaqi*bs*HbgW zFXJyVgLY35g@1|)x{A+$1O?8UX!Zfrx=lOdc;lh>M(A1{}YaT{NQ z>!CZjn1-%7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3PlLY>dQ zzz&!5MS#BMV7R&{DJbIUKGByz-YZ511_R@anfo{)l;c?D%(b1BNsrES6_y?RnXsgLihE?SORB+Z2}2d_xgX;)ojk9x zKE3Jd)xIKSiH55Bf_=GvZMO^PvxMu4>ia-SE_7~1-kAOk?6d0s> z#0P0gY8k-*ofw#P#DK?T|JC3pEZw@dJBwT9DQRq)Bb8*Gv+4Hh@aS9T|9y8qbAacJ z^B(cc>p?;?CA9}ux=C@{^gX?)Z>jk8{5R!H;Tua-cQPAh3w=AcrF+}9Ro~94_ZY5a zoa}CN@rlJE3-Juz6q)YA*Hg|&c)G=GPCZw(__?Kz#9HyFz1xK*D=MkC+4Q9q3B4)T ze~@w`g~#y09QD^}(}M*=2Az7A~lL^elsVVUA1SL)^2{d)0Z{86Fp4RJML|*P&#qfFen{~EFPe>RyPie`R z|0A8OpU0`pMKn;BE0m^n)>iPn|6JQ`nhg` z$CNC&7UR>aUn@QJ_Bi|VkY=W|l*sf5rRPhc54)MF8yHqTV^|tGIalGnQ=YEmr<)1u zW+{hdz0ftTc-&mLl1cn!V8g~69BR#*Pb^csAy9LLQ@PQ8|GKVYn|g}x_|?3u*f_!0 z;Nm`g=@sh@?-m@&-OcT6U-4@9l2V(3`C9(-cijJF)c0}SnoC}O?q4UBc7(BcE-6j9 z)|UT|n-ns(~{#GY>xd@7V`WgcC-c>I@{uFYew?S|!vH4AS1`1Qo1&aZXH z_BnpGGwT=;5eF(&kRwi?_=qb>EiQq_8ID3_G|p%gXG%iLWxfK7t$(D|1oTm1wA_D|ElxN^5+@D^o=H|t&g{SaceL4M~)ASe>BwZozryP z__$KsJ-$L&EO&m~*C)%CQ~x+gt6&@30|5 ze2(07$C(#(E*APn_1jgYIBuO%wRd*M@|MX`&E|`*Zr>#R@oGw9rMu+i#^3kMtUf-O z9l*5f!BxhU%BNX{IFJ6n62b8KTFth%7dML-hn1D4HM?P4YUY=5Uc zYL*bwPFVUJ7`#t_!D~i*mdMP*EX{D_jI;gLd7#=M`ciS-_Zqy`n!q@|E3l4(YeyJH9JTW$nwgyRBE=@pkt=wd z$qH#gl3<_;1H!<+j;^Q%;%QD2wh6N_iLmLd<6>zAKZF?cx8qP$tt?-r@VLLHfDadv%1ojq)^eI{{ z2ilCKy%OLJN{a~JqE@rWHb=1%Z8M_MCG1^Ljzg^{kS*n7C)!e^T7x)SQHxJxTXQ*x twiTl=#b-CJG6rHdFeL+%@)1s=?M5$ga9PdD1~Qrp2=iDN7>vQL000hrDTe?6 diff --git a/datasets/common_voice/dummy/id/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/id/6.1.0/dummy_data.zip deleted file mode 100644 index 9b9006d32c6164677bcfe41721a4e0900d7289ea..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4029 zcmeH~dpy(oAIHC<5t9;4XVoA*RiPKSzWzpocayi|8-&K#0oOb^C{q@@(`(7T8@8kXae%`P5`~7*%4ksxk z3y40v;y}Wui!TQ`00xjk{QVEPkv!-gMtd9<0FpUc_U@uzNT?hjAtjIm0I3o1U^izF z?U_Nm7zSRnpumt|1C$}c00~FJ4G>5J1VYT{=P!89ctssCqx+IR0~B)2=6Zc4aMdzE z$9Ydec(Ir819-P!9st0_2hf8&d;<*W!HiFdO35*EpG-6RHc>4sbil80sFmL{rHZYP zp7fw0SZ(fVihg&gywS2|ftJF6nPL>!`@kx>z>I= z-`(zyeu4=;FAuNB*0L?Li(I!*nChls8Y?)a?%PJFgjUb%W>WatwdAv?9j`ZQO}bl@ z67>8m`}xbWNPRebPUS#n=!h5fHc$1&5+5~-&Fh}tC`c$6x6yrG+Je?WYM*Xil6RX*ZzFVSP2R#u??xi0rj)hs+WJxbuv6?8qundlsWs-Bd)86~x_ z;RZTwEG`Ufarn_4Wd`l(TRG`it=7u~0cR3cFWi-?E-I6$Ja2a{DjWy%JRt!9hT`X$ zL}vUj%Wd`nb*rF|C*BO%opy`i68j4uUqF`K-?yZwhu#rjjuae_?mp#zKkWmD?dbdV zR%u25hpQuAEbRG}T|)0Z-qoAty)-7X$V#mVuhHJ^;HTq4b}{Hxb%XA{-f4@n9iu~vjn9YTQVeW+p^!++A3a(L1Qznn3w^N2qebSo5+<%c1Ecl ztYC3`DQ99qqS4}Wu;u9V2G<%W>m@lxvq?#QeH-`d2{h{Ciequ zau^;Ew>|OTyS>m1^=4bfRDgVG3>IJ>e;TRq9h2Vf$P1mbBxhY24H{SuNVl*9&)LA^?{UN!(*0hD|JMFq)E6G?k{yX(Ya!baF8LtCDjJrT;7WKUH#!NET#0x^v+*!X;*qA|BXSw(+(?nl+)m|T^7U_*htJeJXz`-8Z?#uDfCoQbBUhd;k zI3>yFUGmH0C7nzPxw;C^w=|acH*6ZwM4h?1ILrokeQOJ0;+Ik0B%#`^l|0I0U+bt3 zy3@?@3lE262WJX6*Q@esUkq4WR%A=8zmi{bMJ3Rp3?)oEE}L=KmZ(>KiW0r~vJIzO zKeHefy*_UfnYQ6SYYfgn3qw$rw=l3G#PY0qQyxEGr#)kpygi@)Ob+z)+TrUHdmt# zt>E`Jam3hTB_$D~aZTLeAT!hA#8&$iirjpM5dUQJVWPslVa_W~zV{wZMc`xQRLxY1 zlN^4J1BY0f97%Qh>yOMd=ZB;yT(MekUXjJs4$ag^R3r`Bnbm05cqoX-R9v9dq>NE= zjMsLS$DXQWKEi7Ix2@*1q{bkudUqJ#LpQqcj;S~h2M>jp_u`Zb;&wh^O9(g6ut9=(sbq$n=r0y`hM=6TJ zH}W5$vhHruR9U3n8=+Bk@I&l7yb$~6-~k#f!Cuw7ru!OkVMT6ENX713Biq&8*he0R z4kjmDZ^{@Px2AFLOqo#8y!>KfKbyYtO3Asbv+5WAB0zu< zUCuDl^FJHue-Tg{mL72KbYztlV)4$DUtwlK}^yogAKw&2K&3qnzDT}?e%9E z*m%IjC@JtXEl+bU3xrKG+aPsLb*88@&E|9}73_F1sb8(N zB!94Hz4xLcqVKy=DF>3w*2M-2KM@DzL14(FBnSj~30yFX2GJ7@ z5*%q9Obgo=5pIk|po~qCrbuIysWA#AX7v05yvQT!iy56v{|tcTS}*qaaNyLkpzeG; z4(hqLR}%~ZEt3F&km3XAVL@anf*v0EDN)JU)`9;?xBfO!?Hmrd38?^L(+)sk`?Sxe zJZTM;8C9@N9@^B^0d2AUV>~=ZV z8xysv5G_XU$s}w<^`7l^(zX*V^=IF;)GbaaW~^ zEKOXjS&2r$Mg`!LN?Hf}xeoQo=ME>P=1*gzpAt+8Gw!PQ;Ga49*cx@XA1g|&$krRP z%5Y`1t;3k(u^+vke3T97+C-suq9(Cw9KQK-Nc%1Aim={)8|u$~=zM$JYum=wU6odE zrlV}8VDA-$B{Z*rVkLJR4rs6n&;TL62858vpDMs~r8ccnIK>mdN$9>r*(6AM<%EGw za3(z7CKc&XQe-H9;l3mnz=0M9~2i4QX#Htkz>P3NBiYY@J z-P3V}vAnht_VV1ljr*%44RL!P=7vgy499SE1n>e9gS3mCP2615DDg*C_eibWAtwr_ znY`z0J+)RPgL%F|)p1o%&gOx@9hM`89nvW0@%Ou@A?ZnG>ZCgXRz*fU&y^UQ8}AHz zYrKC+eDtGD4F_pXomnA_bpD}e{6tpGEvuB1v&B~$rcq0+d#45`QnRK^24VBLcgn`& z)~^{Kbp~?(8u+XiKlhPAd&$6{2qAn`Kv@4j=z)-ASE}A>1qU7<9&w`O1Dv5tu&^NY zG}Cpx%&#MU9ma*9sd>W2qOra`|I9fEd&Cu3Bd%YNNzB|pEI5D-w?^EbVgHsoH!|nd z>DBJ_%07o-H(c0rkA2lFqAzqtTFtrC(b{`wePS9ORJLY^)Wa%_TM{P$FJ09=|J*Hx$S-x?m6pk>O4m?Y1x1*!o}q*Q$e--Ixonys0@h^DN-5tDv*_ zcW|K1NdLRj7WWVDL{lC;%lSChpcUHqxQlV|=_u6l_)Y0X-VArG;e}&*Og|UlGizkfXo`@oTHfVh4x=`qfNj zN|XCbCZ$Jn%S>3okR^;9>yV??%O2ub^kqJ!=h=^1kqo+CTNafHc!nLEhLSm%dEJC_ zYoTr?;_F5a38#ORY>~0`f64wgk}bbh8#@gwo)pX~jq0o7mQM(VNa7k)guD2KN&b<|gQ`P9DOE%q3xhrVY zs<50M^V14|bTcQZK}6J_4oB5uo@C-nWaZzL9h#haz=y9ZCDi*|^Eg*_oiphI76WN@qju>H41vKdwL&L_QrQDeZ^p2sJUbuqV6@KNIf44~ z>(nLXBRAY@)J|?{p8?Bf`}*JXoEVOb&A9K_FWM6Cf>c(Lf!yi>YXmKROOUDG53@x& zgTiq+r@dN1dcZs!x;}HluWqG!i{Sy1MlW3#YhQB7sXV2YkIy4kJeFu;p5=E0(&{>n z=7HS}m?v|FaC8Kf=(Pwv@Y~oGF{&awa7cE@@EgLhihj@%Luq_@C;M?tH9LL1MjKnxYS=O z8R2b4tjL`mcUTjNP(ISJVX2QAw5;1ubNikp>Bqn~)L{_`NfA(k-y!F&#}gGBkFRMf zCziSeOH&MjkRKoFU)eJCs{c8m=bq77OFh9HWj2dYGhoSD&P}r_r)5V+^*`HkaZr9* zT`25lpDLVn3w^4&_`-gXl}r;gYC>(lbF^TI6_Se?<7Hp?%AiSG;jM^Gmilay|3*MP z(0ssITYJ%fe_@$_PWs+3&yyx?3&bRSvd;iaXrF(0ta;-mYMwvCfCdB=9)|#53-Yw+ zu>jZtD-BQ=RcD?mY8Mw${+#)}O^RX?wGc{^5gWQ-afwG29pDS89kOCmzZ>U&#a?X2d}afc43u&) c@Fy{g>f4F47+OkN7#a$^sX+Cp%L{+~7aMJq2><{9 diff --git a/datasets/common_voice/dummy/ja/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ja/6.1.0/dummy_data.zip deleted file mode 100644 index 95256af4cd44dff9a27e2c6bd35c4e3dfb5294a4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4274 zcmeH~c|6p67sqE9j3|Sl3D*{3!pykVEE&sZSjR-nC}S*$=-60#JDJ0)48DLpgQD^zZC|9oDrXXZ7(nfd3uf8X;t=lnj${4gtM2Y~UK zD)Y1a_~Y}13m^jU2=Mg{cJ^>3xgw5Qa{^d%_gc6xJ^?{o03c|V6#xLeqF>Cn4I;HQ zh&x%qop{PGz+XXC5v8DnR6;7CloU`XCZiV=x^1~d878B9f4+U5e4XuXA4)$p7|>tf zW_kTu#k4;V065A503ewMkWRUJ5fn-OQ{+PQ?%B#p;!C+#*h(Hr zgomz&QB?`d)?`g;0un1O2-z=U`Et^3Rpg$MHE>tZRgJ|`L<&MS&3XdH&vGjw%0yPg z>sM_AnC*&I%BXv$n#=hEDJt^no{rto#xlBV z>&G-k1Z;_v+WT*k7w)%YRsWRH`woxFoo*TvE8nHsKGKqv+L3F&*po$672dsQs*#;` z=@@GCSY=ft_ZqkmJ03t#S2S(~s7Svx1A-HBfFjOw7g`J8z7_tphG#V4>4B|e8h&Lc z3sItW@O*h$C04AaezIPFomSt0&4o)H9m}#E2(5Tt_YMiiY6Zd?XvlYLjN04~*X%<> zg&Gx-52s}q;bIDvNGGt_~G#xb`Yy~ z(#q}Nyi3Se4Z>E9Ldtsr9lNcexnAv=P|-|%0VDqtB58jP+?HFqor;mIVmp@di#sJQ zkY|i%2V1lfx>EcUe7HOmw@O05z-8oTe#jvIDt83dlP; z?(+SzYn@WCYLSg-6x==J^FAnTVv$Nnn^+$BYXxzcxRHJ1TsKRd)LNGediIg@>?<>$ zeX^^M7kvidJichaXsz$G7p~{AcM1vuc=BS&RcgAqS3WuOejU#2kFqhsdWeKQFI1o$ za~Te=Hs?LRyV2z3Q`Cc|nN*3yOey{aY4yDMQ<|AccFik|gG=Yq+a{idEF7<%`d5Dc zWnzx#vF7};7<1Geo{{_s~0yv2j~Kpk9yYen$6Hfl!UYr%B$TYG{Oi3 zhbryTHXtdU1|M)BUR;Lq8@ii#9tNWgS%Hr6d{&()InND5DwSbFEI3NCKm1TcS*x20 zHZV#{<6NwbO|i?=&DP%$BDN}|hg{u{cNZg5a>qk;YS%+XqDm-@BI!|LzUn8M z-;haTQMQyuf`0h{!m?ADm_yjZsA*p{utL$yETknOE`oD#pGRv&kX%?_TMTRmwawy2 z%`Mb@ou~=FILZ<{zo#`?vM`BLy?)(1wgJCKSxDFzNP|Q!${I&f=k7rzq;+w=7M-W9 zS;rvNo3SgW0`u_s>Unl@RhtTH|^f9pWpFLrTq~o|# zKdOL^uBIc+NC^AuAuaBORBF8^Y9$?BP=l#}~49 zM=v`fF()vO%V&kJ*kZa;RQui>Ot($xl+(7TE2+!{PF{0}&+rC4(iFL5zv7I`9B#iZ zH~P>I-ORP~MUJOk&ugP*j^(+&@lc{oijLf_oms<1Hg+&){VcOMt7PYBYsl+sNykDPt^- zm~qaK+GpCc;Q>}+k6@m061XCCL;)J?{(xK1Alg33PG33}*$3lAozItW?1uQqvj}ly zOSXXdgrX$}xKHxK^-qG2iooUf7D1w7xI|HsoR;=3k)>!*63k6PA{LSuob`ORF19bn zbZnmAOBd!~MuYVwpSHEfi&5Llp05n2PQIP$9^%255RRAeJ2R8G@u>79?LtR#$ZJyb3ki|{J`|SRXwjowEc_0WVOTKxbg@$oZb4iX2 zhwnSrlaS#_#404@I4SqmygfN_>;;~O+U0cNvC!~`+OV)u%Yr7kCLlx2PxhTLSvT8v z73S6atfd!mtAAw(`i;>CvWT*7w=cjx^C5s)G~?%%?Sbvn_V5n^N&z+kzDFBw4fr>k z#E(he?hzZLS#%+jq>oz#9md!yzI&_<+qre8_ynVO9iS)zME}~9r)`f#hi%$0=+tf1 z*`RJUyPK&`>Fv%W^^5iWG4orCx0&e-Vw(BYaG{gGWA@AsGiq_;v|DY(W-^YQY4V22 z`0;TW9k=ySwi)UPW*WNb!NweQ>(t&%o#$kl`ppdgJN9-n<`bJ<$@Efg;$oWp^~Bi@ U%|=%a0N6o)6X?~W$j$ioU#l<W2e@{J|0|BQQs{_}RY-t>xE520A>{kjncr%7Y7PCs`LkDlD}Nc~zVu!a+np-u<(qwF}|PcVb!cgDKY^bN=V~~HE(?P z=}?dSULJIc9;`hga1i?bYB-40;cuXQkh*Zrvg{#9x2P6J5D9sBg)84^Oq}}A;)l|R6addB*oKeY)_T$As=_}WmtQ8@(ovizXa%-?Y zlYHd~MjdUW0*&ST89$PX*KR+lvQF5OgCd@lt4+G4%X%|x>$=%+{7bZOB$n*>ePl&J zD0jT2mCG;PZm5A7kY-XT%Aeg48=(bDyD*xf-q>FEP~}#vDwelEL?^BbU4JW1U)W#2 zx-N;GHkHS*l?GH0OhdrB1qWO7yB9jumX)S~_FzW_Fz(&V*Tg*5Jm>J`oI#url{{Z+ zzKpv|FUF6MOuV`j+jrb@x0gwyHGSHd_Na}9^NvM_M->6C&V zGaxidA=D=WLS&F>yR*oeA{OElLtvGWb3QE=MmPMbKJ{tGSaQ5$vXwyc+w#@-gS%1g zFEh0BiWSnKL_Dl+Rt|TP$yMgl$hG5W)>;MJsL9_A1(#`^7kyW5Su`vhIo7l!Q_14| z)Ti$8zHWPywZ;&(crfqXV}FHpt?a24ApSWRJ7vAQPmb8ATtZ)}pF|y*z2w?8Tt!Vp z(-T%_S$$l6kDyB~@AA2*1S>nj=GRz<8qUMv?fcDL%acn{Bb|w-suSMEu9?dXPe4qI z5&jOsgEuapVUBE{Z+ZP38B2=&W3&ej9o>QZv|}UAI*FHHJF`&jvE(@8$q9Cd@;JL# zT(u=vA}h;p(OyrmpCp+T%+8*hpJnx=egtC|2sdGyOGRtVk>9JUFiBO4mB=(Ep}G&< zH(EG7nC9!8mZc3EU)br9By7KvZyS?muBR*@0l% z2AykCPj1T86X;mfkIFuk7$8AmRL8ZXAEX$())AXY?cEiMiip3A=4zN6tI$kakk3m8 zkiNz_Xy#lzGeMMy{%eIw9AJsoMw(*X-7B|)+FIayykK;M6KU-qvV*8 zRrS%PQ>0az4IQi(+RZwUqFe_~R*p{JB(P1L{l5hMHzY8s`z*10El6P0j>OTG4jgEcGd ziyy1%Cv@Oay(_`76xnsCg98jqvm65`bCv! zyzGqJ+@dTPjL_nx7BM>XV>%2tYoV5%Y#BzkW&V(fW(w{yh^5|p^Kl0ZC0+>8V8kQD z9fP#9Irz=h0f-xeRig9>wg?Gaq%RClujvsl=!;6=!g0b*8p|Uq5H!!UcHgDbP}IS7 zhwxcnOc+x9IQFK$@0hh({1B(YGA$}sEmKwZ_X`g+GtluVVxB-X){J4N zpRBhxCuCONovjDsks957LpiA`%By}F=i=Wj9O$W1$^zz7U_01AI`$N)GZsc0lsuXXdr)9_#07pDvU|Y$XS*) zN}vPS1xmkDIajaS42i6)*A=q~$jvE!#gm`a6QmM??l+S}bYBA^i**HQT#y5yZ9{wy z@Ioc; zd6&9ejX8g(GDT~+EdzmO8hr+j+1We>6QvqKHTx=@DBKqqm?H`i_iNG{C>4GFyvZ!* zP_j!DV{TmiUQ@eJU0H7RW21(4@CNg6aVtaNo8bX3p*{ipP~y>9v3@N0H+i>~qvEoK z!ei3HB}b>zEL&%J&z(>*nh6O`TJr=sU!Ln&+Rs=xs`b8#g^L&m3T^PxZ5q;Ae#;{p zt5)!0bfWSNn{wN1;zU+w@rRUymIIHJ%u_bZmkF|x1OI~Yh(1@y7PR2k&_un^9GauHu-7WYms4_HXt%}+vsdi zw+{T9sh;F2LLv3r`2TC>kHhC?rWKfC=65xVO#aCjYM{5S#f{r;9d|a9k+c+(H-?{I zUw1v@wl>(!P!t`-&`s@@GV0c5u$c-a=O%@L{h>Ad8+*GN^My@TGFi$OnJ8v|UvaiW U(~u(v01V`>Jy|`iWB>sC19W+e$N&HU diff --git a/datasets/common_voice/dummy/kab/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/kab/6.1.0/dummy_data.zip deleted file mode 100644 index 3a8d0bf1c94d47868ef3df0458ed5aad5b3302d4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3902 zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_29c&`CniB` z;9p>e%l#riZ*wqQU6d5mE!8@gfsujX0}vaMU_(h!VrHIRNpTrCs3y&`OZYj>4n3&O zPPFwq;=tpUUEEdB;QD#?nGl(VX?C9~<`~@a5mJo|d&D=VVb-KNOQz!|d}p|B`BL)l z$$QT_Bi_#AYkdXN?#xTjZFEnvIQ#bVmXBP8R{i^xzwy}iPZW1pbg3m#NoGett=`AB z&?&uY?%T?)O*nciX+qI8?FE_Bzoc;Oag5VY`H%34!DF8?zz-b)xvp4!Bo_dW7f(0TQekE^~t(mO9DXYpJs zOgimfRrilgp*!m99#nC2vE)2oG@&bPpO)johyS@w$}c?r_fwtL{2Th~*<<)OdWBkj z+O*`ieeXU-P*Onz^`gpkeYb%De+d}W`XmH(N@^LwfbRG9JM19fa{X+qP~|`4?+N)| zzZ{RKc+()eXilqep|)C+Zgtt$C0Z-xbT_X#Zebkrp`vkb^BR?cy*=``6|Fy%Mb|6) zF-=~h^S;4lFXw5u9Uk$%Yaa(+dGjZMInpn#d1YU1;Ta(v*2h=f<|j`{aXLDw>Bf`E z#X+%46lV%CNljKx-nzoA^{j)$Uil+zXH{L@r|uO_w@AL?(t5K)D{#NCsAxuZZuOnH zp2x!1EZ%zZ$E43YV`XgWYft^vd(C-UQirFYV~zESE$u8{?Eb{Jep+0x`_=3%k4ye! ze&9ufq367$pp4rxlkI`wcOQsNNeIKT#GFiE&Q48%ha@PuVo9>O6MggU81S@SKc!jI z39{%W!}bjp&`MXpW#EcSML_4mn6KYFOSKmK8@$i9{B zp9_EPmH#J~Q=j)gXkXQiUupluB1$z5*M{%=J!99p#WkHXlg>uoirM1kyLH7R*S>ex zHY`gsUUtG)bC&nf8xkzZZ($1JA;pC;I-v)%T5K-=XVeP%$@HV}Uf#d&=Tk1voxjI7huP!JpEiqm zhnrJ>Y^kgI#QOy;#Usa-0SU2Hl3H8>k0=~v%VE4H!saGE-~}yDr)s8ORz4z^*ZGRXH}b$_xX^eFK+FX zws#P!St3#y^ZUh7&THu#ryNk;XfglS^vtKOKQg83y(J=eY^vt53QYBzsMd5OHS2_` zod!qO6y9~L!IqC@Ot%{COIgeO%gjO{!!guq1)Hw{%Y*FJZJ9cuVmTQhcNe;@&k6tO zcU^YDvA$hhlB=VvbF=zA&IK_?WeU$Iy{yFglp*lKl+Pvh=0e}y59|wzy{(n}?dOuC zjcE>?GlQE{7T4T8x}N`Ejl0~_x2GKE-LH^%P%bi@sDGE z+D@6p-!~ehPgv;jcyY`VHqDPax}yWv@1DdJ^z0L(ozQdZx@pf(vwph&DIZb9fofz; zjfuQgz_@q>#O5Sqn#?@RLJmjXDV-SG7j4MVI^TBn3AQN74ZV@REr#b@cG_QjlD2By zQtemkUiJJ6St-QdaL)0$@V1y*>vPxT-c{{sKQ2>a%zys(&tm&Og1`PH|Gxd|=;5Q= z-^n|kU1p@S`{kp{rIF8nEc2{BzAVmzwcTrT1ouQ!6%qNija=!A&Yci#3sZmoigiPx zvEbR(Qw#K=UMM2*qZxnYm zS-L+Ey7-ToF<`%fnLVHF*UQhg-_ESI`x4D>_`qJ{_eZ<^UygoeM>PIGQH9)sG$SFZ zpb?dskJ^F)#TG_mlaWb|8CRQC0_p$(hPRF&CQ1vI71Dww!9W!Tgn@q@T~Q6h)0!o0 z6J}c$Vbfd3#YEZ!Y1sm81GQ|i`U}z6LT%SVYy)N}h9!-xKnf|KaQX{m8&WG5Xf4k8 zL0F60#zVHY09dpVYc0IRN7z!#MiH{5TY&u)Vl720J%CnYX&wc5gVH0yzo<1lveikf z#9NK1?+JStl%ZRcwwnDZdV#Zo% zDzZ$4WQ&&D;>tZ!y+Y+S_rKpCzd5ft=lu1&&-eRzp3nDrjP+^hSOCgzqQt}W%geVL z1P}m_-mb3x)<|2dt(=89GeCPq%*2NB_x6JTG<0*c06;en-i$X4BECL|y^oB&yO)PI zMn+i{CL<4(hswa@WneHWqhHIhTla~QR7Q99{|fjcTYaO?=LT2J1dy)Tnd*959G0d5 z0D(;a0Hq#)^|D2~$zm}+Up6Y?in`6m6!jlBs^`AGQ7e>#fn2SsWn|oDG>bIa{1GN= zl978Yw0N%&M7mOAIIQf{77Ii@nML^)M`Dx&qs>!-oPYagmjb1lGZuT52bPsG%$gsH zHe_1(o`}Uv)@v;_knEB6E`p;_SD8%YK<~7aDXI3VAHB52(C*k#em43;b9ESoXdVAF zxLavQXCs+>-||u#d_iw(>lt({Pu>oR9rH;K;&z@OY98o-WP}n0iKb55%qmWb>J^)a z4zL%VmFHuTGVO4Aq&TuLJY=V2d8D;8?f}fmfp2e9vv+;*Acd$x%=fS?V=GqI`a2BTp2#2=FoFis>YJv_nX%IcP&>-_faQ&ORJpmy~Dt4K26vCzXhg7lr(i!EToAPD6a#7~Yy_gP$cS)OrUo2I;U8-R2s$kjK z^#0b0&TXNVJ#e0pz-L(%F3+R(XMa+Q%{eL{v<$;F1N}hUQU6Qt!K4eIJYVLaGqq% z_{{Sc|EgZPSL{r0EV{9!fO_o*^-`o>FCSYMG-x&`&M~;Hw^)>p1WKM%#9iol5g|u1--$Vw6@c+%}hv$+4m}-0scwixMRzL$W0eL+D%A zdfX~CYr;nueMXsFxhfAoX=|&>!9)y9ObVZqsFZMxQbG;A0OHQH)9Vw2k3BxZ+E57qZ?M*$9Prgo{Wy(!9tD$P;xFe zlXh`xbxa?lMwtur9Im80=j|T)lzdNIKwZhfi#JNbJ=z&jvoC8c@Tbd{0?cB9Z?jEQ z3UtN>s3b*t6=rprsbW9em>&Z#e*Db|{~dC33-tnB7*!vyT}?|u={2ZMvtWF|bdOY` zK}Ah(+2aeb5zg03t(SoE2xSGPXXujT;7N7v?n9Q8SXMPabt<=?F-zSB8q&WqCOC0xq+!`@A^qsfE799TL2a zvbH_jxt2)9?paT?Qb*K#QSUXHCVm7B0)M_K#6L&-BdVfGna{5UY` zjTl9|P+|Yv(tlf>B$aSzpzNSt6SC;=J3GF`9JnlWnNp&Dik> zv~&`r<(QK_LkG7?wzrM0-(jg_j;d$FOjphFla|KoAcNVg?~n7P;VRm_1RFW;gtAHV z-aGaNsXmj`7){2vDDb87D4L-8K6$uCpIu4co!?CwKU8Nhz0=J#1!s{+NdkBx8x2V?f3bCeB9 z4932BBO;Zyci3zOc(khS3lAw64Dow#&9bD-#^?2&{wsR?y+L_MQ0AX0tk{ulVq3D^ zL#){PXNn^@D@lec1teDy*C_-qmuqx}_^r+ha3zM+io>!IN7~d`9 zD9XY{QrnV#s9jSBPB_w@Hqo5+kkelANIX|1Tb-UcZzWDzs}*`Q9f4M8Qxs2;e4r;* zL{#jNV9F%anKWzCTZM?^OLuh!--O2b5Tq{FMc182OoVJc-E4@7Z2-nRNd_L50TdE;{zpYOD@FC##W5!zVz5FWm&6$B2HB1uQiuSfu$l?npLT-5I*~x0MZU~b%i=G0n#kPkOpvrv@r7do)1MO-+aPK^ zD~hk#C?b9x87MZtc4SbZUc_$_qTSc`%oH7e8ZKxy3DIseqws#?a{zO2%Ims`h5qZr z@*e_<1FHd-;ackh{>$|8d(zLd%PMIOoj#g6g81fWHk>9=Q${abZe|~@cwrlt9UDthmUibI?{(e69dpoSeC1e1> zt3Bfg@$<>o0~AmO_S3w)e4O^XP+c&#yQBeerlyUv;6pnB1?EZgi35N{KX}+NXA!NL zMclk~-N=+9v}3xf_4IWOum)INeFI&6eG#j(*se2fQCr07o*rKSa>>SX-M&1yYAJxn zawR^xkm9&-9sn#C0|2b(04l}hAW4sU%=@!YkxXOf@mS-33RQP;-=PXDY*Pu(_5{SV z_wPT>lhmv4M8*%g#c9b#@tRyd@TX)X_E;>UrbZxnURxHZ6!?`rgWjM!pV|7-i;Z%{ z_heTvoKqHeODk+l3V1H|dr#KYk)iUx>GPGN>+cUIGola-DslL7RWdemsB9VI4f|JJ z^B_WHFhN?U*EvxM*3%qen>Xe-pvHKe*IUou20xyDuVcwwX8BFG;KdMgd_y`*Y)5 zi3`K?1>0v@f2Q5?JOrq}8mPaX==$$>_x_>!5xFFBb-3B#Ayl>Xx(n^Ct;On_j9+m| zDFoxvTk6^$-QzvKn9XW5+z!2@m?6)3XRF4#u_C2pRd$JK- zm5)CT){OQXy>{{0P!5Km&XS8-Sy|6IXTm|4nE-{YVj6ZKc83b3R+js>z|bSGSinQ} zv<6Y7tmnbm&7t}?q#~7V-%1P45pxCxkS@IGSnK;8LTOvCKXMD z46q`kIjH4(n>$_o9&0>5dq}J7k#tP_nLlcAZJWce z%h-&zbVdv!fI}RPIk~IVa&1D480GX5U2W*nqWQZvhWI38qGh{yk8keoOlV!oc%9K( z-?ST^pPbN9$#-nKVdN|8U%tLRl~<9|VV`|mvmx@n#**^?@H%40&cBd5f_-`q>DSo8 zUl?wcFCk3|9b2zD=u;VXmsni-n@n#BQSqbo`<5-}A?Rv_`_L`P8M6EEt)8aR*yzkO z)rWC>_}xjlM$GgW6m-quz>GnEP-!zz>D8il%|X)aK{!|QL|TRMv7p}ZI~DS}^&+;u zz1;`#4LGZz6As@XRz+%Mo{n>lMRDLYNdd)9tm`$moRqHM`<*&c=qt233Y@003fIH3 zx$T|7>^Pf+v2|(}m(_6)+gJ{5H(!}ouayi!#3gKJOh(X8A#N@xvyv=bp@8+cq(GlH z?-Y4$G<~Af-cZ%HtyF4@^gLj|8uEsmUD?^Rd{Z zzRZLgowE+js)W_!scMH*kE(4eR@)cYaw()&ivFRiXNrIi&s> zTEKL{Ny4U?1^*?1|Cls8e@~P82y78apVK%96QuF)E^9gq&dlOpU|?qgsdNeOOGu|V zmj%Lvk~v77)17JR%mgY-4FE@ih}3U+^vBHE*-M!D3H*a4BJ&>u2qb?eh3&8s?8WKT z&I}2{WG+Nx@^obQ@p1(nH}h#G481a6WT?=yO*HDv$u3NVNQ+FJHRu10{iWq+&zLW4 ia3_OXc?`Vh=M4TfappopBn72qz&A+(0DgxGe*G7tQN^tQ diff --git a/datasets/common_voice/dummy/lt/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/lt/6.1.0/dummy_data.zip deleted file mode 100644 index e12366b49dce2dba4ad322a71fd20705ea43d3c7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3824 zcmeH}c|6o<9LL8s#vn(8RBqNXZbFeG4CT(QQsW%Vjv361k*Ezha%70|5`z$Otul&j zPwsK8`&Kf)S-FUN4R?F#{?hJbpndN}+d?X;-E z0s^VA0*iS70q2gwsu1wRRiRSIhHeX&48IlXVTu8^NsTAU=cACmvD^K{!F?2TaAeA> zBZ+=u)o&$Tu*n5KbZLHo-ZR6cw43!$Wz9qO3ogTJMFZL}@4U_}TfIyf86hcq{(db% z^ZkVGvekghpQ9sz(j&(MQ_^=GRCZdb%jZLg$!O1-9CZyXSB`Cu?9Wo?wHVc((6@JK zs&eSlP>G_9p!-k=?$or>I3sUayp=P!mc{AvHN*%afPt20H_V+(d~4pcM(-~$g$vDuK^(7sn&o_ee0}i`gm;|4-Uz~hyFE{R z;Qnw0$eH-2j~WdZ)Ett-l*x8FkNh9Yde8VL_wvoiC@&Sn@J`H`{mjD|Yc_ziCY)1T zaNPDmkKA`VD=qqbUA|(P$o$1m;NX zk{t)#ZnMepI*$lOTzn8ZnkCQ|73zDH*nr-Z>1Qu$N6po1C;nssPH-87>Z?hyfa8;u zCWPiiYu?%Hv<$mf^K969&r;>w@$`=L3)wTuZ8Fz!n zm%#(CcO~x1VZSVCNbynX?ypQds1Qpj)q-jHskSO)iTL)e zEGj(POPAe}?BVdr2W9{QWd8sxE#_q>y8EGkxq?o>27&IC0drwH;Eyw;g&V0DN-4}x3auJh*=GuAv$6T10bNoLm#dMGg~xA? zFfos2sW+N_Ti@`+NQwl;{2{g*oeqgj?cHULOv59VDsLG) U&w6~g(cebq7hU;mb z6*#b3X9F0jh&18UId4)v5|^WL{p66sj~XEmb^wJYyjr?p>l*qX#49;5T9 zEhi+jlCRGfs%RHU^v|wb?7SU(OOrTp$@4-%NubiNxH=ZP+O*-rsJ1HeY7>xn!dIa; zn?3&v%`i=*B7SL?1xi}#zfF06f-e%crP8E_w33eM%!mHyEcPHt?^=|W8?PF%E25HT zsCNg+#Gz>jAGZX(GM3JY%h=ofpl693&vIEGZ2MDDMSc~BNQ{?j>!EVz*v|g(pBodr zTAD}A>uZRX9z_1i(+`WwY!Tvl;HiRYa!4+$ts_!G-t$y=i*>to%F!myva)d#OLT*# zA#a4`k*wDlMp&WXW5>3c@?~f{Rzz=G(h0RQz_VT{OqHa%DZkOWd`hDoVsqR^@xTjk zjAyCLbdW-tU18PSSV(pgD_2p>>1(YP120FA2TUV$L03aWu9o`rW%~CdjxsTlUUZpjg|90CYAbBFM8C*Jr6ykmf2*> z_qJkB-Ku4v&vpy*7e&HAHpGE!=rCXQDD1{Dvsv{&x{m>FUQpLf{U~{e^3h=qed9+Z z#+Fq|2XF8k4J29Orc@>nPF;x5>GQHif?+wf0{uG8MmoL{Ueg@qd+s5xT<8u^$UWXa zEVM(GyPQRdWja*Bve~rJkTQm-W5<+f?1xXXa+~ZUnVxm?Mq7mUb+&h==-p53>j=!v z$McJ3C%Ost=0uYvZ^ebec5`Y*c%MeudDmCA3OsJRQ(xz3#oy@M3Vv8XF1r!2-4&Y3h({g}FxN%(m!Ch8nXR9) zbD>a6?pNQ6foxXA@}wo3;T%InLex&FZWoT7cMX-wecF=AmAePVLkh*j$L`)6ru#P9 z>|r#2+RP{6OA1i_>)j764;ySE3+o=X&D^Hw7d}5|(wO$S&UcFxMFO! zKH$Ij(5*=uT_~M2X`II-Y1NYgFq$X*?y>0IvEGrc!GLA~<%hEaF9w}9Jr)3C@TUNE z)9lcx>-`oZH52GkCaGWD*VfF9W`mKrotZUPf{_2h(Kw zHN5q4X&txz5@LkD1~Uz1xRRKot{=9H)RPdVsT)S`zp>X^e&dW;V*`~8l=2D})9i02 V&Sq#1KtvD-3jDD^_4uvs;vYOgF*g7J diff --git a/datasets/common_voice/dummy/lv/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/lv/6.1.0/dummy_data.zip deleted file mode 100644 index 2ba4a4df6f13aff548d30afe058271c5626f4050..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3862 zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3R5ltC=u zUtovJ`657Hb1+<8loX^b5uPW`$iN`N#K2%cyagpiiJ5tNCB8T7R)xErGEDN zym+B!W=kE+GW3^}HQrRV%)h+UWR1#{?|+OB^`sn=DNuhv2Q8YZReT9>Zpo+&*%ZT?orq~pn zRWW9BU%a=oPqWP_>$tRRw@O~kyvtH~Eaq0F zy?^isW8<7= zYs9r`mXw_n{-4S}p8s!o+}l3UB4(*mT;)9f#E*%C(hnj6j?dr=;sHhg8!!U&h>w7j z)G~s>KfyNdumO)-`Zp0P#|8UuT++2lG0*5M3b`>YQCn|BaJj7X zYQdJtN+FA)Uik*ZZ3)x9%Mvo__Tpg1lOD%AwAxHFpB@XlRTOnPS8qGB$Abyy3(s!+ zmsC}8c*o)WQ>Wd(w48gfu2H@2_uAi=A95qY+;d)1ka~J+jx8|EZve3g@nK$;n3D-C z9a2-^ArDIQSQ38f1Y5r&4m_=|ABtEqtg+cI7d~Z6$QJGXw>lFydAdw5!^~~P9;?*o5say6ZX|-)|v)SKfx>M zU9&7Vd9A#AyCcdgLHWlOlVb^*Gq}WD9tmcZw@DaSM4o^C_4ggmHx}{z^MnFi>Mimw z+WIY9eUI<9zb%iy4e#&g$IIR~F<(#Kw`+D+gy&M_o{MWdA1kful$si7DgHG?LMK`B ztg^<6j!F7TcAGA69P(va<@)f*i=|%#lU?u64`MLN@b6R(Ji$0ifL-vw5wC>}f1EiO zRF`?!bMV{5=RVAof5%q)US_`h^u);a+Nz6xKTLm&6jh+|2|23tiI1w1)Z!9&Ea51g zMq`O;vE}Mi&jf&@<&D1jYm>TzB zf=qOS*@;a-zcp4Cw@&%C*U0SBGXJCh8HH;4fBPNZ`j#av%D-Tpt9#gvGjGk-R)TlB5r}=yDfD|ueQd?wnxRUSYefxZ4;!VsVH^v(9zo&GgPKJKU`xMSt8+L zB02epve&j!j%NbJCyvSmO+MjzaLPn}!Ol4r@)vffxou5~O>SB-eZ`|#A(KeM6xMld ziB%8E*5vNrI^$J(`lY%Vx_ip2{_lxx4|m8&uq{1(D|Ge4M25d?|Ap`Gd@}Dx{;rFg zq)W~3uRFbNfBo6%={D!L=kHqcd;gj`a1#SjMZ9aBU&;*(*?&N6MtmN~%)>0OaAbwE z=dFW|81S$?h?;Xq{YIy}=Nsn>7t>R`&q;lnb)-ddSMCl5^##wkrrfN4n5tpPF!jrM z52@Nj51XfYf;uzKZKf?Iq{88A8(p&RrqN6kp@m*OQYF5b^pmpPx@pOKGf-GPc9L#ZygsCX%o2R3bYK= za>eQ`M4muxw?ZrfW)FrXjVwS4DV%V63uqZqD;8)g&e%cNirQF1wzU=5d?V6Uc+-ur zotTXkWIKNXJ0nEeiB=K-ZN}1U3GfD`MTBor>t1A=m#`9TGomIY>|IcfL#;}XE%jn2 z+ES#7g*aPLi&$h^S8xz*D@K8f&u&PT$1GzYb^}u~Fexho6_FOf@Dc}?)vRnFqq%@E LkA;C@64(_0QP4k+ diff --git a/datasets/common_voice/dummy/mn/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/mn/6.1.0/dummy_data.zip deleted file mode 100644 index 0c27e7ab976464a08a66e48f9426be54a08f028d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4768 zcmeH~XIN9&7KQ@>R3Ma4WDHUR66u5@(uW$9DuPm^B?6L!mVg3Mq>0i6G)N0o5D=vo z0YN~DD4>KU4gz|IVxdSI^d{px1B}C%`Fo$6^JJfM{;c=xy}rHH{!C8*=@Ik`@|KJf^Z(J|O>0M=~vy*{6lJnQ{{mpM-6 zwq3DMA`JjAKnnoKP!GU)JGx<|a6Z1jFH};dy2E;^`d=67Y5#rfm<$`>IbnwOw5CRS8!~b%CKMk%5onp#o1%LGUj#qwE%+Z6!jVz` z$x;~7nh_&MEKO2B_ETj~m2uBF)H*FsWi&jn@fn7u(*(AX3X`M7nwn?lIyko$zg6ti(k0#nvLT^oU#TYd>dk zlo_ChTbIh)#aWugR%2@V7l08ezdiAa_%LbE)Uba=AxD${a`b#^GVEBw;L&c)XOl~{ zq(>a-zeM|ITutjWu~^pJ8VcU%FLyQk^s4>-qypEV!JSSGbTE*j`8H+&isJ6dnC zamWj%6y+gui+i;H{0;m>qqLB{p2~c^43Gik@atYW2OG zzKQjjRPvI>{-*$I!9=XVYd3@z+tKn*jr!t)+QzC(!)fX6iuE^xViZCjj(M!k^36}M zM#cN^fFc>{ANjJ%0@T-J;(X8=S{0Sz2Iu{vP2dgcPXmv0gzOu}1H;Y(FU`Amb|4*_ z$2m(PjclKNW{TMg2)?pGvz=Q+rjz+9*4VL?oZDn_Zsn=Zt*;~6ja&~=$Uieo+2((< z)lrE?l&u~?>V@G#gxJ377s~3T6Wv>q!a6Y32q2JtV7}qK)ZN;ugDg&)%og6nxHrht z`-RrzKd2>S4Fq;=8`PiSd}zBYgBjSOOA|)SMj>g zWV(pLf=;H}D=4}pdsTTjLBQ2zL?kw(U;d0rD_aifh3Ls%?)EasP^JL~;R$c8NZd)f z$oqXwT65O7F;gWp0Bt1P6@ZnK7kzvy7N;^77HgfaOOouE>I0k92jvJ}@iWcxJ7}3= zKT|XxwK?g3=Amkns%2X-@64sL?lN=~gz%)~XmGc64CE{cWz#j?#M}Kjc};zsg|xUO zaMtT`G1ON@5#3|P(77o8Hs7ZG{S_XVP+aTsJEG$pV&qmzQLB8Y3sf7#F0r1SJLL;|0oOGsyYBHHG67Sr{rgN}| z+b);7ye_B~b?12Cf&tZh4TW>3|8~TWcGZRrP1iRI>~~x{o3b^Q;wSXp7%E;j6I3e> znY0+15~rJ})}Xr`a`zm6_uCb%@rJ~YDo5FaP0%l*J2bZSLTL*-^&(BZUT`QM+@Dg} zqbvMhD*szjiTKsktA|`#vj5!)!UN}u^8S%lFiCb7AhK+oSt+L^0sy%lZpO~bzw$J? z-$36WRf;Zmxdp_ix0f23Pi?T6`xE;PX&>NxB=fcTJi}Agh<4SLHXL79H;6RNtvjnN z#BVv+Fq42q*iNk8eDM@a$X73by!pAs@WB~|8iQelIn#5YEC`jmTBaVV@II&%T26}5 zG3T23ag=c3s6kiqgP~BPv)1$iTsnpotK%*%5ljf4o3;sr`wqzk%#Eg+6;%$*HC1p$4`nO>+w16Enxk8hxIuCVulB&;RR=DS1)KS^#^Vaqn@FlzpJorC;)W6m{+D78(x*X6!zbY2DBCKc2wL&biFZT>zyoU=GtmE%eS}FRi zNk;F>&4R-EOtwgu+D`jrzFzy?^@WKMeVEdZipZi-q`tnmVZZN4dz2Y`A_7a6jxK2# zc9AysScRqXwU)O6Or|w-Hi+stkT4B+hOJJMv*#l~UmZx`$gdatFi+oQ-#iPPcD9vT zEGI~UZK9t3)Q~O2Dq9ghH&Xag)V{oOLzNeG%SG{-cK17p#|Aghb=MpTFoh3As#=f= z&_vFaK71?v&HTlYh`i(zqHc#{Ou!|B);v>}HM6dNW-x=C1mw>lt*%%Jo}c-d#q6M9 z4xF3Lw_AYoL%(v7f{vM{wOtA5OcGv*i`(jQ^PzhUS1%2*!Y5-D4vo7tTb6t&Zt!<~ zt$a9mEZ(0ag-|1@jQTZxDN>Q+E5a*`tRk!CHXeW7DtJ+C_i00OG1@x5lK)PL^r<&> z@8_62G{zL})nGP5X>Oo>s*k;vxN5BdcG_w&W>Xa_D!*|HDXSx3Ee6w@vhxykHn8O8 zl7Av$#Ne!XFjx2SG~A2!NDFYrQm9jsmnoldQ}m%i+15uW$z`6<7~%el@PEB4BK^dRjs9mpwZ(&%X2}*Wij&<5|T4dM`KS`$UnVoIwZnLtJ%1XBAR8s%6 zERH!ZxL$E5kJX9fWx%EkgSB2?0$8;$s6ETY zrn$iG280h-89xZ5&O3nY@9afVB?kn35-K6hz-g9d@U>8krwqIR1}fNpFV{s@nAKMt zHc%I(6r^I_3u6X(u=Z`1Fce1^IqE=vg7i=K;;yzxms%97!8=it+x6lMHN`4YXdTQE zdVW=VAMCJ;@}cXgQLx5;yf_M#y_8oR8=8%lQLUZr|1B}T`)!K$0ipgNT8nP&D^1kE zx!>+Rz(PF!)vGAD4el?KBGsT=guHzKq*y%O;zx?mX;L(4l}jCzyZlN6l0^04H&Q(1 zQgFGI^nM6uK5|i_&i^qx42zmBYH23awbh3%C>B}f55|a}v)W{#@n9D7UVoEkS*G2{ z+rF5Gaj!8QC>GzyY+8*$`p*p8@Uk}f?v2la5Y848R#lGEr;NsH;`=yTW<_&ddn1-G zpg9Q8T$Ojt-Q0t?6lZ@&@woj~_3EM~9YYd$?b|sBy@FJ6x#Q8Edyu7nHXO%6aaH`; z{i^z~jV{e;JxdcySB73b+7QG*K(rZTqhLwP>KM*}S` zN}V5;Bx#c%d$cIOGbSGf(2w0}G*4p(i<;r^Pb#JE7J2Oy&%F?S@kV(5uQQ8}s9u9Z zbZJj9-yJoiif*a$aOPV`W6kqwz7@R>q`BA_?|ih+56~M4=)IeFy@Q;6yntcg?)F7> z&L02HJ||KwNwOGdR7Vr)tWSBNyqAKRuA!B)wSs%|zqh)YT1EbnBR%rqR+a-9e|P2s zlJB$v!;*R0>o)Sa?;DumVX>)GzZjILo*x*Sn=O5mmy>r}rNS_aN|-R!X-+CvHLA8t zj)K6OXqzit{h}0U5@j%oVcPBgw5+?b!C6Y3ie>UgH)a&$&sGlFbV`_;D1oicZOcHx zqM$O9PR2HrtLCCf&u&V@uQ$4yl0~#H=^NEbXl_+|QWh^?7ZPIeW^7`z%prX7xkpuz ztJ7V}zMq}jV)ID#k7VD4>NIAi*T1=k*b+!=9H~92ubdq#Q%s&3NC@5IH-`JXqBZpQiyY4T*g(HQeHtSu z8j802DK=E}85lk6-BdNCl0s+A>&Ptm)W%4A6x)#7k!qdUrmKeBb*viH`oheOl5 zDjDLX4avRQ1~(U3mr8qRBU2^N>G8MoIV{&|pE6`aR1yx3mDIwd7ZH34cGFSY zX#Mg+^GuB+GY6YLm#yqj@RFL_xS!!tc_|yq)AXA^oUA?N}JJ0N!RILnL{N1>hg~$pCbWr zF9sWrcU2r~e7ex5bR^)&i{zfvoG0d`Ta=O+zmF!3uRlqx^NABqnUQA=bYPZBtXBeNqa*fP~^NXbk3O>gtQ@ILs6Z=f9y%z{Ti=cm>=S! zPp&&``=MYb4t3PF&@C3LPH2NIiX?O{39>zsRVwBe+dJkweA6wOPvn;9>m`XU1b06T zl+rS2VuNP|IL<9cVa^a;9fdV{ckZ)qUcTH}kYnCAp_z{#DX>=dhJEIXg2wi99=VdZ z;BpfG8wC|WD+L$!n{XHWi&Xw&(zls>g*3MR8jqw;2_1lO68d+SwUQON^Z7Ci=q}K8 z3K;lVrPG?r0${7sIzV01ofRr~LS0P_28J<@)GvAU$INe2+G^%&;7=Ei%&!IsK>ki% zvo`1K#g)}^Cy3SL20@<5D|zC_%jI+&_vyA8nl8jMbk$kM87MDa;B0 diff --git a/datasets/common_voice/dummy/nl/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/nl/6.1.0/dummy_data.zip deleted file mode 100644 index 38399d0736bac119321a57456f4d13df50fc280d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4075 zcmeH~c~nw)8^G1{i) zHfgRAZsk(%xRa%mTgxP=iFLxMnNiD3=aQY&%8CAZ|9RnXxtw#)^SQt8_xC*a3(*lO ztpJE$PYU=fc{vInFT0cv{>x+Vc@Fe zf$nTSw{MKE#y}wepeY3aD9HhA7MaFa$KDt6PN?Jz3*X69i;snBzvIkk&{L8TV9jN| z)ryKr)XmZy*??&ZATGhbKARLaiP^5)K{*h~%^XT_7!JF=>BQ1I*Gz4P-~F6!lU|KIfKTkG$3eIays^LjKRn zvB26>T&fYWyg2w2-$A`{v5_aXphdK-+2Gub>KHpc?sx$`)&jVTb|MW%x;)EwjV0+zflwZH59wOc=b7H3j8j{6g z%YtIBlU(coO2`+bP6DMy$sg$*T^k;qv~>D>AzMww8uQdNgp<48&BZ6-zB7}kaa>M0 z&Si*eXMXHRH$mvU(%Y^*+oJMu=hjWI%H3~~G!3jH<(~SJhv}DjPBxQ0H6gEu8{iy4 zOS0Ss1FHb*fh_<$Z8gc7ceC-ruT<~x4Xah4o7MbTFr*dANYP2RW-PRv)Q3LZWW_bj zIuMzisCP3~i5lwUkqEODq7Ig_cJ_|r4zyyrFoz)xX9QeE{v#X9bFkFQahC&e*ooBV z6BVr`^`yayJ8oXbx;74+l%lvNcy-lN_F3-X+jc8IEA8Bv^t7wIJ*>bB*BU$?&a`

zyP-+v&r4AwQp@9Q{67WHA@xK1d-liw<>AGm8 zAMUW!w?$>m2&p%EyUH%EVIg;ZhYTFTCZ_H>s&mq)1#OlD*uOCME9?^HX#^j8ZFG7& z(N;Bcg75Ey!)yDz^-=Ip^>}t91aiD^8!xJZ>g>%d3{N&}$&GI%G@^wSNoRnhG21f4kGVp+kBG;f&1s$P* zj^a)GwR;~j2F)s0dxu@m7i?oSnfH$pYJM9}ZYGtMTW_xDAFE{^sX&hp)+J0^71#aL zk@_@o<;lBlR&P1qji^OC0tHoYt-$oFd~$U%Cw#AhH`l$Rj8`IU%dg%VukGE_ z_{;NcoZFoIo=7+S8O?rz&dk6J?x65ub!3eR$IzFN&{^TX%;G7>Gnh5?Q4h^G)>v|5 z*_3_k51uYEum11p|HxAdQ|t*U*sOYA_Yh*TsT9^1P3?4KJVYr09px;AOua<>+p1@{ zOIk#@C^(1@eQ+1TDp<8z^Q2KLw_^)j9}*5XzFP4Y=45r}L9DF{@5;G1qCXF89z$`> z@4MKgoU=ba!Z==CZ^CclQD?Bl<`GMDTRb#>jJRCV5bf$(E2H>)z_VNRkncdWdo;|d ztTBxvb=-Owal|Ye{`f}R*_hJCZmPEXnmFjCp1|f7PidBeN`r=Q%htxQ-Jv^sMfXN; zh0*wpS3_(y8GhKsF9F z;3@BC6gQ_#- z@=`*GJCu5oxn2`qw2|=T%T2CjdBn)F=?`#psJ^}w4VAC-BQlpwnA{qpm9FSeuoXe( zq*C)VJHGR$@z%)cH~Qj**L<}~HkI~XSD_(jsyPhh`0>=r40B_mnrI3;uqCcJ{m=kk zJK{o|TK8MM4F9;|9d)vG+uN=*ZiY?cX{UWMHyWJ@Xts?x5q#Xo zQ0t|^<2;f8pGVf$-JNHf0lEKodcXh68a!?tnkh8Qw)eEo#Xsq}@y4=aHipdRUa>jz zb1znMbKYA+V=fGSVt+nK$PlSj(8Yqb)_eYPKx8F8T@brvW@GnP6odnF1>aiPEG+mh z5&X-fPXqWINo18HA?aNl2Vvqk{@G>C<->(xd=>^a5s(`t4SvnjY0+hYuzArOq%P{t z9CcwTolgx0yH`T$hm87V=BM#&KJy{?4@*MkN3#MXeZ!+dhNjKt)* zH1Xx-iaT!M18zPvPEKOzyvLnn)P+NMK6U33iK(BApHAw-su7|Mv!Fb-oG>u^Sl>|@PRLK2#kHA<91 zb}eKVsneU2PGl)thwzT-7n0MQPw$s^<~PsGe7XM5{rvCiy06O&#Ry^tR$s3RJ<*?k ze7$f0!T^rw?oP79VSTaiBbICcW7cl-ldB)%?;HRFXq*uMKohi!*_J_|8-qCb$vb#> zdlG%*RTYr(iU>u7JW^2}iKH`n?q&N8x44JS=pNo*0Nb;*wz_>@+NxQB-dub1jiLg3 zJq7@9*aiRy`T@S)SUf?&*T?VkgG$cQI=PUl_5FitODr>OK=4F5P4fl;;leEm<5#1F zpW%0_7@cFryahNZNnH7i?hzeK3>>9rt?n4)r<^^P-c++VNq)5A^d{o^V7Vs_J1A%! z*=*I9GOZtAUP(5x4YC$NDb3h z@8SBp^4A$~%WUGZgNZt`P1LlnV%{W{XvK z1ZdN2`Os`B&~FRp==XD1)ihVU5j0mMAp8@gWv*Lq85P~rctq&0$EVW-z-AK24r5M* zf~QsY3qP9QE*rdo+&$2u)!&jj>2$StGOCK`oEL>E2Zm3Lb@mq3aEIB6^QSIh3-^c~ z3Y`2TjCYlY&Z`^EkFrVOewQwN_GAO>e7IbLkbtR@?pdBlygj052y8ApCWj5Li(`L# zyWY5ThqS!VYZP2rRZpPHa}Z;Wydq0bPYG3i>`+2Imy6@_F%`6Cdu0eK>gF5hAYwle zT!TbLRHTyaqu=VQImk3c@!H*N=B-740?DKA9r8L~F%Ki$52Sv~Y#YXiSite8SXgXY z`7aB(dcbPl^z3b#lyvW-NXA7wx>`cg?i^3OAMp^b~7)<)1AI*~j&66_yp8T+Gcv|^!#C@~1 znpHo@EcZlYLJE>6!gUfh2Wp*rn)Sy{gXm_ZVIJ56v2kYS94xB9tV6u4_@*u|D->k) z-W(C^C&g}yArGr{EsA=mRqZcPwdeK?#1f)9Gz&>2Om)VY09PEjIykWFSlhArT(!uw zsBSl>yyAr7SwgEEzsw%LnuSqWU2&n?rD3nqTzY>|!Ny4THD`E7I|s>g*WID)Fy`9D zusioRp>R}wat@|k#T0Y=r9e5WIKNdBNx|^wp8bwua$SG+Ox=6d7CcPASWUM*JS7GJ zt+@FE-OutX?W&hB6Bhext0%^7JZeuZmc~s)lK&d7xiqQpYH7B+T_AW@d+f3|a%S%R zo%hWdUL!(2`XzcXhD4}>Eig(^X4r|v?GW?+fQGWbAp1nZPg=Bxg=NkP12WKamPvK7 zl`h2`60&g>R46y7b0O}upd$9Hq1ynHV8q^}k9G?hqaD{7m{$KC^Uaxv=8OfKH!tejx!RX=Z z?CAY7zimm6j2pChBm0LfJ_3?T=_bh47SQG53M|U#YCHzN|-q^ zVy=-+C61w6U$3nAm?lr_`|qnzb0Kn$KI#yk_G(a0}7fUPmz60WQKcqU*!$1%(fybwO`pjB^n~=@J@zgVMHM5%@-n1`uCn zv*$rCX*ah!jWM43ihPb4dn_s8_=6UviY`b$tBJVUmq=j>_6y2jn#k`JQ;U> zCd*t*p@L1l?p>INeV7=$+wVT96-avb93?fxPZ4R&gm2FoeH85vnX<~6Ico57f7qq4 zfWaBtzTp~;d4l=C2i7jgq-pit_Y?ax`iiwlY8+&w0SK$Si?NuSr%f*4nm~4NtOCA6 zWzpkz4baNOb9c<-w%j)b0u?7B-znH$SU%ELARWZ_^w5EseYLibi#@+p2r{Rgxg{I6 z?Zte~P0}B}OE;H+h3^j1zCWj%$TYjlG`p(wCoi6`xkPV~!);0Gri@5lZC9Tdbs)P; zGoj_GAn5`EEe(+hDL1>K4L(8%PjtqhpYn8SXL#-uV}i5oVA4!Jzf!w0GXFZevjN{L zgLu`ggm@n#YRuJc(5tW33(2k;@PvP;VGh!?4B)(d_63Ap58X zIK5cuQhVT1@es)qnS<6RS>S9{+Xm<>Ti4q?CAu3%pCh!>M&rt^bjT z)wcleJ1nR)Kf~&M`o8Z-j*6(8ka#G8b6HLYG!L5GUW>u)5B8ng+coIOhMHm**OH_( zjw*&c71d_C{*3xd3hBuRH1?A}7Meo|^66W6-8O~@N$i4_YE4Oxus^UWhXo;jz}EuQs{s8+{e}ZGUx*_t@C!t_t5Y5-@BNW!!4A5ctaH0mgM!e{PsFn7+&z z{~{n1SPQtMqrWlWKg=9ICvBcM)=1+zm+2&Zoc%d5J@p2y9MDPq)>HqSxw#`;&pZaAoB3Vwp^<;k8=9e3 z_u|@WH})y($y&^Glh?YIpD%Yc<2Ih2>!Bj7bVJvjrSwrZR`vDNMK-#rn`-<&vA?AJ m<{I;bP20(|t^AgQZua*TXDc)l%^U!*)7}Ky?!j=b{`)WG+8R&* diff --git a/datasets/common_voice/dummy/pa-IN/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/pa-IN/6.1.0/dummy_data.zip deleted file mode 100644 index de46a6bdfe71da091c227e57ad90a818f1b58c42..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4416 zcmeH~c{r4P7sv0((%3?l2#;;7p_#I#(kL1evW=|85XL$gOG!e{ScYksY^5fVq4asVLaNhSbbeo4QWEE@#E7{n<+*~!Nj z>yJ}bSAi*Sfo_2+!?q~HU@MHCk%?w_M8y?G$2uw-7%wom({?VG0Y5!)Fz`47Z3Y3U z8SVf90BQ{YY+V(@&({&-t>T9Z`1ZQUl${PUNjrajT`h6i#!N7t2oD^=4k!wlxHb(| zwR2P}sdvnt*xkr#lLkgS@XDb( z(FW~^(w8+L2z$N0qcBw(FH!ukHSCRi!u?RWv*6HUv+& z2a^h~CI7*Xs)0z^yaqV5x6@xcgj#47fIS*)#(r?h5O>5PkX$#lKC)S}qHanxTJr9o z;yRDKmrfs}2DKx2DZ^N!<{%N1&wAIRYgzb|9INbGX9+o`q%qG~@NC8RW}YTC1VxjDtjdkIfEQubQ8OkrNZx6d}*d!q;4t*_2?aS5p`bG#<8L zPtQ^eMw>a)5P790qC#WmhRHBv@b&21+qa%q(1Rp~3cKhEVifRApT**k56n22z3i!j zfRXkZMk~cU=J6`0P*)r+3@P==Q<6O-#W#DdIl7RT zTe^B0t5Pq((Gx>&ONX6*XgFVIzo|8^cq4Cq7!SUyRpA!UFsHNEp;kbD)c2SW#wBQv ztfR;i$c6?f$#RJi7hX6d>pm;J*s{vx5K5Qu_H99ypHAF+PbitK%Zn157>F?!EYe%= zT3+|*dak)?1gSS!FbR@Wr2GyJNNLvv?Xuz%FnLiMre#Lu37@!f>1$pKLT+_(N3lnw-vXOWyVAmg}5a`Zrz&g`yQ$ ze2>5$C9teZZGr9x#1}oO`G@*Sut$h51LmWyEghu&?!y)8K;@3sf4UCb{Mu1}v9ho5 zV}HGPW#5tMY*KQC{fF0M1+!(O4Z-go)bF2oPQ1XKe=*2ImqX#!So#n|ODwmM^@@2m zl;sMF6*@vhX&Hdq(+l0)A*uVwTJ`+bk;0HO1=ItS zR@1}d1)OLH2X*uV!)MiykNb=KpW~TxZ@rj&D|$ohbbCP#6gN?CO%vBky+z!xce;c3 z&ZOZ8G(9Eigi|BuBOOi(h_Q;7`e4hdeBgH&=l8<`%e347Ti}1d0uo%>hDdrVa9Z_| z>*MEk*!NE!Xo+h<*6!lRcMY45uq52yPi~aH^;+Og%Y7p*Rh^>C-j9tqh6+XCG{o6Z zT>&M(%mQEWIs+TKvZS!V^QEpSS{KhISp`~StU3B*Qjec1hL^bAypmI^b+g6)L;@79 zUehxa8wT6im_rOxn6W+6)XR6fK^Bt_lqzH(p4%x{PYF2@Q<|Z%&=1lS8x}qCzBn6< z2V6itAvP>B%}XV**dTtBGKzB>392D)-mKODVP@0CUO2^t=fy}8xLyC{3Lv(Cndk0>dE!E$Gffim_Up!*J%r06A_ukp$_#bW(yUM&RCYvGK&Gnc*s~ zkQz^YLG1#{XHY500sNl2Ha7Bp|IJUQ>bMyz_a;hv@HAaEUAk;qtEya#_pd#CnF97V zF(M?uczogutqb2ajuw=PQn^das>8E~d%siEBD_Db zHPu=OeEGopJhNBrE-mH5zV?qL#liGPv6*>DZx5 z#aPeg^s@0Me3nip0JQCH=QbEf!oA7PIj@+8cHeTw9#u5pJ>5~}F*GVUaC{1jkPi~%W9`A~|tU7wA^}cJk4yj`lEk#k0Yc)`8 zV?n-{qjy0z!brfM4Kxi~;{;fB7@%*X?DI zGzrgKA?d%n44pT>%ltO8#rc?VlUd-=%N589TTB02@~W22Y=O6Auc3pN)o2mSIG8U5 z-=MdOm4knb>3_!lI#Djg`Y^8s`%`bC!+$edn(UkR$obOr$Y2bWk(?f?J) diff --git a/datasets/common_voice/dummy/pl/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/pl/6.1.0/dummy_data.zip deleted file mode 100644 index 44dc0a8e87e5e2530d3083554a752a35e7f71527..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4103 zcmeH~dpy(oAIHCJxVg#q?& zDAx<~>EX*w1b_i96e7{@xC@qy-D+bk1c1_1EuGjG<&+4(2Ob9j0Qep2X0d7!HSQuf zDgsCH@uK)5wre60+FIIL2&6UwiR7{R?-|Zqueg!N>RyD;0O&P?)m~qkwd$XM*592m z<63`pg82Y|pC16Ucn6Suu5KBR#pEi|}qu<*=+}azvXF`laW2;eF#p`9)ImUW5DoF`B(wITV zhD3KC&rX*NICBh6lTxzWj!h9Vu5$=_JcFUoI_8t;w_sHm#OShy zgCxfLE#6ArD$m2K=U*fgNNLpIC3Z zL?>Ivuy?&lC!Ub+abkqLBeM!+DNXKD$(flMe5vGaH6K(a9t$lF#&(I@0V48k9x-x0 zUjEe;+LghGZF41WiRjUAJ=2^K^P}qq7yr)Do`{%VEM$<_!?N7%$$HW?30Ak4SeGX6 z-FCrKf7t7|UULvqv^xDyY3!Lz!J2j5(zh!#=&2r^T|);PHqC|WgGlScB%2VGYMIi? zbUxaapmYMd_i3Dz`MiO0^5W?y)Yc~OBXQp)MXU+FB5ecsT8IiGbFFaH|6^-;kMZh)guFW9V253 z!CPL&PCwaGK8mwzJiT*5%J5($bqX3;z1HgBZF}EemuT6uLw+GsV#9;w-ahu410uZ= zhw!$I_`BBXX*1Y^egO)9D=+Xb_uVwj8Dn)RZ`z90^M2N)!+YPUSb{st4)88tyU(8T z?@T&Ac8{k5O7!elv&K68y$|~rj%Yd#%2qbSS+DCctj*oRxIyaoy=gu&b4*S_0F$`( z=N{FWGX2Tj`PpgDZgyH4he^6B4czdW~ zI6bb!ZjFEC>pG|K%&Id_jRKCF3FQ3ZVKd^LVmQi7I9%_w57!*-ayt6FWzE0`?{W&c zqoize$@CS6l93PYqf2E43r&t^g{_nE7J^!Jf}7RaLY>UtMw^)36Bf_`P_BeUtAM@x zBBZ0X4U)aX!`qA>JL1%7hDngzjstYK(Kya0E@KBdQ4Q_}Q&H;KFV=j-S;lNOhF=k8 zKXYK#G~ns`;CoY@hzTY&J7(Ck^eA&|j@qkuc^>)^x-1~urZ@%M%DaGMyf68ycB{D5 z|F!!cYp1NYeb)Jl>HUi%hD3J5`}{~eVsj}GhLjNKOU!ZZ4tdz0Y%bdfQyi|Em}5e^ z)1dciEyNyMpEJ{X9a(F3ZqB{Y1MX-lHBnrjB@?P2M=(yU>C&olZWC5TO4yGo2=@k11sN}+S4}}$rNR+ z$(OMq_d)gx2}QYqCERr_j6SB|`jpmUaz@YpK?hU>2{57sHAZ)h6Muxusnn6F#ZuKV zY_fPGe9Wq0dSduVyRl*dX(OxvbbOGxqoMp5sRj#U7RQzixZl^ibHrgQbpA!f#I8od zrMval4#_6{Hi|*@M>iKYw7RCtop39b_Et~C70}OXCs^#Xgeqrw7F(J;AVx|`Sv?be z^-$90r@rJVNL`z~V=RrIdeuKWB~aO?f2rt|p-Z~=ser3w2Sc&)U<2qC^TPTWrcR(8 zzH%ZW8`^$)96i@QCCI%Gd!k#4y;)jHu;RjY-uI!q=SnMGWjhk%(D718(R1x-Rx`jY zjdGXzBTf}-F-02$x23+*Y`MPqj}!+BSQuhg?E^1a8iBCXQdl#hPq|uxb1iORnjSqD zcQ@)@K}}&jT&AktU@4%*^ttJ;w5?Gm)*sSEB*5itvM1sW*ZJomu5|J{7>i@#FK6^$ zPLA9VDJTO@GH<-v5Oe>7rmj)sjSzZMD9&tyQG?iipf~zF_|Lh)vR2z^5NqZfk+A@kATVZR z{!%?UHkMDlI@xtNP$By@rK1!!Y4ZAw{TvD^8}z2U{0&jAdCb)>+va_EFh`ta8!cxh z*5zH`FY>yRxI-aZ;}z+dk6#hAS_qf@!j}NXQS3*qNDlcN$^S(`HDI~mBI-Zf1^*$Q z|CqEgoG+8cQ5igvK1FpFj2+d#yR7Bt$Q{%_!&n;;$UMW2?;L$vby+MJM{H+NS5;@3 z${koasXi*VRV<{@4A!aPOGM1qW0c`&Oi6B`#i7TGtXb&_w)OHzTfZfGj^7I{K5d| z8Y&DzFTH$zhyb#HHwljqclY)ndT9RSC)~Xb?j}mlBndg*-f$j{G`N&B@k;8i;7j<98CW+1FW? zkiwhC5*y$?@nBxw6g{+9N5X|`_n7n)sR-NF6a$%Al$m?3ufUwX|ADrsnwHyc%J z&=EIek{KON%1r@hyEL!)30|7Dn2LPVTI2A(L`-GAk5!N)X3>VoI&Pp=Q)j7scKxP6 zN4rk3AA4Th(en_y+h9CUrT)abCK^JnNhF*#C)=g$^hR;-^oOZA zsh-`O>RC?zW;(%;kDdOH?4`kd`n|ECKTdu#zFJGvv~^qp3<16K?Ex#i0tuSfC-Pvl z1C+NF7T1Qk#6ZcVcow^;lj`A?E-*O()9X1r)@Yj2m8CPGF=z1r2 zA)7Yl^IUgCKR)yF^wb-{C+oJ#*rwPTZ-&-H1E#|^NxFxBN@#~AIH3Y%^CIR% zD7R#vl;}NyL3+Py<+q z=#fkjNuj;Y82;kvi0#I9BU;eyF?N}>@;p7N_V$hX0u(*Je@+~9G#)E=%?I<;L^DQ{ z5F*{$lr#}cl6bk`hF8T8e4F85LNDV@h!mr~UEZ(r$=1G+P{g!#*)?kwq0G*R$B(-P z7G!Zfv;1cWH(AMi>l&`~Z$wfoAeI;D?_G>EnzHyuNOfTKs-UmvV#OLydvx^C=`;UTcZjaM6f&N4XnoUREvxhfdw~Zay8HvjwK{|lqkClr{ zgLk=gR_dwAy`DsD2=R-QYYeXcOZmT*Qf{$q=p?%}5r6Lx3MBergMXwYKDqgv41B3A z9V-Wz?@4r~QwWu})8-}Shc0<}P?)=#%Ecq76?aT?B394EtIaB26Z5k9e_4|Wp zT_;ta4SUL!;%oHmMH{+Yt*l*iwnNUV8mf`}0Ge!J+RNTF2fm9H!;V^NF|o+U&J(&t8l+I8}cDE}!Hn4F^)8PCcXK10D={b^x3w`cI1N5B!T0&XWj87#Z zN_ZP-oZZB*ab~An(J@$Te73x8;*#4h_ANFA>~THYvqyENPrcbUKb1 zf6JP}yWPXy>Kp0bq=9*MIc086W2b?8H#2@qS)&z;HN)y*4wfBy8Q+@qBz&kHF?{@_ zOn9ZQ!sYM}Jj*kf<2F=Mz0X*)g`Q+DUG94pNBDLMuhF`A_7wiF+)-EDEp|Zai>k6w zkMtvVft?4b+57O@dYxZF_G`R|dd}DIpV{O$8RH`)FVgcrt_*Lwomwz*bV1MNQhYsa zVIp?7{q7zYIpH`)-aWBj@82lIxW84;WibxkMkz}qTm@fGv(icvw;;sr73$t^JI|HY#o+TrNpVE}jkJxcKjGYxyKx?cqPc*bN3yqxjiBEAq7F zw%D*0j-E|jQ=Mh%YU{d^dWt>9xukwEvOi{i>)}>1Ie*N!WPUYE*yQiHIXg>EEiUhN zwcA)prVDUQUiKb8-Y%!(R-cP2p>gZDhORgzxudQg(krRKLR?e78QFiuUTemDVzVom eUCNpwT(iHPIBTH=1UaFF**Ae70J7Ks0QeKQaMq6i diff --git a/datasets/common_voice/dummy/rm-sursilv/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/rm-sursilv/6.1.0/dummy_data.zip deleted file mode 100644 index eccefac1af29da768c557f35fb3d6af4f505ba82..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4041 zcmeHJc{r47AAT`|?6i?Ag-8v?BwLNGNE6vwMV66uFlIS&R8C0Y8`Fg2L?T;C7$b)G zQbh7uipFxv5x0W<-FIeilCig(pKxX19>*(r&~90KU8cpvOHBKFHPEUjvUL{!JVu73ngUi2RzkM!N@?8byiFE9z>B zm$dlTttX!h(StS2iPd+Ab*vwD_Es9pqV(ElhT0joAWspE-^3?lD=C`@0oj(+LA!#n$zD67_)%mgTql{(cG)9&PqP28$|xSkf9tM&3(x5ShE zdZgUP1abS#ySfh57Q{90h(pygFXU~r-;n$wlj5%%E86EUIB7Igm4y_B#d{6?A~e}E z1rKZ<%91H74$H14I19IjBxR%eUld1=b|sqo&(Eq&ZSEmj7agiqZdGmFKg)N^ zf+haxWB(D9Z#5{`PVNOmdk}vp*TFwA)mjqshjUtZMbW$a<;ugc<%TM}+Z(6@_yu+2 zm_>Y8w|E?2Nv5Cbo}*a4X&JvE`cSq=k2!eEP}gJzH-EOT_EoLyC_UO3eVflPV znQ76^%N{9L2+^?uE&l(OvcBXJ1gF@ls}l!VQ3+;Vt%7Ba-?^a^{d7MLjf8svO@psb7)^P%xsC_`N>jx%k5#6PfV^Ei-kr#05 zV(*^3E}fp{Cwn8OlolfB)Gj?}lhCD1`!UtNChfa&o3B57W}|yCSFx zd2oK^?67oMnKjqyu*jx=)!>k+s0Nj{Oqtm0RY%VmYUDWMF?U}FZppK2hYw3d>ib$d z);3=hPfF(Xi0>HRFw?^`KP)>z=&|(-4LC13^K8KLs*cWNZiy6i4RShxI(Bq%;3Mxc z^Q^JN&b%)7%;P<9_%EcdG?o5O`d6g$lyt|7gLYT?-_C*nyq8DN4_S}1Vp% zSkl}c`MAU(4pN-0} zAK}1Z!$P9Xsu=n62$j(R^DM10mkN}DsZcF{v;3?qB8iN!6I4gby2DeIV(gwmG9dX4 zMQ1A=FK1&;ch8r!e?ZC%R^Y6!B_^-+B$t)e$_>ce%6u}?^odWi+rUp?-&Id?$L|iZ zyLpr_}PJjqq)8gQ#|4Y&2O|@v5%(JoS zzkZz0K9ZQy$#Y)#M2&uhLnq0xam2FDY(^68ZaXwJ+a*4%K7PN%2vr_a-jJBJ)lFXa zc@YApeVj?CDfJ6g9t$iH)Ay2MyfLFt4r~#m>4=c346Zwwt{cOxf414%J7m9Qu^1VlRxBX#9bnCt30ug*9zNAD#JG8oLWCC&G81))|l*!{mCI&xsT7twBl;uq+c;@+2zxh_IH*prWY5D1h?(n;B;p_b#9&> zvhDIh)pn$7+iSwrJ87HZ21GAau)oDfc#^m zZ{zVY(wIrcPa<)IWe}7VmcP5a<>bW~m6t%l%?f1xCIbGj@wMXe*g)BWGl;&TTFdC1 zxsM&a5}X!XqJK$&KgRwx2eD&sg>a4i)rrMz?C5_8a*h7Y5%EtwtTdyRdH|gdTAu{|iA(c+J+W45 S0m{#U5CX4!&}C3z*4tlnySBXm diff --git a/datasets/common_voice/dummy/rm-vallader/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/rm-vallader/6.1.0/dummy_data.zip deleted file mode 100644 index 3d474f92bee3e3cc3495d138b69e7c03e5165215..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4384 zcmeH~dpy&98^?bPX_iBY%Q?g-Xall zoMCgTFGf#EwO^8=+Al^YyQz2)U0sQeB=Tn^fh6p5>ktN) z&kMAt*b^EaC#K1A0Du-J0PI+Wk|&wyauVfv+UwKSMJHm9&R)lUyLAuo`sNHZn9fD( z0Z4Xqkx7h`v`o&mQLP?h7ee|gS!SsFr0B^H!LfoO{1Q|E>*GqR(SSqJcXHQaZa5Wf z$E^*O?DDL;<~1~Rde(&~Q-mj5!KI7LL^KT2Y7Rf)nJ5!VD{%aX3ke?|>J)gcJ-E(U zzR)ge=BbnpH%=?4_~M6O~EpAcx%D$XSkBr3}>s^^0te= zqVG|nxBnTlN0|VdE*r}AhO}Bend4*dGGAn#*qM58=*~&qGu>|_)+hRyYrp856VJs< z=s^Uo=%vd|T$?b;G#Y%{~dkU!Y!#wTQ+?Qp(uHvQ53 zS+)O&G~VY3EI&pw;J`Z79=UN=dzOoJxE7)CulaMk5C}yDgtC1VP#j5KKSdISCNrdA zh9}5SRFq^j{Js$R++&2y9b0wyneYznn4omiV$m>6jO>wT+oCIxAq&CsmOZuc4CtKU zHW8~U42mR;l%dc{zNsg_7pagHis86KSE@gczht*hqagzjs=(Ymze(%;e0Da**i<~1 z@D`QQez1Jxj2;drfc8!IyTh19q%i4dDnI}IX6}xzkyPmKA-#LmpL)fJve~!By#E`vs7IB}hPb6$C&hx_}jhQYExmnJ~>G81#86nfk zk>*qxkz(c)J<;z{et_LPE_~}ATj&JmZhykwEyTV`%|{7XJW7)%9S->;$&K*Lpa8?Q z!73jY8Bg5zIP&y6tW5%+Pkx^2Y*weyzjm1_QTui1d54^k8j4dzijfeTpfIz*xl$|O zfJ3&&6?KySs7|hTX~ci?o1_xa`$l|K_ZSV|g3_js{Z&gE$P^mLsF_pZ8|cWfwUsGO z*o+-4nriXr*4|(2g$t;mSGK8o<2sCp!6Fs&voeU1k!+%=37!|@G?o4|&Y-={MT{p# zsdS((sYQNEQ5~EVM#}Is59*MIJsHF)9L7`99prp?8{Wd|6~83!|owE6m|);yu~VSLU&JR z68UG`{Rs_jl4zJt!+{%t@YdHaaO0W1fqJ@p&Ewdy;lbDHV`8la<|dnSkmSBy7YOAo}oNSN)GGtSPk+sB->@Mf#jkl)hm9#EnRM&@B{1x}(-mAxes z{dXk0_;UKkkLHb%5=3p@c1@d&iZ$^wnKK6(aweKFamgyyy2_NfnUBpdDP3Bn#Qsmsm@p%7saVLOAZa$2W+lrco(Cj|wX9_eDYwkW0SnzBS`3CU=yTMq7y z@2qL`(ONH9KN*o)ySBww+;u9`STjz3UCa7~QGKXP?F)q8_;4As?4e>su|GA>xrM?Q zZN>BuJ3LXfJUaOm5BqnD-JN$DE=%lcsWEJvs9Knqulm%?E#9{UYeI5$vsRI_;V(8~^tyr|7P{dEegrq@jvHv%IwC#Evs+s$msU zbDq7&H;FbBTx-Kl(zBg+4AF|6`eATgJFmFSV%W#saBZ~~;O%er6LNregkU<r3*7nPRI9-t~~_W0rY7EQNw^7yO@dJsqpTnB!!ZP>EwV^?LHLqPduc`eE>?XlVA zQ@|>)Qu(hP_s_+@Z!g)!kz6Ym|EBnWDg2<(#GA3s>*D&Cb~)@RNb{^bg~gud=NrKC z>C(%VU0sxSp@4a6#jSR|DTdru2y|c0zChq0lN8Cp2WA6Yq=b_ SLAn3{1>Z&BQL}=v{{0QNzdHl~ diff --git a/datasets/common_voice/dummy/ro/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ro/6.1.0/dummy_data.zip deleted file mode 100644 index 292c905b122bef413fe8303d22b5acab4bebc7e0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3976 zcmeH~eK?c*AIE2Ao;6Y=ra~F5P;(S*M$XUE4n`hQVzytDv5XOZE1{z>l6fj4=IJOz zQBKq)rz46`!tpFRV&tS%XPkcb(iJ;$tbcxg{dQlwci-2w>+`z5@6YG^`Mhs?8&QZ9 zi2oTaq2NDVd_GBoR6)dGGWp~YqF0dD&I3-8AW@cvgD3wR94ZYGfpA1YAjlMOvR^TX zreF|n8qzx;kP=Kq8t%|X8t575A@vQA`uajf&&?wUJmNMXqXz~o0M_K7S9*L!VAT?! zXSoD?cEa`=Nf8i85)1s)peGaad`R&& zsPV(`WcPA{dp^S$l5h3QaF=*`9)&sN!Hz{8r_t@W9X#Trb4~gMjh&P`m9^43mRGNh z+=Sn#bPX5{kx?Qj>0B^e594iZUT0oF4|c1GIN!0U&23|XcwV&DHY@^pXO;lBmn~w- zgYC83I~(1ufJA6uWL7HnH*I!c2a%nf<)Q;hj@fk|p$GLfbnr2UJ2Xfbx)yHU|71|y zAkyn_YFH=Ea>V&xlAX#WbIlbxT<)zm`J-j6RZkcu1@$)M2+rf`2k(My)Qj)vJv^;e z?O(t!=?DZ@T&wtKT{!gP6v}VUEGZM>yY(bRhyB4ByQP9iwnl~j3a@zMMr+BJq`_2yLuJ?iTX9x@RCuBjIMewa-XxP}&_|E{#~H zmS7OT+;T#*4s@5fcV8UDJ8N}d{yc;$+m9B7}r$dzN7o3JXz)6Y#O8kZ&V z>eO^uMzN6aiTl%KZy%ZFHy?SbzNokri-6ZCLPZ|}6qf)L-z~i2G%r72pc{~gUv%de z`Bw^^l!Rj_>H~$I;rLYYdxUXItHOz-RE`nOs*i~ehO)a`u&xwlMz}o8@18EEnI4w? zwyGPw!#!9$ap#nN4CmO?=&cCOeRwH1fR~+GHdb2YVysruf(w~w`tFj;b6$#FQiYF4 zo{nXkDuEHBX&9HW2jPWFawqbQXx6Qdpd1$8TwjXwEUEw@^`tGp*v9!t~_l~-r8~i}e;YA!gJUdU`^Y6|#q;6hS zmv>itTHalMbxc+F)I71$L5?s75Eu zA+@9iubscH7;AFO^-^FFz8tsBHU-h3yrC!``cQ`YXv{nlBhRAsCVP5sKe*50FLfC` zPw1R1*D66h>SMiA_&PYPr=l;BF4zBSv@k2!Ual(P~9^umo7*mBg(;kJFhWXlZe(J-Mk-M(nQtPPQjkegHIOjgx zL>N`QEv5M?i=Z35@m+E7^o&+ZR6!f>cd${ONuo;pb)DaHQX?Okb{gfMPVt(^@{mE; z4<0QKyP zgCp`H6jKD7Bw^x1%Mi`(eb@gec~N;iKEmQ;v%+-xMvqa(bNo&-Nf@UMTb!rVpS|B9 z?tL11jWthwz&)AT)aP7&N<-^p%GT#)t*Tlq@(`!}@TtZ+5y5?P%EJ$~7m(ZzklaxC zedFuDya%qZeI8%8aAYI9uZPG;i&pRP1n$(i6ZxQ$H zDBNTU#;6b2RYT5`q#6yim8z=GZ9(Crb9dE&?B z$QFBTDv#QhAEQS*Eyqq)*%ze75WUs57W-QfZ$pO7L?(k8m4_&_lI%;U=N1NcOrkmt ziO$T^M_{2=`DipaBuEUd!c|b|9eG0!ACsdy#TadEL{gE2eLYEBd>W6_0CBu}$ZW@z zla+9E`QEd&GpgFjxKwA>vpk~r8rAxWRSG63Uy<*zPl(Y6UXV$;Q)MfqopV%w) lm<2Xa$v`PHfj@m)g1>Z}mC#~PerPG+@P`0z(9-<7{{m$TbeRAE diff --git a/datasets/common_voice/dummy/ru/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ru/6.1.0/dummy_data.zip deleted file mode 100644 index 032973e097f490a7ab00002bcc0451ff0afebf56..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4811 zcmeH~c{tQ-8^=dOg)mvh?#Q0ZFq5e4Wh@P{hDx@v%&|7cphA{H_MHg{MM!v!^++gd zDoY5Hy^)xbEQP$I&UH}7Y0ls8J9Eu%esj%r-Jjq6+~4QEpRYa=NXr1&cuf|#8-Ba_ z^TG(=1E4%zTzo81)>vz46JvS+FhkhDYUAVS%?O~Oodp5_v}DRff5#vq+k@ERAhvE8 zcTW$9925qD%fMwIFgOGTqcVC{w#~Lj6sIye#`8OXB~y8)$7iQxy%+E@%f?XD->V{z z1^^JGBql>W0E@9kyF#%ZxNi#;m!WL+B}w_mh3YUk=i05xz9w*Hl+O62;_7)_Spmxs zt&4nj=+{Jp%e4|h%m7X@2cXw`?HLa%j76k>Y*s0qs9H)}QMRwje2FWU=s)XbS$gPb z8T>Ks-K&C2NYlc+2c`~2&%-*?t`fT5IZxCoNIxJliHzU$L=Zi zRmBX2O6W9XY0#DoR4WYbTDA*$^i`lU-etDCMTk{PK1;LbN*RBbH8Xjfomh{a zQx?o!;%`n{@i^$kh>3ph9zYIo`&hALG{QVZj2P!5)}MymIgGTipha#gF30H!w6P%6bZtYnCZU}l)5e9_LA!mvr zQg;CWQ0f(dvcvsS4>7uWjWVEXeQ(&00rFYbMUUxDEayh{$eP@2NGiV?Y*uU?a1?P* zLj8B%BAJK#$H8G7mr;le?mGFll~0wW@#T5@QuA@@Z~FVg%26;#5x)0X7`jMYPpvK8z17OQ;m%WRC3O(Tg|4AiK>R&QEGLm=)P{9As0M z_ap`n-!FsMhX*#H<8n_hPahQiu;Pem$q48cn=uX&aIvL|fB5Ql}%d+<+)x z>lb7cn)q#?(P#Kp#6z0Rnrdg*7Bt+^a-V=veefUXYGupv_m`8UzOe1<$tr& z?P8IxgEFA&eNPW!uL^QKZ0+W{(t2cslTYE*z%wC>94*GO!Hjxc;bsCWiD81{Qe4yw zEy4hjmpZDl?x=iiR4|fMt0rP3zpe^}X^+RW*bLO*pLUCe#lM?7H}xg#Q}@xk+!qgI z#_bcYXJW%GXr^QY^aS?F!i_@mB<7QXJi|1@V*3ci`_fRxyNe17;vJekqJDRg!akVd zCn=r)u!+*v%X%cxFb*9@IORq#o_Cy1DAh?(t#z<6o&cqP%G6GY(1<5{>^ZPtA1*K+ zII8FY!(-y=NXhw9^J42QA*Ii5mO{NqCa;aFT6GgGKS;=bhRI6|WE9m*L*JJ;Sgu|0 zyI)LbRe`y*RkxkCc&}VBiToV8K$^}Q%6dYB*khb+d5-)MSPdDz58W%Q`XLq4!p+r2 z*CXUj`Xku!50gw#oQ#-{zHts0LGJ?Z8+F}U{iajz*`@>|BjI?w!Cl!3ypKS-Y2Za* zZ%uVCUtIF8ytoVVdF)Sy_wxhBL3uEp_VpNpaJGn2anx{|WQC7cpvQUAT-g_3(K0r; z>PbgUhhMtsZR4nyPrQu^PLlKYvs@;XXuQu}uY z5I3x&9p+bB>M%SPbyJ3Y4QD#aq0S}KR1g*c@!a> zag|d?Gl`h@o`o=FfjOR<-V617#s*D&j875NMWp1OZ$2Rr%bR#N7Np35;G=_gBE*&j>+>x8_zzM*?#ab=qgs9N9(W04FatV2c@gYCq^rBTXd4SljIXECp7#<}y4mYma zd6D@S!F`MMzO;+<_R+N!wJQ^^4Od(q^WCe_jADLQPZS(iN*}TP0&97w&p!2{1=xlq z&ts1X5ff_=ZmPo56a3LVzTD3opKE$byfrF6+@Ph3565&T#N>W7jOj>PeQ-Rd-aO<1 z*LX!=gN2dTyVs}Zy`T9p$NPV9UT&>dzpQb|9@MwX`$L&-p`o+F$_(&r3`Zf$L!#v| z2{Q#IH7^IA!2n{AX7-+zp3ehHJXP~7W2?~Pt1EBmpFWaZ;q3GOq338ns@z>o5sC;! zD00;I9JK4tQ*4LU(_x6j45)%EPV2$MGevH5N9vsn`wYqlWyw4~7m=PLS=k~~V;qOj zQ!SpfvTZe;sxo-Jx9!ygAQn9BH~)q3WnrqdUyn1W#QHxrb4eD0B}Oi}gMkEPqh{LH9V^8oXkcC|udd{EyZ2Vafh)$85imYo59D@?O!%_B-411pE}*DjM) zy7p_Ggv=gW1I#feh`m6QR7NfXbm~^ci^;Ka{55mD?A=R;N&A&D-J>h6KL(Bv>L&e| z?u+}+R`5A8wI)UR@t#^OZ?;J)ej=*F*Ej`FX?ufQymkCWc3o7)?#8quOQwpIyF_#@ zH1^6Z5AO2TsBRkFFM#uA{LM7)%9Ntw^7FKZSZiXvCS@m%MGZW9U`@=A@BeHv;`hXS z+)XbMlwL8s(6 zSIVC{LAz}?HXCC3$pO;r5&-UGB)R_Ng8;K?8@JnRCEa&h`7Z*B05$`zsUfxp{0EEq zYtqjKbCWcymQ5w;o86?qHtgo#rnPx&*lsqz!zeo%kP|>l`P#BhJElc}ZLyvd>Wd|E0`1N0%c1P9# diff --git a/datasets/common_voice/dummy/rw/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/rw/6.1.0/dummy_data.zip deleted file mode 100644 index 4c837eecc194e53ea7ed4f7d0175e297dfbdf83a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4150 zcmeH}c|2768^;gEGRB}r)(J6L!q{3yi>53ovTxZMW0ZA$CE3fq zbR~r*m8F_O3RyxfWvSb>T)(4wh2%E(uirnvGp}>z{Pn!g^L(D?`F>0dIUu~i;x(A> zWB&Q&>jMTL0m2Dy@6$&K&J<@=D@!QAaYfb)w|Jj81q0Y1V;leg83!MxYX(8B4C3Oi z;zA<(obXfGj#gLEP}5LTQP)sWS7$MLMkap6C(5%JoqX~OKp;(ftNDKU4L%nfaC+naYYY&CSGKZW{G}w;>L_AQEFlX+eEtMu&bzL zPeO(ITO!u;wAxlTkEaqIdQ!K_LNhf|p9(`=TT`V_hPfSG`uwRjSkIQxAfk+9r62Q! zduC#At11VkeZ84waq21Vn?>*6rR4L7kakZs%8Zr!p21_=(t0tM9Xtf?S1NnS&)Gj6 zXC$KIhV#T4Um7LYn%Bka&fFc*oe*@-o7D_TQI5>1z6kTHgJ;{@{GBxmUu>JDy5BZ% z>@fiAjsxqCX1(qNSO1?^{V;JrT2Q}zKqw_`P)h9$q*SW8pfDq+LQKNVS=0JB&GXTR zHz>seorCP_%*{66+8-!Ti5*FziU?qYi>I*AgG|Qs4yiKOZ>`gvca8AZ26B0@1amJj zXW4GLY>X;h-67# zpj!(ZQq*ai;aBm{-xZvj+3l8wR_O z0o7mzJ(~VT*(81hIC%`pk9wpvS^m6KUm-mt%BtnWQ5@WX5N^T`pG+P+!=A}W%s>=k z!k?h~T(OB3f$g94gBljZd_wyMV;lmFCv8-ko_QxI*<%#UL#H9hf8U;WQJjr;JcjUd z3@v56=e?9aaE|gC^ID+ECn9#R30BU`ax$i?8h5iT>2>f5-n?B~P0R=L93Mhlg3b z+zBcxP=eX%h06*hQBBYi(o)v3^c2$`s$= zA1T6-L)Bx{p>?)}J?s6*Pi~k;BsYESyz_g>PPt|>KY61|Dyh{L?sJj#>n8f?*4bj z8j0fOO8%LuCt?kwk(>Ci?T-{O*sbhF6I%PMJCAi~;M&v1GwePjTeprdHo7MB2v!uS zbmvzZ_g76{8$T78pJX2`pKxrc(WRIPP%55w;{3=>#&s zn@*#~C6M?n-iS-b*6iAq^;UXlf8O53x*MK1bTsh-N6T=Nt1JpvMZMbFfRB`{J{X`FYbfLxR>)}cB5z;MdHIjM#RUJE zb06|!YNaGbemlj(uxU3*nN8SQJ4}%({lg0Pp%gYCu;Y?+=~Zmub$HF}kzpMRj`l zb2l#?>5Utp%2N(kd=chH|8zQ8FIBHcr5Bb;2QTAE8_Xl6)UGH&I zI5`?67H39tgV?-kyv?bpS_IwFduJkBX4Boci-r;0_vhyS`Y?%b*i?V8HpX2h;y^W` z{}Ny8M9t*JPvx{(wzIism$~1?<+ms115VP%o}&Ex)@x!Rd6XDUNjW`4IC zK=KbfnyKMpFD_-f(m5~(?>Ej`XiiW$0N@3GM6i2GKmY*z7ec?#umAu6 diff --git a/datasets/common_voice/dummy/sah/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/sah/6.1.0/dummy_data.zip deleted file mode 100644 index 24059a67ada958d92e28ad720d825f8a6880064e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4569 zcmeH~dpwi<8^>?UA|-R2EJSEz4kPANi4Ha-v{2654>`n~Dj{=Dk26upVM9(G7$v1P z5yhjF$6-7~p+afq5b>Mp_2B98#J|7SZ+q>&@BMSVzu)_FeXr~Km|)hia{&vVw?&?& z-yVKkHUe7#oUgn48C#sakGZv@!bN7exV`zY&TvSJXaFGfO;cNgv1p$Ea8VEiFdlwG{A8)^JuPX7{Zo4n3yZ`>Gni8};`jsFNt^)#^dv9eK zhR3}W9*-{tTcq;t>`<#nIGIfdDXE}K9>r}7=R3YRo`2KZjrb^30pUEDjbO7UlEFI= zZu_dAc00Gk$!&kE?Rh8lXJhV1X9T@ln`=8LX3cp)uV;6a0p<+@g6)+%S}9VKG;fY{ zkYaLbXYBp51DqT(<%v$9uzRMUN@hWVWtN01esqYggr zm7t3H-F*`19MOYgW*g8%=$_C&UPC7~h~28ahJ|71X%gGr#tw;eCOr=`a#k=9?Dh=a zB5iFxdY$T&R8~1tfB&l3CnarD5|nGA{e|@8*yYoGuJUK;WxNA!p0U=m1!fsNdsQoRgO}-2dXd*H+14 zr*mBN*Ww|sdx&Q^7V>pb?;&>Kaz~cl;Vius*Pu5J@Apf!O)wrbC5UxZem$EB{>d~d zUw6(Bh?ctk-0#`JVtADeQB+WVA1QU`>E3;!dg>2!bxO4v15NcGJH|NQTL!;r*Kdw+ zLxW!lwKK@`Esb_hPcW?J^UeZpKzZnnyb@^{m^wBtByYW73QszrLUsLJwp^}^L&x`e zgd@4G9&O=NAje;RBB^4TcTw5ERD{E-Q#A$6XnAb;&QPvAX`8!EUW>Fe6b(GyeDU^v zEMe3kcmEa)?@kGZIo|HHeWhJRL7~h!e_ZyS)|zS)IjE(icu_QNZs>T|mzy?k=$M1k zi4HN0D3JirBx>?t*OXp;RR11Xs}o%os?Z}J^(Tj$?@D;;#m{H`9{uHU<%mR>Ocl(S z5!ToFQTE=gl-i3bn&avd!6WEVzX$y=j{x01Ia;mn!O}}@-M*0@;G84^Z&*^7T8>kk z>k0M()DxpOJXG-uy~B4tMuik%B8#dX1YewDTh!9vnuDpv6gGy&(lV8$<*qeo>1Xfe z!YXrk++T_=oZWvozwYr`9(3hRbxv;ucI#$yFg1UZI5MlXzK`SN9ZN^3E=SuCx2DfC zb6Ard*&qO1P-!BoaCOZ)NoG>#A?vI}$juFojp61S8jc@qu}pjsk+_$~j=~%w zlBt@fkb?47XmC`WV{~n$VTd*Ww?%J;AS_%;(Fcf=omy|T!`>uLbVLr_e^l#MF*InL zP*>!mgQaS4N>pD#{B}K#`D%iRH;-dJAAO%4*Aw7sdysiMaC)YnnLXz=Q&Ey%T{V>X z-sP2=b)9(Rs9V)Ux~qEO;hQ`ou}1Pi4(}4UuT;1PWQX&8I39t-dzNiB_@vV(=WI=E zL$RkD;kJf~l#+#|_yl59ajvAJ7LdS(2K|iUW7((@fNzT>9JM7WlC6w_h8bv(*tV3h ziON2T=Pkz;W>uJ6WrpU0jIBsd=ON+S9*jZtdxY!qzf^U%>BUj{JIBZKzchAK=+LEy zZ#beHwLjn232wD*9Go_M|CUzZ|3~K7S9?`0o3gBKmA)E$tM#y>gK3mHdj4CpT0Hzq z6Gmwb3FCwJ_W3JUR;cy=&z1kJT;azg6rW=im(n#32B&t@C+sJT{~YdM_D2CA!)Rt z&nKH1;lS6!&+9DGKA2A1-G7yR(JItTPw4VJ4?@+^^VF_5jusIovV2ghO6@1T4(oE*?L3krgUKQ#{5Lqzs8@8xzm~@Ob3SMwjc_ zXV1&o!F9Vo!;cAUgnD#U38A5!#AAn03Bh}ze?Ce_ANKz|nfb1HB0%*jB_Q^IbO)f8ukaI%1>r19I zJtZTJ%}lMfzA7xvpqw$zgI`V1Y$~F_DGeEjjO$2=2Cm!`zH~lgX}NGSODx({h97wk zyYF}*xcaVK3Ohd%`<|vm-?ZLnw)|D}}k;N3L=ZUb` z>@kb5Gco-`I>of)8EcEa46k5q9cv+;vX`!D5rP*+-hrjr8Mnq=V`bWvs8ThdJ3Hoc z<4?e~9U3juW=eWy5TDaTQ8Xcy5{G+ZAC#D`!Opd_Q4QObhCw3jo|Iw|JNs_QP*KQ% zrnWNyLK|o^^1Tnqti%b_<52b7qxg8wc=fm0;Jdk5&~>S~QCowqKU8%&wS2f=koG4Z z#0C;ux00O_{;>cEFrvHgxXfIEzniNy1e5|61Af)rzdYc-7_DEEezsYQq!C@xDoNi> z7mK!Fy8dxri(PoR<@!!zoj`yb0%qNoyr>oTwLn{9z*yK7^;yI&pZu3%nXI$FTI^pw z;MdTfN7SWID*Ni8e=9XC^gj%%CYXhMT-@*SVP`4&^ZM1J7e}98Z}?k1UDh_2QpGl` zp1P!ot`fVfj4#E4H?AK0Q&Imn_)4|rJDioztc32~w0ij8E6+-7FiSlE;9`CKSSe)0 Hz3}h9dt|%< diff --git a/datasets/common_voice/dummy/sl/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/sl/6.1.0/dummy_data.zip deleted file mode 100644 index f1a939b8c4185f5250411a96f422b77d014f1c33..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3947 zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3Oq=Rhpr zUtovJ`657Hb1+<8loYgtV-?d^An!dR1A_ta7L*hvX6EUY6qkVmYSKKrgkRI_&;#o1 zd0)RH20W|}wpzOKH3ZbDd2V*{xU;w@C;h|)v(Vc!CtBD_9+_rqZGUsxJ@??LPbIVu zcO7ScV|&7h>3Zp&tAdQiuAJtK_Q*wIXAX+^ zZIf7Ds@1o`MdbLo$a*8i)<-P83m36Z-|-~q#=-u>$~%Y0g6_i<6* zvTN67Uk~4~;NgyPm$w&wHa68Z{ns-3<^H9w`^_t*SKByGFZ$8;s>pWRuTRsTOM_Ae zB6uCk#HhyQfl(Y?z%(NaX1h#tx0*88;Uk^iZyFExrBf^&OW@^D0g6>33iI zwe4+KZOkPVe*Yb>3`HzjE*FHYnfv{VhQH?5Wo}DDeKszAC!Jb-bEWa>`qgQ5SF--! z{4nKVPQjJ$3q#xDzWr&xXpRV4&v{8f0ulb@M!;Y_2E-=B2W?qmP9`vmr>4LI7?g6c zq~6s2v$@iaJgw&!YsEAg{+T^LnCpY8EQhg7P-MnZ(Jfsut-Fg7<=8D*PFPPoqWse~ zR(!Wz-)WBR-+qXGyLls2_s>4r`L_>$GuZb1_5DqAb>_UWdLuXMRY_avymgro5E)a4=? zy%SkH|H$iZ&N-f$R($aB)i;YL_8zH;@tgm5!rg-Zx9YrYx-Cy1D>C&kF*|#SQ(Q9q zdFdAZ6OLPcamlQV67he*b#=;`T@p5eKd+b+IDT8%am+!{;%m>w1*h%YG*(4*thpND zq_cLKn-j}YgNezCTMex5{4Y?Kjy$vk*<9GzURkZ*3L7FQ9qV=$Np(V)$9FE6LZbW@7(>$ zkD9GEi?_N41LKW@iGjh4_-vJ#hgtC9$X%rq&i39h02X|=v|=1KD11F%&E>()+_fmp zcH`57H$H|kAAaMUA#Q3vZ+o$IiFxVoH}&58 zR`1MS|I0|nNbT$%$~0ZhZoAeH zcr~-+Tyl=%B-JefnvB;3j8ena#jWaT&e`nyNO8x62+h`>hRQWLdae$9IVsWC-*#Dd z-~0Bo?WEvOmiMnO&&;nnvv=N~8((L9smXnA^8KX0cxTGMfhVOqv;$$s7IgNuyJmqpRN1Co@899kp&*{19Q+Gm%eul7ERwqmk|P zru-QzO>z$(S}5ij;gV{@%@yMM!7Mye=&tF~5Bz5Rk2yc`-wyw~@5Pr3EBQNGm)F*P zSp53S#t-Y~|INDpQ|8$JM}OHsQ3z>wf=WhclM_st5+8-o2+Yh!ZFqrV5u=65$Rx)M zu@JMND*?4%fZ?qph>6ndWrZ|*Nia}_0b$@@M^{t>@ic)6+l1KwM%eV$aWRoLft$lX z%RtRxtlmPj)=(S65X*pBjbTY63y?wzC!F2_T87j#2HJ`b04sWL!_N(B?{1HEX}9@Z%|r9_!hNUfNXOdE73M18VQ8G3(9e*6&|vs zKiP=36sh_n&Q{c-8rjyZ97NlSQE=n48&??vu^X6@fl0XnsED)(hL7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7LleGm+1z( zLhRsQV28^GB0!IGFkD@f6!goku=693_neV|!I&gFN{SLQ^YluJ%fP`kX`WreuW5GZ z!F9IZ+wZ6WkL&R?(VGnZnRUKyJ_)QIKHA3%e{0=ZmUVfPV%ec%9HPo$4eoZ~*C+A& zD(f$m7G4waZbSHkR*Al?*2NEHgZ@R-h092=#R=s%xNPMNR-3aUj<@(I`;;eZ`FpvJ z_<5$^Ua*uiXyPl*srfpA`9eh*OFi{Gi`p(~q%R5e6;>9sbSi6HIZ1VXm2klJgTLam zxqMlB<6cVyda1oldT}yBa;@w7hEF!n=a;)b%Cnb`kGE2-{je`H|MRMcypw!)=`5l+@m*4hZGY6#)MCk5) zeRd@eFl2?97#Ivm3f+{{GJ;_{VQ=AK1D@9G6fV9AKRiDA%N+2%#j}9fGvL}uq0&Wf zbe^nvz`7+OL%sA>NyYmH_8N}ojnCFfE9vZ*yCB4|SYe0GUDNy!g=gn~Za*?_!E(z> zToO)iTZ;KAmRVi(58bywblq2`rJp^E{Vr-rCtZzkS-I4G-UUnP7C(`S$I-1prh7R1 zx^E@;^Yhj>=h|9^X+6 z2w#^ke|+7;TN1iY5~EaP(x-j<9i?fw>2%b$+dKB!>5640tNOp$a$id^`kTI4SpAve zg)V+;(t!O z>bXT}nL6UFE{~T*l^+T{wz^XK?d0_isk0`rCvePEaewmW&J=H5yQKnZlP4F?lQa}z zKgfBE!OYX}kwPjDW7G7khz$phL>eqV!MewAW^hNC$E2e#CcIVlFzSxbVx1J?`g_~k z{!;Ntmka-X{q+1vO;Pfa%6sSH{!Uw`|NVCL;qJP_k?+6M-TD72`TiD@y@}D&BhP=o zy;L{1F3eBzs(pKo+{b-NGnf5N>aGGJVCps$d2l#nk8;T~l^P?#?U&X%4~ z*Yy1CUUmL-p+bML!1kptxBq^->rukDKK?nOu}f;F|7Au*DX5%6j#49%qO>HnxC9=X zIEty!*rac4uAOG<2PzjifEkpuS|`6GBejT7be`=$oeL}(+s?ldky7B*IXP)5Bh!~_ z&g=#vlQ#90u4 z-Y2+0@AR+F?^v!(|NHgvmF&CQPDPZ)taIDr8Yx#EX@2W@<_h+m{+j|OPM>kQ^~sSE z_lp}g+g?9fpLWTAS-Hea#p0Zz&Pb(DOS^sBnwOl?I`{kWzLT2i?|NmXmrmUNOZ>Am zYBnl$`fybV81M$bfVU(m8)fEUmW?=a)7go;K^0S5b{e>1@~R2VoWxl)J&b+g1fS?d zH6m+$o``O05a(XwJ=aRKLB9Vmdy8h`y4bEaOkYogaT&0uT=(9tD;h2~ef^);hdOeO z$ZRy=Xw%7^^i8dOyFIU~tWY*^l+^ZMhbM zixuyq?<#dMDo5P@zBhM^a8Ptk$ri!p?O75RJY9uFJDHd)euY^iXQ>F5h|W`Q1-dyZX47 z>%?a+tx1@q8P>n@ic9$n`7=54=l^MM4)J<5_owiwb0uwO{0%<2$UjroJn;LsE>rx& z9##7{KFd9lYyUETX9qRqG)gk_Q5#~Qc)@75GBU|A<7)m&K%F4K z@YWH;L}?7OLK?#)7^uR4Fz~OVE2@Eb8pece!fX~JYqEmKGGz4A}+~%933vAtIqH z8Cy{(j*R^zvb@h!Z%9sa{(L{5cjhzCJb&Ha`@VkH^}DajP>+h59w2?*=eQevefj=i z0N4RL9~T$@({|S0)`yS(Ne57+@E)@w{e1iw019d%6#!6AfDgkh7vbBuh^;T&*3Hx1 z#|thkA_f;16&HnziNnRj$Xq==!)C)41<73fzuVt|Q{HO(tl+6>fzEUrBaskoj$IT0 zu#XY|M9Bwuds-u1MZCRyzXs|?in7%rM)~JJwX`Z2)rvxaR+Jn>&}yz&IJv$UV{+w; zjY6!fVTUS&LrUW;E?NhnmtUlX2$pPi2P&3djQ16cosGXG@?>8Yzq6j2I(K(o;zy5k zc-^#v?4|qMX01>p6`NW@XeXPyhP_i8c1}&!^kaPJ^*qnQr}7$6I{Sj#%EX!JZVGX+ z3t&S_^JfTmN7LQ%a?kgeWVbz(Hyg5u5#mjjG$;$z+4j!TuphOz7pFfL9_h<>FiqY5 zQN6qx+m}Gaq}-%`;M=9R&_hTjlo*YI2*gD@c zL{q0s?aXeAlTa$?O*=wxf;B!td8;J91Y@Y zmC~09ET<6;Fc!BgNYV{_)eNCq5&z7?C{v@7xd;uPeVZC`VaE!;tH578S0~zMrTw_V zbM9Aazd`XWY2A(S>t*RHL2*n*w)zp{-PNvqG>Le8tx(GL2wrA*-~(^qY!CmQ`9|Ut zOOt?H|9OSi4O5tUv#!|M><9YQ0*qZ3dRD2|bka$&F;aDB@ks)8!hkwSlCP7mwKEd* z6MMTKYLUw17o}n!qwmTp1`X?a0h2`x3t?9cBs{T19lef@3_MR?0Yxaaf^_bg7`t~g zlqWoQaY3>YDRxTVM&+Ag2F~>VjZ|@2Jh-4hop(Hqj(Hf%+uu|1T> zVhDD_dKYESWwx4FvFsa=J@Te`T4?X84Yc$IBB~l`c zi84)M`^!rq%%$v33+7EHN4iSmYyQmJnB z&cTScQ%BwhHhPo?$%^=|esKHCHHBy4zKtv~c?R3yQ#RHYHdkAPADPeUTUB*h&aw!U zM2#?|vf5~TVpz)rl3}jTgu~=#g15bw_m3pl!Y}`y1pg~Z;O|s7a}Bfu%YS!kx_O_q z_xxQJxI|Zxtf1a$1ZL3ZHPcT=TC2o=w+ekf zgf-1dFT-uAhp?n9&xlSsr_P)udiv@|!c>geBV}U$QHj$Ll9fs;G1f0_UN+R39)Ol6 z4vHR=bq^@zIuWq57=cv_y*z4<*$veZGzzXHj6dU_L?xK@q{wN!=+zyPDrcf$+@E^m z&$pI@Q|X>|$Dia1k2XK&(_^jCa;gq#6br+%->`Sl(aL*-L$YVtzYg}3xy0djuizX? zA*Cey{o@ojNB``Rhoyn9Ul}03q$6@B+T+RHf_mnkmG;i{YG>VbHV|6vC#@{C6a$GE8kuxYR%0MR) z%3XxWv(Usu!xu+645ttc-QG5d$#hJ&LOy7(j!5k@?a#;j!SLqdCl`$`ZF9aIJ`CLp z>hM(4d!6U1_%&?gw0Qg?YkzQv^6qB7-faxy%ix=@GPLrq()nkF2# z?PApVo67V`Ax#k01bkLj`90x1-djfm>kXvbZQ#%$>FkC?in+7BSbA$c8-`5+cEm5IP+J_Qf1!h<)nT| z2h!@$13MWxI_@9X>BVX~9xyj9F%mfrZTk$3`OKFZW2|NyB{*s-fzXN6R9NJ?j%$0chjV#;K5wO) z^6w0_N4zFPemvexL6jDP*KAVt3R z-<3zYZETlG@^6|eC@49pwi-{^zVo2~5kY$0Fswkn4J-d*K|Ww@!DWQT#s&XjaQQuH z^VqURA|f)#Bz+xXKp1I=`Q@?JigM!!^9=^pD1Z;92LIN@Y0G1Quyq3sNZnGMHR{IB zeLXbzRkB$!7j!MIiYX{ivZHDHqpHyRqw8PnMz~o4nQo{rgDPUszVC57_ZzGFKdkw zc^qJugdvD?R4Z(KDOjd#+8E%j%Z}fDKvp0{8c#3@6DAN}_qVp}Z=UQb_M|cxNNY0& zBYynd(LXx8sZqnR1%vG^XPc{R+jJq`>Z?sA-rSh1@l0U0wmd)fq$Q(Bq(o)S@RPGI zZLBO$t>dSaer!_hzZDQ)?RI)54{QMJ+T>a}}_CQNs@SYvf;}2U6=3{a})GMyV)`vk+sVe{g!MdpS zSmJLbCBFbA6%1+~wvoJdCB}=k^~y71l9F*h?w&b!<0`{7+SkPW_MwLww9Ips9wsuoAL4W1R5QR0(ksxswk&W@=v#cJeFszq!g146^PE^6eW6jz4B$Y{ zgECZGAODTdyOitn_~@q@hHtwqp zC6b|hWm9a10Ix=7lI3d)$(+l|s>0}u=Wata0vXH=WMON>!0tF2kzc19h@GkUSn0^F>cYFDbz_Vgd11QSd3H; zdNC6^?u>?b{+BGKEJ%L0dUa_ZD)c0UEPz-4?%8Vcwlu z=rcKTM$&WxQBx$pHaY06zn*_j`>w1@{=w;YSKD%0>Mpa~?IPBHLjRE1LyTb%u2?2#ri`X{$ubh25#y%Lj$pKI)FGWvaSp@PxW3 zj-J3jWG6;zV+x91$xDm1A~q>@ioz3+yH}ssxL%{suIc_;`_ZNruLGv_uyWPN%nFQ* zfZugbbQJ9L@yNpp1hkB4TalAXfmxvm*5k0FeEzLN5jQX>*$^CkU9J&DH#v)nGBu~& z{>X4cmr_b`rH}ZUMDVG#h2aT260cu!i2mvBT%YIF9>a5G&{{lAF4Abq`Oa0|&k>wv zoDnZcngyDld{0K4Oz@u->=r8CTx`It!YL%Vr(U07F)Px|Uo2G&FAit0dx|Kw+$k?_ z@1wk3|9Qub7Oa8FkPc@e&d)BuiDs>(*daYvMdh9q&$>NFTTv2Y-GMGv*S&#LBu1^D zFpqgJ-TNUPcY>?oOxLK|v}+&$={nI>xaxD53T1>}!bYch@)++qvw+k5r{*g4vnzEi zYtwxzk5=?!h82rQ3#-F?A6UG9>|kB_Z@S}Mmk!|b;{UX%u&oefUvA+M{>Fy_1Df-< zOV%Hbuj|i$FenKu7@X1Ex76UjtU|vh{k#Y*kOnkUStNZ~i$K`?TJ%e|7Ut)rOVL*t zn4v(LF9iHrbWY2<1;Q4sNFa4tbrz^gr}o9v7O?zV1vdwE76l!KHL-s9{rN|aV$QCi;SVERT zvP6~;vPZ|RkdU02>JyUFod4cG-e*3~GtXbw_qng%eO>qOf@@GxfdHgy?2#Sf>&y2C z1Aq@;>4ZkRnp&FU%%K-_=>e4Yg|y8`x04G4fP!j<5&)o@B|hN02HC$oh=nu60_$Ms zQ49HvH`Rb{GK)CH*g`9>mpDkz+n z?(-4!9~{E)v%ob;3J*$|%=U&7eQP1~Do!dSj&L%OJ;q6$;$j)UnGH?IX_tiS#Wv5V zs~1hY()xv_^Qf}T%CTqaBa8ooynIS}O(71houvPoR>-nd?KGF$GuS?186}wsm2j;? zqXky5e6jhYOck*!iv-XFUv#gEfR*lR8{ue2<2jotWLBRsy@2bGW|1L2J6}+8zTw~U& z&98D%zE)UWIVO1(pD{D#|CUW&Ea>>>FW~1i9QZqDnGNOw{gLUGM<&kK3y^lsX6lc& zs#^;YQ_n+8y%_nax3qHped6VbiDy*1w8Q~ZVEwDR2-Aw>=1>(5oO%5?nxnxGgnKca zrXy3foYjk)U}~K3q@wp=DGSuf!~l5(_w~e6CWOCs&4wyJhzl4VTMV%SJ16NR1yP_X zEc<<0wZFg?WlyS_6HYH4uub-7>7s=cNcsC6NfHaqS8I59#aEDfBJac7RLMFXfmG4M za538pdb*4VL9{7m$91EM1y6ITsxFYnz5CmM@r&%^FIaJbI53S9eU+F??Xqh9w)O%8FN^#~s z3ZWi;JI9MCg#=NGqvT8BY<|Ut*e`|1z9bxO*>WhnB3@G@_3Sy z0L{Wz-q|ockq`TTaMe-Yf7Z!}{`?W+=Z5VE(LgCgV>sbf9gRLDJv1d?GNik!ga@Fp z9}RXrRT{ph6JC|y0#6xhbjvKIigGea_8`&PlnMo1y65v8aMBW& zT2gWc477Dwfmto|(Z1a0ME%=rXt^U=BxvxQIH&NoK(?m_KKy9OC{)pn5sWip0aYl& zJi@|dS@CC&N7;O}J6fM$>&O|H$s{$HUR0b6IyM{NjvYLa;9Bx<#eTT3vawSDxAC=M zZhn1oS= z)lE9f%42CwInmHj)1XUwm-uVTs=F>`7!?IFyv$sHjZ*GSij5pbF?3baF+WIE>s*a{ zs(~-z>Crgag~atZ*tSvc<${YNFcM|K1L{jxzs7WynNid>Kgh}k#<9}T_-UXX_xpcKH)L&>1?B5^*xOqYOmOlN{E z@+)|56_rMTxdf|T6po2RoE3)$a}Cm|Xeu|bf@&M2V#P}~hT@^Z1c5i~>@4L>o*OPm zy6Y9?1A%zY?(XG5xhXeZDQr`4cUZA+Y30Y{MAtcnB`iBSYPIPz)n=O}nU&Eu*f%6i zS&5oTlV7?vn4OJ!mkM^e{Z-k(%PSl(-(=dQSjmHiaE@bpo%tDDr_DU$qA=y2FzhU6 zuR5>v2#W{DC*K0xz%8BJNx$cjZZTE7d2S`~Cd=0nFZ6q3HjLs8o1b3b2L<`-HfcS1 ztl9ji_B#E|^N?{=UsOqSY$&u&8~xXx*g!;P6alEh06b04msAw)N9((Xi=8z=gHBdk znNjg^BcLkXCnwVdc>ig$x_F+ZT5#Jz5VLw)1%9o&G~p92H1ZsOw#K-9xy#ZhY*t1| z-usK!sA}$$^5Lp6uZ}bpV>{Woz7YIFQ~MK#9>qt%0^L~I6WLKq{?BAEVs2H9u^EV9 z7B+&Tx|eOffIYLHYBpZbFcQw6T`!2_MT)aIG-xLCglOV*!7EYeu{X~}YE--$dPwod zJ85mqu(wf1ao4)V+5*G-*7b#QZ0}?tRG+VOmVr4>Zt5`q^1UayZsUY^Uj5Hr-WYuU z>UrB({!}+-rSS%E(?ye%-D<^;k+1A`U2U-2y(mfC9j2WE$WOVO_vil3X9dhCl3uq> z0@U9ofqxNjKVUQ9x}xg#fd4QR{GPOPCfFp+D5j7}`Z^&HVWbJ+m&@8bEw|4H-(bWF z1f=>>5&yR2Y1d^DVOypJB6U}FHmTdK@m8ugvGmEL{xHhFXYTZXTbUp#vY9_M8Y1}@ zZV;|P+KZd3-R?-Xl0Qo4&K~oPP29=EtsKHYHv8v}vm2V4h7=k^{9%Z@XO##50R9H=IscRZ diff --git a/datasets/common_voice/dummy/tt/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/tt/6.1.0/dummy_data.zip deleted file mode 100644 index 12fd52efb3e419825ad08016f7fc83c624754a2d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4270 zcmeH~c|4SB8^?#SldMA-R7mz+_6*s#F=S~q)7TjrW8arXwnS2x>?6W!2}kHvrj$bz zm8_Aa6A_Uu%Nf-tBRS3a}&^Uw8t?)!IL*ZsSoMnEbCz~*PH*c0~k;ropd z00tnv-QE4qA>kOf)Co&^05DU)+-~#o_G1K4P)z{=0IF%?4Z3R(!RaV7_4JB~DM6a$V)r?Ii;1c3sY65}X4*Zf9~@Ml6$StYV$H4F!T ztJ|D3cj061>~#F=arC^GWw71~3ONvU#32XJy)B}Y3=a#$Bn&ef6??=XzFcsj?~^9fV#mz+9+@ki zj^=TrldIbM#%67R1?5TLi-!6}a^adL0?+VV`yL@0DdZ zX#}W72PjOr`U_PzS-$3uqHW_IPZzsQwWtF9cqy^hFgMI7508;lDyXHJ01II)j4Ot% zSmEXI*yDv<5k~GeI3aQmYGY3u0Yx?Q?+e!O@vtloIjTG!(7_}1aZQEl7nCOzY2t2cd@s|ylckF)AJzml-9zPHwbg=uYon%m1i zJp*U87PQo(vli|9R{aeD9S>LLPim|3OZV64Qb~RXM^GH3a_R&z_~D8FIls$RmN#^Orv-pze`wh z*JlhpDzJC3w4*QjbKD#=uBEcpx_QCjZC|%46nwC<@XGN179Yo1ChA+Prd=Y8JjAQWmKm+m15=)aCI*q z3yas%o{+Vrk*!IpE~DY96*p~&*cyhy3^$9Xg_Tz zPz6HgH?-o$1~~-Apyjc12ddtg>N{v1ylW+Zx(3B}0J%~P^1L#=_Ho>?g7jrxCpQ1^JCDd4xQnO5#VR1wSA)uXJO_R3cJzEXOtD=Ep=6-|69)JeF*q^-5 z;4{-HCDL%8)=hdrL%#@L{+{0=jv#zCOu_B1^$?q|2MX!$6$kKQr=P#Q&B;3$){SBG zkUvuZ8esg3nOI_scapA>C#sMgX?4bvhFj#dVgQ|wMU{JN6s;7BJnnfbdzni z#o>gw6xEDs+R}2Rzuc@D_PtkNeOTg^1mZPxMpOzFMbPO^-cU0*pnVw~uVUNQL+hET zj}^RmjULE)Y(EgGtf~?R(V5Jfhoq&*?>~h;7(OB$@7~WhhfYtuEti^D53bRDA=TLU zQd_-2LvkWyy>0rxnV`=xW!IN4Pc8aen_bspkSHi#512^vcM3wJKfDMV^}F9GgjNeJ zH;|l6v9-AFi2;Mg?)xu_*5%KAjFpMT1 z0@bBFSyVDtny+JGP!LK`USwXZ=Hsy%Su$UYTh7sVs>+-=hk6cuSHH|Y$9r<->YFUC zvc%7LzMjkpuQhMb?4jw6Y5WQ2-hfe$R9tb#qPt#Ddos{>*y>%Khx5k3Ape%@-n6Zx z)?I=8MEITt&K}#n>t-(OF!>aee8Ami1@7;B7Qhr_^KskcK>clU_!j{M0b2prAUfLv z{=*dUd(zGsVv95d$s?2Wb)q1`HYbW-E^EthZl5QrMw%f@IN}`-TUxLI7>SYyhmD_K00)49X_gTHV49;{Om_?-wLOS4 z0pg7F@$&P9z!abm#UqMGAW%gJ6iR1wQjXKMSCpYMy5G4k0PZaHonC(rZPl!RK9ZA} zeG$)87y|%M2m}C*&=0`-IHIu%cwfTjjY=Y_J5aBw|9zu65QZj_P!KZ6WPC{P5#iJD zp*Y@;=3G4b#}BZT8#{pAVWHv4dwH#|902l2se}bbsI*xelUD{psx9X&`wk713%_?% znq5{CPNxW_i-#sM!i4<~r^^|o;x5LjYxOerhb2aFoXu!5fVpP~0ticytRcUX_7%Q8 zi8if=rSvP7)|IDu=Pe}&&&S9g#MN;DJ?wt5os{6~*SGUv5f4jL3{CEF^Bd;?fd@!j z&y&+QQaY}JVh;M2=_n+FxC-yx0h@!f0-EiQioozA^iHF?8PMpH+CA|n|pirwNx)r zrOhwkUSpE5nL#@~igCR|O!;!3T>J;6%lXplSMv5HkB+aboxYjsu+q^veh0!^`dgty9zj~lT*IT z#BPRIwV#G^{YOnGM&ZltDWT=}G! zy)r++?yxIj`ZyeAsl$`P8OzopaPFKQs=cSr>OAt{k8%*sg2yR3Ud;P5`k(hj%i8i8 zkPA(%AGd*%t|=C_cp!!Lx+;ybj>YL>;ByV;IKuPZ^ENPJOVN06^~)O zQ`LaR7bY4~J($QOh*X_6AU}`l7Wy(aDTo!sdrqn${9Dt(EB4n&9Y(ra7Ri%U_Cb6!t$9w`-y?mQNEDF_3Bg zgn@*x{`3J|G*{^1+)TdBt2)G-FnH&un1^gX9ysy{8<7z%+7xG-jM6X7rJ}|I zo6|>}bez%&r4Nz90b%mXFPhixl2tXyO|Nel8SXP21&u|A4`w*2IobNYu;s$k0^@b< zoTHx!Abi>646k2ePi(u=!-Ol_vz!pz1u8!4m77%AXs|1RQzd~%MOp{!Q+0`N)xEeJ z#X@P|)9levtj;>-hO@9@>Q{LpaQdim6h-Uk{(P-9XdU*o zFTAzukC2?Z;@7=HYV0blxfksYuY_!oe#6tZN`IJs>G7_<_`g)&!P5V)`v0uzV$L@Y zX3`4Y=)axhIJ~>7&-Y}8=ev&D_YxO0cP-exVLi=U*h{+%J*)SQt@E-1b zy59^nS9%Q~UcvaAfPYfAWG1CRbZWS7mD<&PgPPX4L_|$hUqpW#?8XIBrug?$Cha>I z^{BAce3*BZxn3#0$bV6KsRTI?h?%s%uONVPyC+hTpIsp22JsAOR;XB(n|@Rt8R^8A z%9}Tl7KE*{kh=vw}R|3wmNN&6omZ8IFogcJ9e8OMZDoJ66-VqLV5wqG3+f z+#``E%kBhK2t8$xYQIj_KJMI6^D|M@t*k`7D;_>LfKA<%K?ySVnil%~Ct?)*eSK+E z_B{WZ>NIBcT?pb6aQhj*Bojc2qosi!Ee&Aw7e5;Nt@H2T0hol-SaT?-WnfNjl%eqP z^5R&p_Tt$V2@e!+cQ}><*#NwKvTTA&P70L$K=6&QpQt8K!li8?=DuL?eeUZVD2yA2 zned>xIo#|myUi@DoPP!~CTlxZa`A*?r$^_ZiAi>lPQkTv;nMBTE+j7i<LUZO2m^)&N>n>Svphi@;9tqgk)i!7m@UC+Qh_l5S^30d>w^Ud$F#77sYR zOaB8{$K^VPfn3J)1QrCBwT~wU-l@?b&X%}Y<-eh-pNUOJ^~EFADpskL#A#32 z!cz9Sr~lG3RMl_Sk!#Dh5ci)22QB?(R6jp6E?;A2Yg1XbcOoKJ(S+^Sk;;2lUgl8r zWmy)6R9wI;-H6+1LUz0rF1v(~lO4ADPdP4G~ z6r@ZjOhV#;Z?CeiYmS5Vm_fCtl5jbh*~bGBR=Aqoy%xt3K}x@mwC^< zRHJ=|e>6B~Y?Sqan!(}h)vhlS)qe;m1=tL@rm3?%;J-{)-zR-LTWykNHFM}BeV(*vu#HLU zAD6Y+dT*bzzQAZz3dp_0M7wR-ryZ9?gKe3*Xw)6m*`#i7__tEu(d>av>R;{u_nF@| zo?DqCOms8~WCoA31Er&3D z)a@?6mD)$EO*)1B#`FJ;{iWo;^_VYgS|-y{xpz0+?7utCPUu}UKH05;U$KggU9*?L-T?D=>^R$ ztcSdCSSdQatSj?id>T6(vQf~)kE(cUON-5iZBD1iU5jLV;T+ffvABbe$$rvQY)-UP zXL^o}ZgLCW(qJ<7rLfq3vAx~PWWYntQfFI;Aztgug)Iu#j@1)eqN&zO?r9K7b|-oC z2t>F}*>r2oEn`s{*41&N`&~_~>H(f62sSqCno@u;c0)VF-$G+DO^Z_9r6k)k&@A$6A+p`Rvp(jgbwZnS4$1mI z7ZW92gU|w(8ssR6WTY~`Cf}oz<%64a;vN+HI*3mFF;bZkKz)+p@jTZ{u-6UG=5xc{ zqg`L59Ud_DIzkX_x+v4^ioo6TT!!cKw=Ehb)JV8l?wkpp$0wzX2~z);mtX_<4i#!V~+g?+hFKzzY6$>Np8?-qI0=aTt37$@JYO z#Xlq!()17k_lcDqSto|s6j9fZi9qoi<&DEq3+c_qi9(o0nYdxz5zWCpAHMy z&(Wten%5CWBUH&34aR-Mw^{^SI6s%%n#6lu-~Tsxar-9Ey899$If()dN2HTtxzHvv zSH(y#Vs0n(nip@*Pm0Vg9MyMh+iZYLCaK4Y?1OijRyjBDZmr_J=|$Bx*0M{3TU}Ku zuDe3*iKOWEXL=IT^^>-i&tP0qOn3b1+&(rrJLr&?`F4pOnA_7UH4_qca!EUvp4EYW zH)czl?wwttd*?~^N6p|GLJ23zILj!a8}3NYvV!WXHK`XPg*z7CH znpIaX8yvolo<|Q+o#=Oci=;^+YzD`u-TM?DS@=BZY9hDp)(FqApHHc9F%Vpy{+Jgx zncQnhAI!o8M9j@qe0s@jo7EyBSQjzGGdSd%e(Q{~|LgZZ(637DV|WZ`K&gLs7U4sV zdj|bfz*GZ<0nL5qg!GK16Xwf;Q)7?^c14dr**V^?|LV`Y>z}FVcB4u@(it5| zqgRCGJ;?XHQ}&2JVtDrOvkEsvC~=t}|E^=GyM=#gzJyxrmM<5UJJPoFQfhE6YT)%W zPCtT-tv2LQA)#^NGo7{Pw{XAFj8}G+Iq=^br99Tj6@lc$bLVA*6*2~Q36sj(eXQK0 zr+3K5qR*V=jkL|M*3vbHkOsVFQb`py*L>$sKa~;PobJHqF`b=3A5Pa1acr#S+`Xax zA@tTtB%`AeATm|kL(g6?eo>%PUzUpA z4Yd8fknyJlP3Z9pIezgRVz!llIka-{_JU%Z=wis;<-$9#&X}$^6*T1+9$Lb<9;pPU zFp}(D3V(6+b?wy~$O0ae68`otNA!J~(}Jj_%1Lrg4u{S;waEzvj^q}XCUzmaQ8IV+ zphYI}E|}7D0Y|c1?NZx=Qe4Duiifo8)PE2QH7$v{j>%S?l7mhdQWbcWR2zNzkTuR1 zjyC;8r%jwbwT(U(U}%t8Q#+39Kb`xOFgs{iZUmT2{;pj_ky1tV$$zT0uGQeq>8so$ z)z`O0z98JA-`0#IqB3RXcuuN<`&Goy1GbeE%ZP!iqp%k1T6|T)8_%2t8FH)%&1~$F z9P4Fok*|C>Fk#GiWlG|ZFG>7g6qE&43N9I&F&F%YRQ_|)+DyJenlLV4k@PvCgD^%y z|KYM$q9AiV{{jO&0~AJaf{#^wT6b9>Y*ktZsq3n1TpPhw zGhv)8Grt=RAo&Mj%+89j7gtuxoEuh?J0UESS5m~!m&Gbk)(v8kKqG zuBJxtuuNSuf&Yp9#qw)s%ojGelfkVFgR#v1e&Vc$hHx=TL%}}|+&!m2008_Avu?ye diff --git a/datasets/common_voice/dummy/vot/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/vot/6.1.0/dummy_data.zip deleted file mode 100644 index 026a291767021b7c416b6a947fc0e8cb12e5ce05..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3339 zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_29c(h<(EKg z;9p>e%l#riZ*wqQU6d4Lb;jk*DH)O)G0XF~HshR+q6LH*Gu zH(dP4RxKVB`C#K+yIYFZPrgOg`|skuvox)eJ?Y*24JON6t6uFZ`?<({iO7v?XS3=n zhA%o!Nj`0H3v%4JW6j*l2Y*a-?LWB5JvG-z$kyBJg-4T<&qem2vQs_>Bj=pZ*rwxo zB4ZNgZ0Q}qstdwIeh-J;`-08?GL0ui2@O*?OP&lIsgOJ z7l`#q2-K9+GD1PB+wI8JV8Fw2{kn}1g*eo69%}5HlHky?#GFiEMoCS9I~NrD zqbUFw-;ivDoB|9;2-uR;;u3h6;>cChO9G$*fH7Rb&H%Kf1@=G#OiwHUOY`7adK&i#em3&lC6sV-mUDPuHG^Gn?3w`ni-u9|Ym_zP!?oX4ZuOua4J zHZHi4R4poQwna_odhVX9ZsLt!YyZkVc1cA7T=&ex7;A=!oQqHPIm_H(A4?GWD;`bO@1Wnb*edyndD zeO^Ybji0ZsxHFANqr5E?cd#<8MSzCO+n39_7A+M#lWVi-tD%)UO=;YbU)6FmL29R*yjAJwd33i(_hrM=wf@5_Tt+a~T{Zg==3x+nR1f&QhA zv(rzwu5jwo+3O+2KJ)tD1J5{KH&=!!8+TgDet(`iHU04E(|>-qG|y;0qvOxkVUl^Q zH9_RAl~30DS#LL%t)J|)Uc$O9Dfgx3))ki`g_OnD9lRdZ{rS=2w|a`#p3OM8uw8&D z$?@Tx9qiiz9QF&fAJOQDywDyZyHNGfvCtrw_3NUv+ookkVcgvFv zUVqzIyXMnRvp=`>cBkH}-D|J6{NtCbH6PvF&3mSuIJ$d^w&t#mw~sdWPf?xn!mFe8 zrfbGgrh~dSytJQmCWT2X)Z<;UD(u>X?Mi2lT~Ks%UKz5oJ#0tH%e~yJTRN^eZfKlf zr0_rRkoDR%<-2GZJD88dsV5 zsMR4T!Z2D6j7)OOxEdf5P}>C<-a3MqD9sU8NOOb)163Fh2L5$)MKusl(}b{1m<Z3+6KNBqnF6#8)J(zZFGPKc+DL)e2Fz0oOBz{$6jDIp^cToBq$Uf{TAcBNuok%x z*M|lt#M&q(;;n^O>Vz%Dtdf!a$_#9L66;sA0t9F^mdZK68)03 XF1uNQ9%5kN0>Uz2Jk|qkU|;|MQ{-U} diff --git a/datasets/common_voice/dummy/zh-CN/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/zh-CN/6.1.0/dummy_data.zip deleted file mode 100644 index 9268b85991f32e3fa532c62558225ddb5d60232f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4204 zcmeH~c|4SB8^?zkWW<>|LLG#t45P7RIkuTJ*@Y}&GL{*Pvcy-s*|{kyN<{Tow5AP5X#eg^Zr z&Hs7$a^nFA1F(J`9w#tZM}p%XOABrQki5gpfqD4_@&MRC<3IoaG{Lx;u3JQMbrENO zn6sCUx1TReSxz3N09Sy+y za55LLLi*dY0RRWs0RTmo7z7_jyr&$&*Z-g8N+4@F%w5v@cDY*OADc8FAa@^+*0@#7 z=k$_q&4aNuH5{7_&ZbF{x79=TQn*kOje6|$7hZ=UAOntL0&@a#!_P~7FH=cW(v`_w z;1ibl*y+uQfl^!#4p1^oUuPw%@yg_G#2$t%#%kmtthQwT&WYYo$_-9`&4EKdue+}) z8-Gqyst6{H6nIA&-0?OD?O*^GG(t2BlO$p_Xkw)5Karej^^F0lJB9T-t>g87gI*0Z z?Gk+#bFNa0bQ`B1r&jTccAPhUduaeX*N@~dNoul!u`LDMCzg8eNH*~xMEam&*t7p!&^(4QTo z2MW+;P2;@UBJWxg_#j1yYTh?#1 z#a2+q!ZR21S3SH>=XcW{T^O1>i2mr>^wDrYUq8kC?+Twz2Q8pf>71lYOEr zUFMD|vKvl(kSWn=aw6LtEaI|X z9i{Hn@h)xT@uyEAdne~_2%c84YZ2@q6xnNiEPXTgiJlqIAlWb5b1-$mb!Qjc_+(gJ zT*PR1oBOl{@6L{v%-U^pP37A$A#%DMW4)MYm8-R~EIczShpkOEvKK_6r-Q8~pq z0gB-XAPT{$5}q@TRP7PFE>3G=lTCZNT*^TbkKLrpxwjQ1>i#)*x@=ogJzR&9Q&X!e zP82P6Kj`V{dDdc*LWwoL(L<5-K(|=m5oJsAnQ4-_22~lyLW$(_3g)f%qlMj%S?yB+ zZrRq~P#ic?J}czhSI3dgVIf7;w4h4a80>dTP;3{6WEo@0&+@tFMR3FU{7B1|IAa4K zb1cV?Bo>lsWLo*j>u`c~rzG&omH23IAyLfc&ja4lAAwO%!i5B%qzx#zO#AYv*8?ld zl&_1npKOmTh>n6b3m;9+ZLWT48rJx}yBTR?Myi@F`gp|n%;7U&T@e?^g4EHAZngu5 zVtTq^;;<3D-wQ!Q$|a%O*lDtzXei=gYK4S9ggYS=Dgf=sN>mEld*1KSR@8kn@y#nwMmg^FYJbU^WR3n27X% zKCd0L5Ocg^gGt$Qau?kj{YtxKXU@5Yr04gB`rgcW1!}LJu}cY%yf#?jA>MFgMfM z9(+65GX4csR_=Nhdcygwe1J4&rWMzpFrJKrDKUIyTJmr za!nLxV;2Rkm*|DQyaEI;j$}Tr%J&>g`JRP>l7Quc3rN)Jg8!2Be@t4N^p{EF$jqN4 z{WteBc+A}Y-OiSy;_B4D#AD1HAS;ZE@v~y7*6nPGx8gql1H7(A%iz@+btU*IW8Qxr z{57%u7`rxzt;AwMtYE(}7zX@1k<8SPxi6P{UY#0N!j(8#fiGu>A2-dM!PQ6HO73Sa zR=6t;yq^QFw(ynU%iOGh*BJW0;g@`1t@kY98GD(MzFDgppt#y5ho KnOYeP0N`&#HvbU- diff --git a/datasets/common_voice/dummy/zh-HK/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/zh-HK/6.1.0/dummy_data.zip deleted file mode 100644 index bc6d96478de5c17a91b4829611a71ea48b5e112b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4210 zcmeH~dpy&98^@>3$cl-@%_73%aGSOW#h62xGdY#YpX#=~^oAQEeX9P^bw z=IKrJA<6GXAmvdzQ9I?4D0w9E7o%tAIIj6b*dzyxFMVu?Zac5pq8Ad`ImPS>RjwEcV{`rCBTTCn;+Gk*3bZ%GP`)2Z7y z5!rLJg#^vQ8d+GHDwmKM^g%KoFeC&gp(A$XZA1k`t|kjY{D`Q8%!V}xj#mrr#~;xM zInHUr5f*<_g>Z6W`)(Kd^c(%KE#r~(16qP;hc=LMO_rsVcF)TOsCJyBe$jxq|9Ktv zLL4r}R4Qa9g>y?Xs!duqpD!)|E0660!K&h3xTdQKmDbo6C-b7_DA|&`8}y<$G2-3= zu=j0(8Kt;=yfkUpR;>e;G~X!|doOgChMKJ4Q!Uf`JcWtc85*PIvz38I?3!Q9n)uJh zd7^{af<;QEE_b=vTZetV%Db25Jb%eLw*UQ=z}bre<*DZ%wtFX9w*w=W*yq`p<+GCf z!OjS&W9*cS2{Cja-xq)xsA$vDo6=N? zHqY6bu`9o*DA<&X9dyb2Y>Xu@fInL*iJW0pr#B-*RIIa!Ha_^of+PIgz00ja(%HqB zlOj%+G`HwBwA?YeKInh!^Ncj^!e5O7OPFUPxrPci=%{fX>Ov`eb7Or=rLsbwRd=U$ zXDUwg@x%~fL1^@8r&Rh#ajm0LU!O7O4vIy&u*ByibLuvyk5uWth=-1p{==h-K1+dR zr^YqSyIN=!NZj?MVxS7^AiG}0{TVE3*kN&O${-D@P5UuqlN;=c4 zXF%_C{(#s`J3}DdH}7n+&Y3%;uGZU}LGTTNdAtu?RK9sD;#M&&Gc0eM=JtB2Z4XsO6{Q~pi&01@K`vyDW}#96RJ?$MDX871Pmpl_a2g0j6!Zwg`-4=c zi%rU>x&8e9Y*+p>HS;dIde7;l`Ph2T6%m=UZMBFfmP8~wk;q^5TW7TWU%&sgewH^m z#ITG8L~LO7Z1N=IoxFc0Ag!fazg80p9vG(|q0mIA4Z@sgosNmD&5GEP)(}5Su0vPx zx`n2>dL=`hXRKuBcMHdt6R#W%nC$&j+q5$iuOr(Kq8B37GIrT1R6lHkFwK4!%>weM zH2qUzWuB&yq(lyD+2=*g6dHac<;!AEg3 z-p-Hk1=Tgxno0-9aj5T$8YjLX6*@dpad^aYM3>Vsi7!i%to6V~x&X&L+;Jeq6?2OZ zr7@i5xV=rqSYUde7P~F|)$=ld*zlI2+JuUz;N+vp<9?(Hy9trxQ01q+GW)cj3a57Z zcLtzxf|CueGC}7OSQ|<5#1oG z=;_8c_sE`)JGJSp3ZtjGWu`J|Gp$Vu@?B}eBOiu%W(C%Eok?|HY!X8?QHE-&EOnjh z@$ZM{y3N<(#*cy4wWRx*&G&yGR|OAf|K9uj`5oZi8wM!Z6DSTbG1+sD4|Ke+dO8w1 z>e6+U(dy0BXJId~mHhO#mN5DPvCp*KBt;@ID>F)d>oc*ay^kAubxmx=<)?g3ckh%$ z{u|Mn#~G<6GXmK}8Y*&q?7d_B@Na-7UUR1#K!Te)`J%q0%mKYaV1cr02|OtT0Wqp! zJ~lfi_bMf$YGTa8<74BP#VtX3d&?bW7Y0IODO$~V89^!_3_Ru`=bm;?w;OS-k8n|H z)Kad9 z3`f8O_jC~Ph^PkqE_|a*>1!97b3YL}2`il=*?FAUMlr(PHsq#Q!64yOaCEu8kd%_r z-SG=iHE$vj;81R)yB#v#$p9?6Js^iSG%+oCb~yeWN^wF>&ID2Sl*5T$n6@qPMz2?yM$po2^pY=a|FcEltT3|_ zmOVCe0I&AgDlC=4mqKv$TGpTT3g{~zvb#eS~!{>wZ@;{tMnIT$~yQnl{R zmU*lG2{6FxYP14g>w8y&!x%&3*TG+1@XxW|o4VCl5fCfbZ>AIj{sVhwYQU_^l|8RD zBdg&r*;#?FSd*V`n%RSE57E`!91d2vtB%xP1Ft;?R)Y_2WCi@)sqjzu^=8j9o>AM3 e;>JN(!GAl6)}yn7nbElzcQ~V%UN9H{z<&U=p5-F| diff --git a/datasets/common_voice/dummy/zh-TW/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/zh-TW/6.1.0/dummy_data.zip deleted file mode 100644 index a825b9ac44eb7046081e28e68ae3aa8aa7cc924c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4052 zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@; zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7Lle`W$1>4 zL+s#RV28^GB0!IGFkD@f6m-(oU0jlpfq|Eafx(z0J4%WYGxPLHip#*kHEEt*!mnv| z=)rZif3FvFpg`;EkFisJ|Nku4^t7n_-JP>eW#%(_9kUhNLZUP}6qGb`bahy2r8d8c zt`M$a@SgopW`+lGmAB8)t9dZxq5Jx^-foqQWy@ ziE@c!Q`WCiyKQ@NPw&MHoz^g)pe08Z8NRr9W6Bf()nX&7NvnL~y!)p~FV=N`r&K5% zxn)k|G@V5o{3IT(aSva0N&02S6!WD)o)bMAlQ(D@$WC#dy~RzjgyGGsWw)=bo!GVc z;=hN1iZhI?_Sr3a}#gWmX5#N_@ z;BNfyv#cy-_Sf5KdnMjYJN!9xp8WdthaaVWuKQfO_tca>`BUsc=?D=E-CeiW3jkx` z6A&Ac6bmV-Wd!5k{C+QHM*+4ERkh-&|L>lCTg@@;_?tI5MH6;&KQl=v?eGooKI-J5 zE7sJc(cz!0x8#%P9hU6u-9HodCDxg&-0||1R)=O;vY~gQbW)*_uK(@6bfqnypOqgG z>=ra{>}M5P>T8(EsXixavDH!A)_d-5Thvkle?@4tUdwb*cdy+jl zCP=U6bt|(oubiT@^`lJ8F~fj&T;e5(O;Z=lVN8zom~OEup_5~6yM$`W2|IWF6>DFY z{myfZnb-TBzu$|M)#2fT1@oi~@9g8(H$Qy$a>4!|Rj%FxX!L zu{lXWUzV7Y2`n#CQ{WK*$^ck0LGFazezJ}{ZLc53HrZ^KJzwqOwR4m0yu)7U<^6}4 znz#bpyMkQ1GBi3eoE&FqY+wFx!UwUL&KiYJ?|*vwU+1;s@}=@!H!l<=@Xfy9c||7o z*W=2I&cV+nzqDLBb=Fir*ZWdJnRC-@xz}#>U3*OJJA;;>^-%?mr5Pm-jZ8%=#U^Gg zcbs$fz!SlPg&*!LJ#&@ITQ#(E8%N+N0YhO1kyC3Y?Bx%c``*F&dQtvFWwTb-My;z_ z(^zv%jQSt%*=*hMEuDRR-2RuhceXG-`nTg_+V7vQ=YQY7eY*MW*YW)J`tSDNmr6gN zqki`91&-sgXPrL36!hK8b9!ZERMNKvDi5Yu6r{GV5Xf0EE5tUM&0EiW|Dy>?zui3M z9+|Op6F-lDi-pswL#*5;NoQTA7*BdzrfZg%XV3gi&c?aj?5Mz%P8L0ttAQ00d6XH- z{@LGrdiHTYKmUBYd-cCRRn&d|{_3u9WOsT~Jt9UyMH+I98j%#EC8@SMurCG-}sjX;NOY>Q9KKsO! z_09$&HIf-?9lg>yp6=cz&LnW=vPt~j*$X=Pq-Ni?p2*&`J0Uu=TiAN{=J(FOts>)i z-aJnDd11Yu-`gKccgg6^TlegG`{(zYU%i-Dq1WB#>?KpS%Sq*R_7iz!ubpl&d*+7k z%`3gFy*$`jW_tYMqD!$gHPbdH->%>GSGwKpf5UG^)V#Ky#YCAG7(MTS*pj5YmYIiH ze&Wb-XV32jH6qv^RMm(-onu^`_g*nV@43-Uoj0m8t~Sg~+U)8v*^QTDrHeO#b8lZ1NwUta3lA3%x#F^mP29=L z=_*@zV3ug9)%g=Ic?=UKI7ZIboRqQnlTFpZz2A;4JSsR@PW15|DIvGSl&;DA$GeKp zXy`6YIM#o}Bf9^b%Q4li1&c3BMRrv^(p&V|%|LeR6`RvNQ*SNfT)6i2w;y?u8vCW^ z$HzU5S-k1mhwiGT4S%EfKL5V^EcyGF4S(;*T+4U1XJ=#D70EP(Gg98M;MJjewoc)U zh&}Ja{%(kPKYji0=f@A9epP)>UEm+%FL3xnTCt$A4!IR;K~nfbLq9VgwG9V~0*odv zBa<96u9mX|)CmF%ZyiBQly)>Lq#aFyfhr6L1OGa@q8f;&O-+Pa3=2h1r9OB&gM6jD&(^cloHq_#HDVw`b=uo$&LiEME%u%S=9 z#qj1OVQVoPQ^?kGu#jRcTG0Ts980q*z#Eh<5q?Ik_mM5X#Y&3hh*kh$PlNIxY88cS z?{aoh>_w`uh_e{A@I|)Rjgu6MF^Xb**5fLBAl3uZHZXA?11cgujiHxBxNK)-1DVbR Ngl)jYWWf#M0RXIGm1O_` diff --git a/datasets/openslr/dummy/SLR32/0.0.0/dummy_data.zip b/datasets/openslr/dummy/SLR32/0.0.0/dummy_data.zip index 4c518d6a7ffa048f1fdbf5865e9fd6cca1420616..505fa794f07b31c161afff09248005e00de7c883 100644 GIT binary patch literal 12652 zcmc&(2{={l7Cy#~i#oVKhJ>*| zf^2LIaGFM2EjD6p=4B#8RzPBjMB3Z3*gM(owjD9pVm`9UJlyy=c?=tc0EUxnRG61~ zbXXii#igkccxg*aw2=CH{Iy^PzQ&ZKl&p9Q}Iw%Tmdc|fv#g|RujG0uC>`F=eJRF|{E%B)3;N`48r$6#6Y|f>69=@C z+yhuNpQA*=i#DDQI5{n*cJllnhG8v}px}hif;>}ZDo9ooZ4sWXSP|c1g%#~-()b1(GOWyMR>t~sB7u$TTw|V0Nw0w*2 z`@9Podu=#1=yUs|Wt+<(H>O1YNsS#h3ldiQ(M~;nM(g+V%A}c~$S2k@jRSnopMDLg zvK*gAJ61m1Z9O3N%F>UA?|p32P(kPw!4=__JakE=QF46&l+_|lsd;#*JEMDEt(6aD zk`JK=|G_F&)f+Tg7s>Q5dkH5BC#CO>S_&M7MWzKa&uiWs(=wTs;Cd5OWTdttP^Q~@ z|NA>L0mq-JQO0;%j`Dxs10)m#pap9me5M3`V&VIgt__F=d4sf{d>{|W!-b8 z;7wR;NG5rD+4$LEz3e=&PJVOowJ9LLb}~QS4m?C`TC!B1z5flVUT(i7l@}IEzzReb zu-ii{^*$6wkDv#A8m=~~AwH4w8JJPN$&Tfu2&TbBlK3Ff$L$hJ31&Pyze6I6r zmN95laVr<63+?Sxp(?;?y(kuZoXn?Ra3u?;X~Sif?_exRZj;e5P?^aq_$Ufh8o$cM zk?Yo5ZSVJ8mxs-^Zy#pz*OznfaJLd;6g!@diMl9+9p|VekCxoK`>;=Mv3AFyLURMT zfwyI%xAZDxhWCWX=i?)(Dg{*=Z;31EDY3g~NU)*FJk%ZuIF$A?B==~``Lzj{IaK3P zGHK}o?UI=W?3oIJT>Qf6;s$~}dtXK|cK089K$XVfmbbyNe+ycguk)KcCcEoZP7r;X zj6SaWbc#*8Nr!#TZBtFl4oz?DNNneujVG0w?(XF&Rd4G2W|>~T5d>8}yOZlUINSc~hJ@C`?Ku)~`pGkq_F^k?j_c=Fi5s zQwLB}kG}f0>wZOGVA8hxF>Miy9YwFcovSbM%9E^e;N>eR;SDi9d)8u?>CU7JXUhS&Nkrdd)A2z?XE8l$A`Li&723%`-Z4Dr13i! zvOW&2+MpfUAIG|Z+wz(~AeYDfkN*#6B3{3+WP0G&L zC*AejW>1dB7AYI3H5_|de*V-B#*-bn50eY9S%t~`x+h%wc4s*X%p`J(T}bc!=4Ohk zOJkRHY<=vlr|jFxKKS6Cr84j{z3w2iz*IC_>ZWU_B=A@&@qr7IoTr9h3ps2OFIeDd57VF=214-Du zoBb6ZRp^&CXG*=ijm$Cd~E>4uW z)jRqZWanzCIP9}&!#JZSUUw%w{rjdsJb52(RnLxOzEtZghIO3B&KaxzOBhAzEz%uH|e}EtzZ9aSd zAOz`%^YV0C;x0hcIl?XkfL6jtLm?99Pd^|7PxXg0!26dHhI4}gPirQ4TSQyb;h~o8 zFQ}XdeDT@1WAlZtr~?W1RtE~lW_;U3LtA(3%Dt82vg)*> zXkF$BmhrA5_mz)E7dC%M8F+hNDO2g|1^ROF$gkSlD*GJM?bxbV&U{*7w4%_AwvYYd z;V<$}Tk&q~Wrx@=wo(o41^@2Z%D922Om6t>>R#`i%4-aF=G%4CZ)veooDn$oW+v3G zTl`FXt4HMX;7gko_kYhV`k-_?eXfA{s`Ozdy)IeawxXo#Gd(g}zy@E$k*^>lj$OF< zK>?Fldh9~+Kzx%19{Ar>vQQXpK)eyf7irqq?eUmn<`GEGqtFB1M2wlg`M~1~2N_NZ z$;^dAn8Za0l+Vwk@9|=%3=Av8>9jjuGe5gY&hIRsrXPpqa}N^EQC=?{#iyH5JLt|X zo5Ei=+?gAz-S-8rX%m@m9{z!SXSgr^$p?|_E-v0`c3SDc8x8S+gQK~U0y!~P^G){; zQl}MeH#*xC*2Hc863wlwudz$Wmp7c|u7qpOhlCb|ZSG~}>)vzM>K@FwW4sn!QC`=W zJ7aZU*8WEDqec#a!qvs53M`cLs~xIKY0s`Y)}JS48Bl!sG24dQkMvd=)_$Pr_dr?o z2CY=R)<)f-AyO2Q@KWGd{f-0IZ|X${4fwyqHd+;ON&Vf|%TdfAxbB9o_cJ;+3JY&z`Pi4X5AK1kVC+(^1c=PR0(MeQSLr2Kpb@8GL=^Pr8-9CZ}>Dfni@ z5=8_wE~F_LkfkUeD3p_v8oze$>GZxt9&V#8Qe@GeFY#_x-0SYhkbV3agASI)cr>ua ztgh54XWi79M?6aWp;zNnh0e9;bss#s_35X~sSy9W$G;9Y$H(qw#2vv~`FVWw^$eVr z+H5#Qek(_L?KT<)JNJb3WC7{T%@szW4`ONdw-;P%&1r5JmOgg###^b81g0D4dUpmr z`4iR;Hco!iMuI+i~|bva)6G}*0Mx9E!$H<(9iB&!Sa)^GJDqcXwrux&o4o@_Hi z20U65KOSxZ`3(4tv_}Pg_2|NtV|gAGJB4#A=f!Asd=TnKnFUKw7iAW{y=HyeG=6VO zq=dKa+L8#5uV-ZM6(`6I&+I(Ydh~|X#%yyvj;n=DGR)Wc7_%wU4+d7SxcT5iq&C$) zFDT#Yx=W3|h28PizH=7}w~r)^Y{A@(V_D})M|X-XH{n|yj~ty|p{}5PEO+XMJ@tuA z;dL#IOk`!J2P20Jf+TheCLIw$`A$}cS?XWDXl&qCA#T{y!696#G!^S9++>_Lk?e^- zKXz1ay87IjkiV|R);@J_>*1tuOUUrO;MOTp#?qTVW;KaZJ14aJ%3N1w>I%!mGbmty z5$DYdPk*3lVo`|R{KDxESvf)eBRoWTa412=txO%G?NrYi*W#jkcl!uEtAm~}R=q?} zNL@REap&P(_~fex8CT+X;=Vhuck-GroF3M_y1jlnGO=gP$?%oMC%05Y=mB5RCZ~N@u}DR^_dFrnd*AvrwPd5$TKe7;Gj+_KnDMh7rme76;ODHYbCJz|E5;@ z6V=Kk9kGDQhwTn|&K1o4kg^`oa|n8fh%!$~c)`5>Ys@c9$^~NH2Lb~R5uibg64al? zY})^txhsu^^81#v_i5#D5tPnRX7X|Dda3HB7_YQl>sHgQDL;OXYR}|Q`G`VND&vsu z`$FwQ7x!kpy#1#65!H$86sI5^Ch;?4Pw8F>_#IL>o<{k$GtTIU(=CH6VUq$=md*X@ zo+I+<22JR=$aSvj&%&BlNlREpa5{_Pi(1i?!M9V5f@$i|hx;C9nhV@s-RUCG{y<)% z+_|4GP&Lk0F6PAcbx{wyN`&6M3|aeLa(bH5A@g3zKtcjjsmZCmEfc)9)?b<}*y49+ z)-Z~+3}rpZc)LxAKiDNxk2OqGRqE7Wm+fml$1aga&90AExgQz0%4+JZ@xfthQNxU- zO0GXgs)86(4%eV@i@)xa=7*9aoe7nf`tIkpue;U|dkhd+RcU|4ddLmH6{MY%2N7BI zH;BB5zn0VKC39Z`%}_Ym!pWog_;AJ>bgIdDS=G0eg=6CN8k^enMFneg+v{{iu|i^o z2D-YUx@IfU)I!-$BvlwTHQCa>pl*itwE`V4r8BS#+6mar{62|EkAwCJYgaE!##z!s z?6;AXQ4qHgS5T0Zm6sG3m&Zc?F*`Z{u|Rl;s7W!a5TpkR>W0p)d4&!WMd9)ll2?$G zvXPRMQV^GwkdjyUsgOOS5aC&dLcaf4AqfQu1ser9S!rom8F>Y{p9(oa3K5=VDCG5z z6`Hk>tc@QA0TJ3&ttL_?qNId`>X9_JZL$ zeA*0V;)FGX*M)xmtS-x$kQ?jILx8b0d{7F;)`XRc2fa*~?#Oe&B13Fq1o)s63|t8- z6@L?ofuEazLlO0S0N4VBkN^gR!mA7T29ra=8p3U*Kx_wbIh*y%q7BS>hcAwjfkZ*v zOa2_n3=#!#R$dMYaf=ISF*8bp#SjT`76tUV7M#4!(+ms<;iE$^dnK$?d=?UgMCDjy z5=(GD2p9kZpEZFwBw?lEsi_y@+9HS~Y83$yNG7HNnAZ^25KbA5Fdu?&vmZ<7fh|ls zBvt|v3qiZtk3Wbtj>JNcZ}wwZSi}@W?#q}4VM7E3XFuk!01+3QfXx#TV8N#}d@z5d z;O)wm_z@%=!h3TJ{Uh7~6BEKp z#S5)m0(X9(6V=isGKe@VkuH4(;lmh2V$%d01Goi8Aq!_W(5xk_A)Lx81RQC?E^Rcj zRLDz6NYbQT+NflykQ5liE=d!2X`_?*ln1)^@V-8{*%MYOeh(6ov>%cDU1XF3&wT(F zp8LSmfUr{WuNjuiUx-M=NpO)F0zm};1#ifJZzQKKG9s)^dh1IY2h3X=90=h3YcLQX ztW-RT2?4nxcNu&ZQtOx}brugqG z0K5)w(1E@uVWr}eRwEQx*y~8r6FQ*ay-2|a*l^N=79e4z;_2C!L;7Vl1PLSnaKw!b zc+3ys9ztrsNU*R8me|`vvLD`3Li%2UV~My6_maEQhbeiDm5IRew)p@aOb$+AI m1XqX9S0b&BJNIu{XL(&FV|^;P#CCJV?`-lo71H%d+HU=x0 zoS?_Id6ASFv6MhYF{^?_l@yDM^Yu~9JLqY0^%c-`N3eMi z9rnUPlM994fXrh8x@(~t#9a&3;O+u36LeV6-1c*k=+}Edv(zDOgD4W3yiV)^%qWm= zHz>1#y$NMAS3!dd=v}CgGEk@@1Kp)3rhn-O1e*L9Vmi#IJKCa~?}!95PEwYV1;qjc z2r#^L1hJ6gIAfu0<#F{gilb@KaDOppYo z|5reSfr$$v3{aC7+5`rBc}v*2IT-S5|&VB#Rea`va$J~sAQv?Uw&F%h{ z$jh%kd^kCr>sBwX?GCO^Bq!C?RsuL~$twv?_pT;5!ROXMJ{%k87d9LY`G7g{As!qL z)rVe!kBNd0IXEmQ7_79kRaJ4Ie3R;v_rYgrKI!5p>f-L`?AWz@nRSlSOtbRmD&+Lep|eJ zNd79MKoZH#pM(se=g4FyY2Xt&ezA`$TFRj=LhKD0^wD>cs@(nHkZIX^~TN zRj~)}&Yu}G^Plzfr2R4blhpfON3)k6(+4)5ByshqD!kU5*xl$ImtAq+#5V2CspTKa z99_5eF5j0EHh*hB`Go$0WzJ?nyIO8VC!JJvpFiE{K(zJin-jhyrq={Hn9f@unp}BZ zGheLGCh^``?$*~kZS>AXK2Nf&Xxk}xNv*A(=xG^Ozq-@UZZ$DBTs-j%Ij??qVi0s? zNzKAn&A^S&Vf0-Y2>v@0O9cuM04v7wDofP3B8d-$5ZhrvwfsLLF}vLrVus(iT5AB+`+9re>K5PVh-X*N#NO(z|LR z5ra00oFrkpoQJ$&IRvTqysfxRXVoT3Yv{aRqJ1lTUD2sP;m%iO$N%gzHa;KzOeeNL zsMzMd+4twfcyXLY{`m>PT#j){-A#P_>mp+sVnZ8S6Ps}fIwi#RM{=zN`!*-vRNhq< z6q2-Q(WP?yrZ&FwvqLL&mbrTryl5_{^H7MsD(EKZ^6~5XkEyaULO+@>9DeQYUY@hC z|7WL_2+=|dXR4vwzTCj#kb{YxtW-zatTlD1m3>tbX1dR-mtR#=IpzQS$&rApvq`Qh zYl~kSd}}3kPXoUl+761gG-e)lZ5{~vyu1IhLYb^lkFynj_j)`jYUjnxFCIAKvXf5j z&&Mj2l#=*(PTvlYH|d41Z)ARz{yX=eaBFWEV`WF zR17+N(h!6)?pK%;;|shZN&QvqWbkwH@J5*T8RVELAje3I%)M^rxg zW^T_}nVPQEc^hP3J#cuyu|!bD&z7^qfJff=(StjmeM1H-gPb2ti|V$juC7^KQ=fa| zuk(KK{4(yQ8YlJ&@VT5@RoL8q;p@uYh?v#So|yFS)zr|_+_I$f?%cGyPAN*#;#2lUMV3#KuQE(mNbENd znt0lPQ)pM0cMS2${ql!7@joJS^(N%?tfbI;S}+M{OG8KgD# z7Z&Lk*u41C+LvYQd$M#_RZ3HPr+%0mPl%89t2pcZPwgY*c(kJ$W%)gu<|bCk6!|I| z-I;L1NA%%4@gnsMM*+#Y;?-(e%0@j!i8-e0ZNdKvceoLX4_jZpkGCh*O)hsRImvaGeQo9TRXyoP_i7$Ets^MxJD6Lx zmtWW;y5j4`nNv$AP04SRk-GQShEutp3tDS9c$171b&389PEC1K-M5aj{nIX;o@Ubw z{etEGE)F7Y9pzeQQe}63-qHJ{z4e0Xr-S=Qp5iO=yu6p5C7IfI=$dA~FyGVo=!~+o zpm)mp*UGx0HQJ)}H&*P~J&BWmACOS0{_c3oizg(L9M&5eF8$I_vu8?1{APYnCvNS^ zm|TAA%m@MGJyy$EG(Men4o+$T^p$?4+t z&jYjHG)Ll`Cti1&vuNpSu>haR%ChVsnJNv)Le!|3uCBq5u^P-Uc8Ciq|1;hOV769p zF`dqkTX0J~MBZSqzN4M)LJB|VLStYCV*@bH4_R0XEZBh#pL8w2(+urtK!vddI0VcR zK72x01MJw2wDtcJ0*oYRnV|o{JgpI`P@Np8Jk8Sb?JEsV4U!U3?6LCl1oK%YrR;KI zi!F1siq33FoV=IIF=?yK14%#(&}azlRf zZLs?_5XyFBKZc`l6@qIGW*0)W#0*W?p^;IYJ1zm$!cU zg3`OVykDh#=CnH@aIOsRI)l^RKcOLAZ^hGCe#f5%9HU@%FJipF=VyO5btm;B*GaPs~rMa2@V7trdT^=f=UJ z$6()^t6*e!EJ}pyT98iYk~ZNY4%wHn7UEyB9v)Zh`)Pi&^n|kFj?*)59lum)XnLVbz4;^k_m^g^0f#DJY_)2zfw)eslkqN!S$;Yp63Yh%i}U0VBF!smNB{ycSnRQqv+sKgocTzJpGr`ozY-Xa(&2Vyzm74_xyR_7- zOqwydIoxgr$j(y{es4OOu}=<|r_RntF#HDp#LZ=r-P>Mp~V;GB&!kUaNBFh{-4&Ma^ z;er64GzPK7fqK_A4dYDhC_=@~F>+gd9 z6@GUm_GwGacosO2`uOQ!blOj+8LAijAcGATZnkgysKxurBer{{oS8n~%C73pjVrey z47ML)@XRX}--Jdm7#k1B;6*Gj7#kQG3d4~6cPM-mLnuh;`799%;jpE-nVR|#hl>Gj zw9`MsQ`{6=Z9?o#97eKfldQrXqN@Y$qxU3MuFuw zF0eF)VQv2f=>lu8wu}cX4GJtj0OvI5&|@rk%YQszY0e+^)=mHidJ1Ml7GQae2P`$M zVOZV(PBQ2ztif7C1IvUl3651$bbmv;P^3^S0xIi^47wfC(Nr zSj4qzd~m4``%4dCx(zx^RxHqK-T2_5(0&EL13QcUT01_t<_-I68NdWPlepN}ML0&k zr!$m!akzd0e8cR(k@P+0t(~C{qxW{OH)7zu9klh6O~LDM5tWr&JmzK;6HrXY-V}jX zYtWWO=4GS8j?r+KxoAUc2YcBA-aA2C7TF3cRl`HUIEzZT7-16b9%zV-H3`dT*h>!Z z775zY$$h{i))JMrJI0+7S_{}q3h=T7+S17+&d~-%ea?c$M%f6V#*mNFx%Yq?>nDhD zQ4NPm!bT`J<}C;cAK)PxTf>L<}RVvGjmt2DGJxEZBth;;z8j7=S|K!&z-@^d_^4j2i7!HR=M0b6ULX22m~riM?eu0jTk zqOPJ2fCI(KX7D$xW}&TvpfZUm+e3;F5^?Bvw2v~)&{YY${6P&FZCT{QjLjm`jF$Ci zxrI)x3mr3R99tcMg&wqJk-5QUlinVQYP}drY9s^&)#eFt0*ZqH?O_c7TOEM47__C6 zKL8w*cS9fXHW8plu{o@Yj_8&L8+oN-tQn|eu|2ZUG#QKtx4)J{jI9ej>4 z^=MNb6w1+-PA-|keCRPiXf#I%fkFJCNqU+$3Z;V(K9vPmPy{?C6mUu~pyrBIW1$9) zwsdkYV+2SBU{MiASqO6x1c050Q1nGxIyq8``DkE~(1c?Q2^hr!Z5{Qh64kEA}%qZ=2GH;>ba<-3iVvHrIYg*L!fXH!wAYN)9Nnj7Z@W{&7}`&6}Te8 z^o*q-QKh^>P!ySK3~d(+YRj)+(FfJ`J3uj~>|#NoN-lj+!Lq-@Qxt(~3|$us3e|Dx zgAxY>``6)PPSwSNT0;{)sN$k6oqUNgDEe~^Ar(?YVOrD0qCz!X)K%1|jN}=*iWyWC zs+)jiViZLeiwae6>7#1+J*b${bFrvU{gytefN8(QR}`|Z0KS5qj%tcUg{rsoQ3+1} cEvT5%a?Q=SfVc!NeJbH{MRB+@z%~x|AGn+aLI3~& delta 804 zcmexXbcfF^z?+#xgaHKZMsWv&8Bl^7NT-zM=2ph1B$g!V<5noaz%aRxM^YA{;sKgb za$ptmNGi}RQwFP0k-=?-3d3Xt12wo29ANiaP2O@6XgGrl5bFVr*eIw!Nk~B&;w6ZZ zqgQPnJO)Y!F+!9S@~BKctHnJzk?|=jSl#3XeNm9nlTYe%Po5*LBn8tA@`YQg?y0Xp z-GxBi`d}Au7Zl~^7L*k0l~j~WKFaz%g^@{w0e1)j^#Xwa!+%E*jS`-?H3CBv3P2h) zA>peV;Em9MC4`Z5#6!XxMF*A;N76AH68<{)>`;L@3hbRb;Lt#_0&9%GwQOsYV+6+! zh7Pm{LekLz)B%kyPPoHB(ZvT>#IUV#I}