From 25dca5f2eaf183e57121cf71c03032d574557cf5 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 21 Oct 2021 10:37:39 +0200
Subject: [PATCH 01/42] Add test fixture for TAR WAV file

---
 tests/features/test_audio.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index 293b7cd50b4..2086e77fdb6 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -1,4 +1,6 @@
+import os
 import sys
+import tarfile
 from ctypes.util import find_library
 from importlib.util import find_spec
 
@@ -26,6 +28,15 @@
 require_torchaudio = pytest.mark.skipif(find_spec("torchaudio") is None, reason="Test requires 'torchaudio'")
 
 
+@pytest.fixture()
+def tar_wav_path(shared_datadir, tmp_path_factory):
+    audio_path = str(shared_datadir / "test_audio_44100.wav")
+    path = tmp_path_factory.mktemp("data") / "audio_data.wav.tar"
+    with tarfile.TarFile(path, "w") as f:
+        f.add(audio_path, arcname=os.path.basename(audio_path))
+    return path
+
+
 def test_audio_instantiation():
     audio = Audio()
     assert audio.id is None

From 52cc44d1fb1739101ebb821f96752767308edbdb Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 21 Oct 2021 10:38:08 +0200
Subject: [PATCH 02/42] Add test iter_archive

---
 tests/features/test_audio.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index 2086e77fdb6..16cc8a6e010 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -37,6 +37,14 @@ def tar_wav_path(shared_datadir, tmp_path_factory):
     return path
 
 
+def iter_archive(archive_path):
+    with tarfile.open(archive_path) as tar:
+        for tarinfo in tar:
+            file_path = tarinfo.name
+            file_obj = tar.extractfile(tarinfo)
+            yield file_path, file_obj
+
+
 def test_audio_instantiation():
     audio = Audio()
     assert audio.id is None

From 8ff699deb9a727b622cbaedd2150b30501836f26 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 21 Oct 2021 10:39:34 +0200
Subject: [PATCH 03/42] Test dataset with Audio feature for TAR archive

---
 tests/features/test_audio.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index 16cc8a6e010..f196e59763b 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -114,6 +114,36 @@ def test_dataset_with_audio_feature(shared_datadir):
     assert column[0]["sampling_rate"] == 44100
 
 
+@require_sndfile
+def test_dataset_with_audio_feature_tar(tar_wav_path):
+    audio_filename = "test_audio_44100.wav"
+    data = {"audio": []}
+    for file_path, file_obj in iter_archive(tar_wav_path):
+        data["audio"].append({"path": file_path, "bytes": file_obj.read()})
+        break
+    features = Features({"audio": Audio(archived=True)})
+    dset = Dataset.from_dict(data, features=features)
+    item = dset[0]
+    assert item.keys() == {"audio"}
+    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
+    assert item["audio"]["path"] == audio_filename
+    assert item["audio"]["array"].shape == (202311,)
+    assert item["audio"]["sampling_rate"] == 44100
+    batch = dset[:1]
+    assert batch.keys() == {"audio"}
+    assert len(batch["audio"]) == 1
+    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
+    assert batch["audio"][0]["path"] == audio_filename
+    assert batch["audio"][0]["array"].shape == (202311,)
+    assert batch["audio"][0]["sampling_rate"] == 44100
+    column = dset["audio"]
+    assert len(column) == 1
+    assert column[0].keys() == {"path", "array", "sampling_rate"}
+    assert column[0]["path"] == audio_filename
+    assert column[0]["array"].shape == (202311,)
+    assert column[0]["sampling_rate"] == 44100
+
+
 @require_sndfile
 def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir):
     audio_path = str(shared_datadir / "test_audio_44100.wav")

From 3d20ee583aa59cd08f8e98f7e4d41565eba757cc Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 21 Oct 2021 10:49:20 +0200
Subject: [PATCH 04/42] Add Audio method to decode from bytes instead of path

---
 src/datasets/features/audio.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index c2261b40344..4900127e36b 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -52,6 +52,22 @@ def _decode_example_with_librosa(self, value):
             array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono)
         return array, sampling_rate
 
+    def _decode_example_with_soundfile(self, file):
+        try:
+            import librosa
+            import soundfile as sf
+        except ImportError as err:
+            raise ImportError("To support decoding audio files, please install 'librosa'.") from err
+
+        array, sampling_rate = sf.read(file)
+        array = array.T
+        if self.mono:
+            array = librosa.to_mono(array)
+        if self.sampling_rate and self.sampling_rate != sampling_rate:
+            orig_sr, sampling_rate = sampling_rate, self.sampling_rate  # librosa>=0.10 requires keyword rates
+            array = librosa.resample(array, orig_sr=orig_sr, target_sr=sampling_rate, res_type="kaiser_best")
+        return array, sampling_rate
+
     def _decode_example_with_torchaudio(self, value):
         try:
             import torchaudio

From 105ead7e35dbdf5da8698b1ef17cbf017067c898 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 21 Oct 2021 10:54:10 +0200
Subject: [PATCH 05/42] Add Audio support for bytes besides path

---
 src/datasets/features/audio.py | 37 +++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 4900127e36b..3a19ae660c7 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from dataclasses import dataclass, field
+from io import BytesIO
 from typing import Any, ClassVar, Optional
 
 import pyarrow as pa
@@ -11,11 +12,18 @@ class Audio:
 
     Args:
         sampling_rate (:obj:`int`, optional): Target sampling rate. If `None`, the native sampling rate is used.
-        mono (:obj:`bool`, default ```True``): Whether to convert the audio signal to mono by averaging samples across channels.
+        mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across
+            channels.
+        archived (:obj:`bool`, default ``False``): Whether the source data is archived with sequential access.
+            - If non-archived with sequential access (i.e. random access is allowed), the cache will only store the
+              absolute path to the audio file.
+            - If archived with sequential access, the cache will store the relative path of the audio file to the
+              archive file and the bytes of the audio file.
     """
 
     sampling_rate: Optional[int] = None
     mono: bool = True
+    archived: bool = False
     id: Optional[str] = None
     # Automatically constructed
     dtype: ClassVar[str] = "dict"
@@ -23,24 +31,33 @@ class Audio:
     _type: str = field(default="Audio", init=False, repr=False)
 
     def __call__(self):
-        return pa.string()
+        return pa.string() if not self.archived else pa.struct({"path": pa.string(), "bytes": pa.binary()})
 
     def decode_example(self, value):
         """Decode example audio file into audio data.
 
         Args:
-            value: Audio file path.
+            value: Either absolute audio file path (when ``archived=False``) or a dict with relative audio file path
+                and the bytes of the audio file.
 
         Returns:
             dict
         """
-        # TODO: backard compatibility for users without audio dependencies
-        array, sampling_rate = (
-            self._decode_example_with_torchaudio(value)
-            if value.endswith(".mp3")
-            else self._decode_example_with_librosa(value)
-        )
-        return {"path": value, "array": array, "sampling_rate": sampling_rate}
+        if self.archived:
+            path, file = value["path"], BytesIO(value["bytes"])
+            array, sampling_rate = (
+                self._decode_example_with_torchaudio(file)
+                if path.endswith(".mp3")
+                else self._decode_example_with_soundfile(file)
+            )
+        else:
+            path = value
+            array, sampling_rate = (
+                self._decode_example_with_torchaudio(path)
+                if path.endswith(".mp3")
+                else self._decode_example_with_librosa(path)
+            )
+        return {"path": path, "array": array, "sampling_rate": sampling_rate}
 
     def _decode_example_with_librosa(self, value):
         try:

From a869469267d317ced7557b8a091c18323527c0a0 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 21 Oct 2021 11:07:07 +0200
Subject: [PATCH 06/42] Fix docstring

---
 src/datasets/features/audio.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 3a19ae660c7..64dcaeaa819 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -15,6 +15,7 @@ class Audio:
         mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across
             channels.
         archived (:obj:`bool`, default ``False``): Whether the source data is archived with sequential access.
+
             - If non-archived with sequential access (i.e. random access is allowed), the cache will only store the
               absolute path to the audio file.
             - If archived with sequential access, the cache will store the relative path of the audio file to the

From f0911cd55e62e405661049d7559c144b4164684e Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Fri, 29 Oct 2021 16:31:57 +0200
Subject: [PATCH 07/42] Stream TAR-based Audio datasets

---
 datasets/common_voice/common_voice.py       |  83 +++++++------
 datasets/librispeech_asr/librispeech_asr.py |  73 +++++++-----
 datasets/openslr/openslr.py                 | 124 ++++++++------------
 datasets/vivos/vivos.py                     |  57 +++++----
 4 files changed, 180 insertions(+), 157 deletions(-)

diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py
index 0178da98105..32232058a77 100644
--- a/datasets/common_voice/common_voice.py
+++ b/datasets/common_voice/common_voice.py
@@ -15,8 +15,6 @@
 """ Common Voice Dataset"""
 
 
-import os
-
 import datasets
 from datasets.tasks import AutomaticSpeechRecognition
 
@@ -613,6 +611,7 @@ def __init__(self, name, sub_version, **kwargs):
 
 class CommonVoice(datasets.GeneratorBasedBuilder):
 
+    DEFAULT_WRITER_BATCH_SIZE = 1000
     BUILDER_CONFIGS = [
         CommonVoiceConfig(
             name=lang_id,
@@ -632,7 +631,7 @@ def _info(self):
             {
                 "client_id": datasets.Value("string"),
                 "path": datasets.Value("string"),
-                "audio": datasets.features.Audio(sampling_rate=48_000),
+                "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
                 "sentence": datasets.Value("string"),
                 "up_votes": datasets.Value("int64"),
                 "down_votes": datasets.Value("int64"),
@@ -658,49 +657,54 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        dl_path = dl_manager.download_and_extract(_DATA_URL.format(self.config.name))
-        abs_path_to_data = os.path.join(dl_path, "cv-corpus-6.1-2020-12-11", self.config.name)
-        abs_path_to_clips = os.path.join(abs_path_to_data, "clips")
+        archive = dl_manager.download(_DATA_URL.format(self.config.name))
+        path_to_data = "/".join(["cv-corpus-6.1-2020-12-11", self.config.name])
+        path_to_clips = "/".join([path_to_data, "clips"])
 
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "filepath": os.path.join(abs_path_to_data, "train.tsv"),
-                    "path_to_clips": abs_path_to_clips,
+                    "files": dl_manager.iter_archive(archive),
+                    "filepath": "/".join([path_to_data, "train.tsv"]),
+                    "path_to_clips": path_to_clips,
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 gen_kwargs={
-                    "filepath": os.path.join(abs_path_to_data, "test.tsv"),
-                    "path_to_clips": abs_path_to_clips,
+                    "files": dl_manager.iter_archive(archive),
+                    "filepath": "/".join([path_to_data, "test.tsv"]),
+                    "path_to_clips": path_to_clips,
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
                 gen_kwargs={
-                    "filepath": os.path.join(abs_path_to_data, "dev.tsv"),
-                    "path_to_clips": abs_path_to_clips,
+                    "files": dl_manager.iter_archive(archive),
+                    "filepath": "/".join([path_to_data, "dev.tsv"]),
+                    "path_to_clips": path_to_clips,
                 },
             ),
             datasets.SplitGenerator(
                 name="other",
                 gen_kwargs={
-                    "filepath": os.path.join(abs_path_to_data, "other.tsv"),
-                    "path_to_clips": abs_path_to_clips,
+                    "files": dl_manager.iter_archive(archive),
+                    "filepath": "/".join([path_to_data, "other.tsv"]),
+                    "path_to_clips": path_to_clips,
                 },
             ),
             datasets.SplitGenerator(
                 name="invalidated",
                 gen_kwargs={
-                    "filepath": os.path.join(abs_path_to_data, "invalidated.tsv"),
-                    "path_to_clips": abs_path_to_clips,
+                    "files": dl_manager.iter_archive(archive),
+                    "filepath": "/".join([path_to_data, "invalidated.tsv"]),
+                    "path_to_clips": path_to_clips,
                 },
             ),
         ]
 
-    def _generate_examples(self, filepath, path_to_clips):
+    def _generate_examples(self, files, filepath, path_to_clips):
         """Yields examples."""
         data_fields = list(self._info().features.keys())
 
@@ -708,28 +712,33 @@ def _generate_examples(self, filepath, path_to_clips):
         data_fields.remove("audio")
         path_idx = data_fields.index("path")
 
-        with open(filepath, encoding="utf-8") as f:
-            lines = f.readlines()
-            headline = lines[0]
-
-            column_names = headline.strip().split("\t")
-            assert (
-                column_names == data_fields
-            ), f"The file should have {data_fields} as column names, but has {column_names}"
-
-            for id_, line in enumerate(lines[1:]):
-                field_values = line.strip().split("\t")
+        all_field_values = {}
+        for path, f in files:
+            if path == filepath:
+                lines = f.readlines()
+                headline = lines[0].decode("utf-8")
 
-                # set absolute path for mp3 audio file
-                field_values[path_idx] = os.path.join(path_to_clips, field_values[path_idx])
+                column_names = headline.strip().split("\t")
+                assert (
+                    column_names == data_fields
+                ), f"The file should have {data_fields} as column names, but has {column_names}"
+                for id_, line in enumerate(lines[1:]):
+                    field_values = line.decode("utf-8").strip().split("\t")
+                    # set full path for mp3 audio file
+                    audio_path = "/".join([path_to_clips, field_values[path_idx]])
+                    all_field_values[audio_path] = field_values
+            elif path.startswith(path_to_clips):
+                assert all_field_values, "Found audio clips before the metadata TSV file."
+                if path in all_field_values:
+                    field_values = all_field_values[path]
 
-                # if data is incomplete, fill with empty values
-                if len(field_values) < len(data_fields):
-                    field_values += (len(data_fields) - len(field_values)) * ["''"]
+                    # if data is incomplete, fill with empty values
+                    if len(field_values) < len(data_fields):
+                        field_values += (len(data_fields) - len(field_values)) * ["''"]
 
-                result = {key: value for key, value in zip(data_fields, field_values)}
+                    result = {key: value for key, value in zip(data_fields, field_values)}
 
-                # set audio feature
-                result["audio"] = field_values[path_idx]
+                    # set audio feature
+                    result["audio"] = {"path": path, "data": f.read()}
 
-                yield id_, result
+                    yield path, result  # `id_` is the stale last TSV index here; the clip path is unique
diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py
index acdec76ddf5..794b8d426b9 100644
--- a/datasets/librispeech_asr/librispeech_asr.py
+++ b/datasets/librispeech_asr/librispeech_asr.py
@@ -17,9 +17,6 @@
 """Librispeech automatic speech recognition dataset."""
 
 
-import glob
-import os
-
 import datasets
 from datasets.tasks import AutomaticSpeechRecognition
 
@@ -93,6 +90,7 @@ def __init__(self, **kwargs):
 class LibrispeechASR(datasets.GeneratorBasedBuilder):
     """Librispeech dataset."""
 
+    DEFAULT_WRITER_BATCH_SIZE = 256
     BUILDER_CONFIGS = [
         LibrispeechASRConfig(name="clean", description="'Clean' speech."),
         LibrispeechASRConfig(name="other", description="'Other', more challenging, speech."),
@@ -104,7 +102,7 @@ def _info(self):
             features=datasets.Features(
                 {
                     "file": datasets.Value("string"),
-                    "audio": datasets.features.Audio(sampling_rate=16_000),
+                    "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
                     "text": datasets.Value("string"),
                     "speaker_id": datasets.Value("int64"),
                     "chapter_id": datasets.Value("int64"),
@@ -118,41 +116,62 @@ def _info(self):
         )
 
     def _split_generators(self, dl_manager):
-        archive_path = dl_manager.download_and_extract(_DL_URLS[self.config.name])
+        archive_path = dl_manager.download(_DL_URLS[self.config.name])
 
         if self.config.name == "clean":
             train_splits = [
-                datasets.SplitGenerator(name="train.100", gen_kwargs={"archive_path": archive_path["train.100"]}),
-                datasets.SplitGenerator(name="train.360", gen_kwargs={"archive_path": archive_path["train.360"]}),
+                datasets.SplitGenerator(
+                    name="train.100", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.100"])}
+                ),
+                datasets.SplitGenerator(
+                    name="train.360", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.360"])}
+                ),
             ]
         elif self.config.name == "other":
             train_splits = [
-                datasets.SplitGenerator(name="train.500", gen_kwargs={"archive_path": archive_path["train.500"]}),
+                datasets.SplitGenerator(
+                    name="train.500", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.500"])}
+                ),
             ]
 
         return train_splits + [
-            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"archive_path": archive_path["dev"]}),
-            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"]}),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION, gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])}
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])}
+            ),
         ]
 
-    def _generate_examples(self, archive_path):
+    def _generate_examples(self, files):
         """Generate examples from a LibriSpeech archive_path."""
-        transcripts_glob = os.path.join(archive_path, "LibriSpeech", "*/*/*/*.txt")
         key = 0
-        for transcript_path in sorted(glob.glob(transcripts_glob)):
-            transcript_dir_path = os.path.dirname(transcript_path)
-            with open(transcript_path, "r", encoding="utf-8") as f:
+        audio_data = {}
+        transcripts = []
+        for path, f in files:
+            if path.endswith(".flac"):
+                id_ = path.split("/")[-1][: -len(".flac")]
+                audio_data[id_] = f.read()
+            elif path.endswith(".trans.txt"):
                 for line in f:
-                    line = line.strip()
-                    id_, transcript = line.split(" ", 1)
-                    audio_file = f"{id_}.flac"
-                    speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
-                    yield key, {
-                        "id": id_,
-                        "speaker_id": speaker_id,
-                        "chapter_id": chapter_id,
-                        "file": os.path.join(transcript_dir_path, audio_file),
-                        "audio": os.path.join(transcript_dir_path, audio_file),
-                        "text": transcript,
-                    }
+                    if line:
+                        line = line.decode("utf-8").strip()
+                        id_, transcript = line.split(" ", 1)
+                        audio_file = f"{id_}.flac"
+                        speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
+                        transcripts.append(
+                            {
+                                "id": id_,
+                                "speaker_id": speaker_id,
+                                "chapter_id": chapter_id,
+                                "file": audio_file,
+                                "text": transcript,
+                            }
+                        )
+            if audio_data and len(audio_data) == len(transcripts):
+                for transcript in transcripts:
+                    audio = {"path": transcript["file"], "data": audio_data[transcript["id"]]}
+                    yield key, {"audio": audio, **transcript}
                     key += 1
+                audio_data = {}
+                transcripts = []
diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py
index 72a08deccf2..9acb8d90d09 100644
--- a/datasets/openslr/openslr.py
+++ b/datasets/openslr/openslr.py
@@ -112,20 +112,6 @@
     ISBN = {979-10-95546-34-4},
 }
 
-SLR83
-@inproceedings{demirsahin-etal-2020-open,
-    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},
-    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},
-    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},
-    month = may,
-    year = {2020},
-    pages = {6532--6541},
-    address = {Marseille, France},
-    publisher = {European Language Resources Association (ELRA)},
-    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},
-    ISBN = {979-10-95546-34-4},
-}
-
 SLR80
 @inproceedings{oo-etal-2020-burmese,
     title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application
@@ -176,10 +162,10 @@
         "Setswana and isiXhosa.",
         "Files": ["af_za.tar.gz", "st_za.tar.gz", "tn_za.tar.gz", "xh_za.tar.gz"],
         "IndexFiles": [
-            "af_za/za/afr/line_index.tsv",
-            "st_za/za/sso/line_index.tsv",
-            "tn_za/za/tsn/line_index.tsv",
-            "xh_za/za/xho/line_index.tsv",
+            "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/af_za/line_index.tsv",
+            "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/st_za/line_index.tsv",
+            "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/tn_za/line_index.tsv",
+            "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/xh_za/line_index.tsv",
         ],
         "DataDirs": ["af_za/za/afr/wavs", "st_za/za/sso/wavs", "tn_za/za/tsn/wavs", "xh_za/za/xho/wavs"],
     },
@@ -493,39 +479,6 @@
         "IndexFiles": ["line_index.tsv"],
         "DataDirs": [""],
     },
-    "SLR83": {
-        "Language": "English",
-        "LongName": "Crowdsourced high-quality UK and Ireland English Dialect speech data set",
-        "Category": "Speech",
-        "Summary": "Data set which contains male and female recordings of English from various dialects of the UK and Ireland",
-        "Files": [
-            "irish_english_male.zip",
-            "midlands_english_female.zip",
-            "midlands_english_male.zip",
-            "northern_english_female.zip",
-            "northern_english_male.zip",
-            "scottish_english_female.zip",
-            "scottish_english_male.zip",
-            "southern_english_female.zip",
-            "southern_english_male.zip",
-            "welsh_english_female.zip",
-            "welsh_english_male.zip",
-        ],
-        "IndexFiles": [
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-            "line_index.csv",
-        ],
-        "DataDirs": ["", "", "", "", "", "", "", "", "", "", ""],
-    },
     "SLR86": {
         "Language": "Yoruba",
         "LongName": "Crowdsourced high-quality Yoruba speech data set",
@@ -565,6 +518,7 @@ def __init__(self, name, **kwargs):
 
 
 class OpenSlr(datasets.GeneratorBasedBuilder):
+    DEFAULT_WRITER_BATCH_SIZE = 32
 
     BUILDER_CONFIGS = [
         OpenSlrConfig(
@@ -581,13 +535,22 @@ class OpenSlr(datasets.GeneratorBasedBuilder):
     ]
 
     def _info(self):
-        features = datasets.Features(
-            {
-                "path": datasets.Value("string"),
-                "audio": datasets.features.Audio(sampling_rate=48_000),
-                "sentence": datasets.Value("string"),
-            }
-        )
+        if self.config.name in ["SLR32"]:
+            features = datasets.Features(
+                {
+                    "path": datasets.Value("string"),
+                    "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
+                    "sentence": datasets.Value("string"),
+                }
+            )
+        else:
+            features = datasets.Features(
+                {
+                    "path": datasets.Value("string"),
+                    "audio": datasets.features.Audio(sampling_rate=48_000),
+                    "sentence": datasets.Value("string"),
+                }
+            )
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
@@ -605,21 +568,28 @@ def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
         resource_number = self.config.name.replace("SLR", "")
         urls = [f"{_DATA_URL.format(resource_number)}/{file}" for file in self.config.files]
-        dl_paths = dl_manager.download_and_extract(urls)
-        abs_path_to_indexs = [os.path.join(path, f"{self.config.index_files[i]}") for i, path in enumerate(dl_paths)]
-        abs_path_to_datas = [os.path.join(path, f"{self.config.data_dirs[i]}") for i, path in enumerate(dl_paths)]
+        if urls[0].endswith(".zip"):
+            dl_paths = dl_manager.download_and_extract(urls)
+            path_to_indexs = [os.path.join(path, f"{self.config.index_files[i]}") for i, path in enumerate(dl_paths)]
+            path_to_datas = [os.path.join(path, f"{self.config.data_dirs[i]}") for i, path in enumerate(dl_paths)]
+            archives = None
+        else:
+            archives = dl_manager.download(urls)
+            path_to_indexs = dl_manager.download(self.config.index_files)
+            path_to_datas = self.config.data_dirs
 
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 gen_kwargs={
-                    "path_to_indexs": abs_path_to_indexs,
-                    "path_to_datas": abs_path_to_datas,
+                    "path_to_indexs": path_to_indexs,
+                    "path_to_datas": path_to_datas,
+                    "archive_files": [dl_manager.iter_archive(archive) for archive in archives] if archives else None,
                 },
             ),
         ]
 
-    def _generate_examples(self, path_to_indexs, path_to_datas):
+    def _generate_examples(self, path_to_indexs, path_to_datas, archive_files):
         """Yields examples."""
 
         counter = -1
@@ -640,16 +610,26 @@ def _generate_examples(self, path_to_indexs, path_to_datas):
                     sentence = sentence_index[filename]
                     counter += 1
                     yield counter, {"path": path, "audio": path, "sentence": sentence}
-        elif self.config.name in ["SLR83"]:
-            for i, path_to_index in enumerate(path_to_indexs):
+        elif self.config.name in ["SLR32"]:  # use archives
+            for path_to_index, path_to_data, files in zip(path_to_indexs, path_to_datas, archive_files):
+                sentences = {}
                 with open(path_to_index, encoding="utf-8") as f:
-                    lines = f.readlines()
-                    for id_, line in enumerate(lines):
-                        field_values = re.split(r",\s?", line.strip())
-                        user_id, filename, sentence = field_values
-                        path = os.path.join(path_to_datas[i], f"{filename}.wav")
+                    for line in f:
+                        # Following regexs are needed to normalise the lines, since the datasets
+                        # are not always consistent and have bugs:
+                        line = re.sub(r"\t[^\t]*\t", "\t", line.strip())
+                        field_values = re.split(r"\t\t?", line)
+                        if len(field_values) != 2:
+                            continue
+                        filename, sentence = field_values
+                        # set path of the audio file as it appears inside the archive
+                        path = f"{path_to_data}/{filename}.wav"
+                        sentences[path] = sentence
+                for path, f in files:
+                    if path in sentences:  # skip archive members that have no index entry
                         counter += 1
-                        yield counter, {"path": path, "audio": path, "sentence": sentence}
+                        audio = {"path": path, "data": f.read()}
+                        yield counter, {"path": path, "audio": audio, "sentence": sentences[path]}
         else:
             for i, path_to_index in enumerate(path_to_indexs):
                 with open(path_to_index, encoding="utf-8") as f:
diff --git a/datasets/vivos/vivos.py b/datasets/vivos/vivos.py
index 4dda623b705..c559ce9c4ae 100644
--- a/datasets/vivos/vivos.py
+++ b/datasets/vivos/vivos.py
@@ -12,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 
 import datasets
 
@@ -40,6 +39,11 @@
 
 _DATA_URL = "https://ailab.hcmus.edu.vn/assets/vivos.tar.gz"
 
+_PROMPTS_URLS = {
+    "train": "https://s3.amazonaws.com/datasets.huggingface.co/vivos/train/prompts.txt",
+    "test": "https://s3.amazonaws.com/datasets.huggingface.co/vivos/test/prompts.txt",
+}
+
 
 class VivosDataset(datasets.GeneratorBasedBuilder):
     """VIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for
@@ -63,7 +67,7 @@ def _info(self):
                 {
                     "speaker_id": datasets.Value("string"),
                     "path": datasets.Value("string"),
-                    "audio": datasets.features.Audio(sampling_rate=16_000),
+                    "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
                     "sentence": datasets.Value("string"),
                 }
             ),
@@ -80,46 +84,57 @@ def _split_generators(self, dl_manager):
         # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
         # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
         # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
-        dl_path = dl_manager.download_and_extract(_DATA_URL)
-        data_dir = os.path.join(dl_path, "vivos")
-        train_dir = os.path.join(data_dir, "train")
-        test_dir = os.path.join(data_dir, "test")
+        prompts_paths = dl_manager.download(_PROMPTS_URLS)
+        archive = dl_manager.download(_DATA_URL)
+        train_dir = "vivos/train"
+        test_dir = "vivos/test"
 
         return [
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
-                    "filepath": os.path.join(train_dir, "prompts.txt"),
-                    "path_to_clips": os.path.join(train_dir, "waves"),
+                    "prompts_path": prompts_paths["train"],
+                    "path_to_clips": train_dir + "/waves",
+                    "audio_files": dl_manager.iter_archive(archive),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
-                    "filepath": os.path.join(test_dir, "prompts.txt"),
-                    "path_to_clips": os.path.join(test_dir, "waves"),
+                    "prompts_path": prompts_paths["test"],
+                    "path_to_clips": test_dir + "/waves",
+                    "audio_files": dl_manager.iter_archive(archive),
                 },
             ),
         ]
 
-    def _generate_examples(
-        self,
-        filepath,
-        path_to_clips,  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-    ):
+    def _generate_examples(self, prompts_path, path_to_clips, audio_files):
         """Yields examples as (key, example) tuples."""
+        # TODO(QL): use Audio featrue with data bytes instead of string path
+        raise Exception("TODO(QL): use Audio featrue with data bytes instead of string path")
         # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
         # The `key` is here for legacy reason (tfds) and is not important in itself.
-
-        with open(filepath, encoding="utf-8") as f:
-            for id_, row in enumerate(f):
+        examples = {}
+        with open(prompts_path, encoding="utf-8") as f:
+            for row in f:
                 data = row.strip().split(" ", 1)
                 speaker_id = data[0].split("_")[0]
-                yield id_, {
+                audio_path = "/".join([path_to_clips, speaker_id, data[0] + ".wav"])
+                examples[audio_path] = {
                     "speaker_id": speaker_id,
-                    "path": os.path.join(path_to_clips, speaker_id, data[0] + ".wav"),
-                    "audio": os.path.join(path_to_clips, speaker_id, data[0] + ".wav"),
+                    "path": audio_path,
                     "sentence": data[1],
                 }
+        inside_clips_dir = False
+        id_ = 0
+        for path, f in audio_files:
+            if path.startswith(path_to_clips):
+                inside_clips_dir = True
+                if path in examples:
+                    audio = {"path": path, "data": f.read()}
+                    yield id_, {**examples[path], "audio": audio}
+                    id_ += 1
+            elif inside_clips_dir:
+                break

From f224b68275e2baaa9601d37f2bb17f527878ee35 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 8 Nov 2021 16:09:59 +0100
Subject: [PATCH 08/42] Remove archived attribute from test audio with TAR
 archive

---
 tests/features/test_audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index f196e59763b..b11884baf98 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -121,7 +121,7 @@ def test_dataset_with_audio_feature_tar(tar_wav_path):
     for file_path, file_obj in iter_archive(tar_wav_path):
         data["audio"].append({"path": file_path, "bytes": file_obj.read()})
         break
-    features = Features({"audio": Audio(archived=True)})
+    features = Features({"audio": Audio()})
     dset = Dataset.from_dict(data, features=features)
     item = dset[0]
     assert item.keys() == {"audio"}

From ebb1a1ca67b7b6304c1f67fec40373f28d277315 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 8 Nov 2021 19:35:28 +0100
Subject: [PATCH 09/42] Remove archived attribute from Audio feature

---
 src/datasets/features/audio.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 64dcaeaa819..7fd45ad487b 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -10,21 +10,21 @@
 class Audio:
     """Audio Feature to extract audio data from an audio file.
 
+    Input: The Audio feature accepts as input:
+    - A :obj:`str`: Absolute path to the audio file (i.e. random access is allowed).
+    - A :obj:`dict` with the keys:
+        - path: String with relative path of the audio file to the archive file.
+        - bytes: Bytes of the audio file.
+      This is useful for archived files with sequential access.
+
     Args:
         sampling_rate (:obj:`int`, optional): Target sampling rate. If `None`, the native sampling rate is used.
         mono (:obj:`bool`, default ``True``): Whether to convert the audio signal to mono by averaging samples across
             channels.
-        archived (:obj:`bool`, default ``False``): Whether the source data is archived with sequential access.
-
-            - If non-archived with sequential access (i.e. random access is allowed), the cache will only store the
-              absolute path to the audio file.
-            - If archived with sequential access, the cache will store the relative path of the audio file to the
-              archive file and the bytes of the audio file.
     """
 
     sampling_rate: Optional[int] = None
     mono: bool = True
-    archived: bool = False
     id: Optional[str] = None
     # Automatically constructed
     dtype: ClassVar[str] = "dict"
@@ -32,19 +32,20 @@ class Audio:
     _type: str = field(default="Audio", init=False, repr=False)
 
     def __call__(self):
-        return pa.string() if not self.archived else pa.struct({"path": pa.string(), "bytes": pa.binary()})
+        return pa.struct({"path": pa.string(), "bytes": pa.binary()})
 
     def decode_example(self, value):
         """Decode example audio file into audio data.
 
         Args:
-            value: Either absolute audio file path (when ``archived=False``) or a dict with relative audio file path
-                and the bytes of the audio file.
+            value (:obj:`dict`): Dictionary with keys:
+                - path: String with absolute or relative audio file path.
+                - bytes: Optionally, the bytes of the audio file.
 
         Returns:
             dict
         """
-        if self.archived:
+        if value["bytes"]:
             path, file = value["path"], BytesIO(value["bytes"])
             array, sampling_rate = (
                 self._decode_example_with_torchaudio(file)
@@ -52,7 +53,7 @@ def decode_example(self, value):
                 else self._decode_example_with_soundfile(file)
             )
         else:
-            path = value
+            path = value["path"]
             array, sampling_rate = (
                 self._decode_example_with_torchaudio(path)
                 if path.endswith(".mp3")

From 1cc27a0477079dce9bac23fc0ab915500c76643b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 8 Nov 2021 19:40:30 +0100
Subject: [PATCH 10/42] Implement Audio.encode_example

---
 src/datasets/features/audio.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 7fd45ad487b..a79bb099f70 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -34,6 +34,19 @@ class Audio:
     def __call__(self):
         return pa.struct({"path": pa.string(), "bytes": pa.binary()})
 
+    def encode_example(self, value):
+        """Encode example into a format for Arrow.
+
+        Args:
+            value (:obj:`str` or :obj:`dict`): Data passed as input to Audio feature.
+
+        Returns:
+            :obj:`dict`
+        """
+        if isinstance(value, str):
+            return {"path": value}
+        return value
+
     def decode_example(self, value):
         """Decode example audio file into audio data.
 

From 4579b76516b2f28856ac819b100e2b8984d5e491 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 8 Nov 2021 19:41:15 +0100
Subject: [PATCH 11/42] Call Audio.encode_example from encode_nested_example

---
 src/datasets/features/features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index 3d33f558b9b..89c913bd44a 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -851,7 +851,7 @@ def encode_nested_example(schema, obj):
             return list(obj)
     # Object with special encoding:
     # ClassLabel will convert from string to int, TranslationVariableLanguages does some checks
-    elif isinstance(schema, (ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)):
+    elif isinstance(schema, (Audio, ClassLabel, TranslationVariableLanguages, Value, _ArrayXD)):
         return schema.encode_example(obj)
     # Other object should be directly convertible to a native Arrow type (like Translation and Translation)
     return obj

From 0d2a3d84c4aa576ebc7ed1a5bd510ca813ead33a Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 8 Nov 2021 20:17:29 +0100
Subject: [PATCH 12/42] Fix docs

---
 src/datasets/features/audio.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index a79bb099f70..3081db10237 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -13,6 +13,7 @@ class Audio:
     Input: The Audio feature accepts as input:
     - A :obj:`str`: Absolute path to the audio file (i.e. random access is allowed).
     - A :obj:`dict` with the keys:
+
         - path: String with relative path of the audio file to the archive file.
         - bytes: Bytes of the audio file.
       This is useful for archived files with sequential access.
@@ -52,6 +53,7 @@ def decode_example(self, value):
 
         Args:
             value (:obj:`dict`): Dictionary with keys:
+
                 - path: String with absolute or relative audio file path.
                 - bytes: Optionally, the bytes of the audio file.
 

From 3d35adac5162d0b3679bfa317023fb27c3f41897 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 8 Nov 2021 20:23:47 +0100
Subject: [PATCH 13/42] Enhance Audio.decode_example to accept a string

---
 src/datasets/features/audio.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 3081db10237..1bac3f33ac1 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -52,7 +52,8 @@ def decode_example(self, value):
         """Decode example audio file into audio data.
 
         Args:
-            value (:obj:`dict`): Dictionary with keys:
+            value (:obj:`str` or :obj:`dict`): Either a string with the absolute audio file path or a dictionary with
+                keys:
 
                 - path: String with absolute or relative audio file path.
                 - bytes: Optionally, the bytes of the audio file.
@@ -60,6 +61,8 @@ def decode_example(self, value):
         Returns:
             dict
         """
+        if isinstance(value, str):
+            value = {"path": value, "bytes": None}
         if value["bytes"]:
             path, file = value["path"], BytesIO(value["bytes"])
             array, sampling_rate = (

From ec5f7b0d9d6bd94464953f874598eacd7866b6b1 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 9 Nov 2021 18:04:50 +0100
Subject: [PATCH 14/42] Fix docs

---
 src/datasets/features/audio.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 1bac3f33ac1..10df37e03ea 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -16,6 +16,7 @@ class Audio:
 
         - path: String with relative path of the audio file to the archive file.
         - bytes: Bytes of the audio file.
+
       This is useful for archived files with sequential access.
 
     Args:

From 21488c01b3273ab5cf03250ac3d67805337657e0 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 11:27:43 +0100
Subject: [PATCH 15/42] Implement private Audio._storage_dtype to specify
 cached dtype

---
 src/datasets/features/audio.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 10df37e03ea..fa0a3bb8529 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -27,6 +27,7 @@ class Audio:
 
     sampling_rate: Optional[int] = None
     mono: bool = True
+    _storage_dtype: str = "struct"
     id: Optional[str] = None
     # Automatically constructed
     dtype: ClassVar[str] = "dict"
@@ -34,7 +35,9 @@ class Audio:
     _type: str = field(default="Audio", init=False, repr=False)
 
     def __call__(self):
-        return pa.struct({"path": pa.string(), "bytes": pa.binary()})
+        return (
+            pa.struct({"path": pa.string(), "bytes": pa.binary()}) if self._storage_dtype == "struct" else pa.string()
+        )
 
     def encode_example(self, value):
         """Encode example into a format for Arrow.

From 83f04cdc69763a9ebce6fdd4628e5926d9a75d31 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 11:28:56 +0100
Subject: [PATCH 16/42] Change Audio._storage_dtype dynamically when encoding a
 string

---
 src/datasets/features/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index fa0a3bb8529..01302e80db3 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -49,7 +49,7 @@ def encode_example(self, value):
             :obj:`dict`
         """
         if isinstance(value, str):
-            return {"path": value}
+            self._storage_dtype = "string"
         return value
 
     def decode_example(self, value):

From 7a3f066a89daa75430f4661671c5d63cf34dd321 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 11:29:42 +0100
Subject: [PATCH 17/42] Update test of Audio instantiation

---
 tests/features/test_audio.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index b11884baf98..c4139e7d7e5 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -4,6 +4,7 @@
 from ctypes.util import find_library
 from importlib.util import find_spec
 
+import pyarrow as pa
 import pytest
 
 from datasets import Dataset, load_dataset
@@ -47,10 +48,13 @@ def iter_archive(archive_path):
 
 def test_audio_instantiation():
     audio = Audio()
+    assert audio.sampling_rate is None
+    assert audio.mono is True
     assert audio.id is None
     assert audio.dtype == "dict"
     assert audio.pa_type is None
     assert audio._type == "Audio"
+    assert audio._storage_dtype == "struct"
 
 
 @require_sndfile

From ece5b97d089e5868aa1affa38b9c471725bcda0f Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 11:37:06 +0100
Subject: [PATCH 18/42] Set ArrowWriter.schema property dynamically calculated
 from features

---
 src/datasets/arrow_writer.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 79d1926cc97..2870cd6f19b 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -201,7 +201,7 @@ def __init__(
             raise ValueError("At least one of path and stream must be provided.")
         if features is not None:
             self._features = features
-            self._schema = pa.schema(features.type)
+            self._schema = None
         elif schema is not None:
             self._schema: pa.Schema = schema
             self._features = Features.from_arrow_schema(self._schema)
@@ -216,9 +216,7 @@ def __init__(
             self._hasher = KeyHasher("")
 
         self._check_duplicates = check_duplicates
-
-        if disable_nullable and self._schema is not None:
-            self._schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in self._schema)
+        self._disable_nullable = disable_nullable
 
         self._path = path
         if stream is None:
@@ -287,7 +285,14 @@ def _build_writer(self, inferred_schema: pa.Schema):
 
     @property
     def schema(self):
-        return self._schema if self._schema is not None else []
+        _schema = (
+            self._schema
+            if self._schema is not None
+            else (pa.schema(self._features.type) if self._features is not None else None)
+        )
+        if self._disable_nullable and _schema is not None:
+            _schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in _schema)
+        return _schema if _schema is not None else []
 
     @staticmethod
     def _build_metadata(info: DatasetInfo, fingerprint: Optional[str] = None) -> Dict[str, str]:

From 38c80cc21374f75478050a3631eb4898269d643a Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 11:40:19 +0100
Subject: [PATCH 19/42] Update ArrowWriter.write_examples_on_file

---
 src/datasets/arrow_writer.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 2870cd6f19b..a5d6d479ca9 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -311,14 +311,14 @@ def write_examples_on_file(self):
 
         # Since current_examples contains (example, key) tuples
         cols = (
-            [col for col in self._schema.names if col in self.current_examples[0][0]]
-            + [col for col in self.current_examples[0][0].keys() if col not in self._schema.names]
-            if self._schema
+            [col for col in self.schema.names if col in self.current_examples[0][0]]
+            + [col for col in self.current_examples[0][0].keys() if col not in self.schema.names]
+            if self.schema
             else self.current_examples[0][0].keys()
         )
 
-        schema = None if self.pa_writer is None and self.update_features else self._schema
-        try_schema = self._schema if self.pa_writer is None and self.update_features else None
+        schema = None if self.pa_writer is None and self.update_features else self.schema
+        try_schema = self.schema if self.pa_writer is None and self.update_features else None
         arrays = []
         inferred_types = []
         for col in cols:
@@ -340,7 +340,7 @@ def write_examples_on_file(self):
                     )
             arrays.append(pa_array)
             inferred_types.append(inferred_type)
-        schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self._schema
+        schema = pa.schema(zip(cols, inferred_types)) if self.pa_writer is None else self.schema
         table = pa.Table.from_arrays(arrays, schema=schema)
         self.write_table(table)
         self.current_examples = []

From 7787985492ae8d2657793ddaabf003d75672ddae Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 11:39:24 +0100
Subject: [PATCH 20/42] Update ArrowWriter._build_writer

---
 src/datasets/arrow_writer.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index a5d6d479ca9..4014782e28b 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -261,6 +261,7 @@ def close(self):
             self.stream.close()  # This also closes self.pa_writer if it is opened
 
     def _build_writer(self, inferred_schema: pa.Schema):
+        schema = self.schema
         inferred_features = Features.from_arrow_schema(inferred_schema)
         if self._features is not None:
             if self.update_features:  # keep original features it they match, or update them
@@ -271,17 +272,16 @@ def _build_writer(self, inferred_schema: pa.Schema):
                         if inferred_field == fields[name]:
                             inferred_features[name] = self._features[name]
                 self._features = inferred_features
-                self._schema: pa.Schema = inferred_schema
+                schema: pa.Schema = inferred_schema
         else:
             self._features = inferred_features
-            self._schema: pa.Schema = inferred_schema
+            schema: pa.Schema = inferred_schema
         if self.disable_nullable:
-            self._schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in self._schema)
+            schema = pa.schema(pa.field(field.name, field.type, nullable=False) for field in schema)
         if self.with_metadata:
-            self._schema = self._schema.with_metadata(
-                self._build_metadata(DatasetInfo(features=self._features), self.fingerprint)
-            )
-        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
+            schema = schema.with_metadata(self._build_metadata(DatasetInfo(features=self._features), self.fingerprint))
+        self._schema = schema
+        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, schema)
 
     @property
     def schema(self):

From 090723e380be2becf61b5531aba811fba68c2923 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 11:45:34 +0100
Subject: [PATCH 21/42] Fix code quality

---
 tests/features/test_audio.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index c4139e7d7e5..8bc08537731 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -4,7 +4,6 @@
 from ctypes.util import find_library
 from importlib.util import find_spec
 
-import pyarrow as pa
 import pytest
 
 from datasets import Dataset, load_dataset

From 7f587775bada438140b741ec4d9909e835077a61 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 14:50:40 +0100
Subject: [PATCH 22/42] Replace _schema with schema and condition on schema in
 ArrowWriter

---
 src/datasets/arrow_writer.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 4014782e28b..e042b26a757 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -322,7 +322,7 @@ def write_examples_on_file(self):
         arrays = []
         inferred_types = []
         for col in cols:
-            col_type = schema.field(col).type if schema is not None else None
+            col_type = schema.field(col).type if schema else None
             col_try_type = try_schema.field(col).type if try_schema is not None and col in try_schema.names else None
             typed_sequence = OptimizedTypedSequence(
                 [row[0][col] for row in self.current_examples], type=col_type, try_type=col_try_type, col=col
@@ -421,11 +421,11 @@ def write_batch(
         """
         if batch_examples and len(next(iter(batch_examples.values()))) == 0:
             return
-        schema = None if self.pa_writer is None and self.update_features else self._schema
-        try_schema = self._schema if self.pa_writer is None and self.update_features else None
+        schema = None if self.pa_writer is None and self.update_features else self.schema
+        try_schema = self.schema if self.pa_writer is None and self.update_features else None
         typed_sequence_examples = {}
         for col in sorted(batch_examples.keys()):
-            col_type = schema.field(col).type if schema is not None else None
+            col_type = schema.field(col).type if schema else None
             col_try_type = try_schema.field(col).type if try_schema is not None and col in try_schema.names else None
             typed_sequence = OptimizedTypedSequence(batch_examples[col], type=col_type, try_type=col_try_type, col=col)
             typed_sequence_examples[col] = typed_sequence
@@ -460,8 +460,8 @@ def finalize(self, close_stream=True):
             self.hkey_record = []
         self.write_examples_on_file()
         if self.pa_writer is None:
-            if self._schema is not None:
-                self._build_writer(self._schema)
+            if self.schema:
+                self._build_writer(self.schema)
             else:
                 raise ValueError("Please pass `features` or at least one example when writing data")
         self.pa_writer.close()

From 583be77b7b645698d078834e3d7da9869221c347 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 15:46:47 +0100
Subject: [PATCH 23/42] Add test for MP3 TAR audio file

---
 tests/features/test_audio.py | 42 +++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index 8bc08537731..1dbaae13e9a 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -37,6 +37,15 @@ def tar_wav_path(shared_datadir, tmp_path_factory):
     return path
 
 
+@pytest.fixture()
+def tar_mp3_path(shared_datadir, tmp_path_factory):
+    audio_path = str(shared_datadir / "test_audio_44100.mp3")
+    path = tmp_path_factory.mktemp("data") / "audio_data.mp3.tar"
+    with tarfile.TarFile(path, "w") as f:
+        f.add(audio_path, arcname=os.path.basename(audio_path))
+    return path
+
+
 def iter_archive(archive_path):
     with tarfile.open(archive_path) as tar:
         for tarinfo in tar:
@@ -118,7 +127,7 @@ def test_dataset_with_audio_feature(shared_datadir):
 
 
 @require_sndfile
-def test_dataset_with_audio_feature_tar(tar_wav_path):
+def test_dataset_with_audio_feature_tar_wav(tar_wav_path):
     audio_filename = "test_audio_44100.wav"
     data = {"audio": []}
     for file_path, file_obj in iter_archive(tar_wav_path):
@@ -147,6 +156,37 @@ def test_dataset_with_audio_feature_tar(tar_wav_path):
     assert column[0]["sampling_rate"] == 44100
 
 
+@require_sox
+@require_torchaudio
+def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
+    audio_filename = "test_audio_44100.mp3"  # must match the arcname stored by the tar_mp3_path fixture
+    data = {"audio": []}
+    for file_path, file_obj in iter_archive(tar_mp3_path):
+        data["audio"].append({"path": file_path, "bytes": file_obj.read()})
+        break
+    features = Features({"audio": Audio()})
+    dset = Dataset.from_dict(data, features=features)
+    item = dset[0]
+    assert item.keys() == {"audio"}
+    assert item["audio"].keys() == {"path", "array", "sampling_rate"}
+    assert item["audio"]["path"] == audio_filename
+    assert item["audio"]["array"].shape == (109440,)
+    assert item["audio"]["sampling_rate"] == 44100
+    batch = dset[:1]
+    assert batch.keys() == {"audio"}
+    assert len(batch["audio"]) == 1
+    assert batch["audio"][0].keys() == {"path", "array", "sampling_rate"}
+    assert batch["audio"][0]["path"] == audio_filename
+    assert batch["audio"][0]["array"].shape == (109440,)
+    assert batch["audio"][0]["sampling_rate"] == 44100
+    column = dset["audio"]
+    assert len(column) == 1
+    assert column[0].keys() == {"path", "array", "sampling_rate"}
+    assert column[0]["path"] == audio_filename
+    assert column[0]["array"].shape == (109440,)
+    assert column[0]["sampling_rate"] == 44100
+
+
 @require_sndfile
 def test_resampling_at_loading_dataset_with_audio_feature(shared_datadir):
     audio_path = str(shared_datadir / "test_audio_44100.wav")

From 8dbe0d777ba5f8ffe47df844875fb6f934621712 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 15:49:02 +0100
Subject: [PATCH 24/42] Refactor Audio decode_example

---
 src/datasets/features/audio.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 01302e80db3..63d54cdbd4d 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -67,20 +67,20 @@ def decode_example(self, value):
         """
         if isinstance(value, str):
             value = {"path": value, "bytes": None}
-        if value["bytes"]:
-            path, file = value["path"], BytesIO(value["bytes"])
-            array, sampling_rate = (
-                self._decode_example_with_torchaudio(file)
-                if path.endswith(".mp3")
-                else self._decode_example_with_soundfile(file)
-            )
+        if value["path"].endswith("mp3"):
+            if value["bytes"]:
+                path, file = value["path"], BytesIO(value["bytes"])
+                array, sampling_rate = self._decode_example_with_torchaudio(file)
+            else:
+                path = value["path"]
+                array, sampling_rate = self._decode_example_with_torchaudio(path)
         else:
-            path = value["path"]
-            array, sampling_rate = (
-                self._decode_example_with_torchaudio(path)
-                if path.endswith(".mp3")
-                else self._decode_example_with_librosa(path)
-            )
+            if value["bytes"]:
+                path, file = value["path"], BytesIO(value["bytes"])
+                array, sampling_rate = self._decode_example_with_soundfile(file)
+            else:
+                path = value["path"]
+                array, sampling_rate = self._decode_example_with_librosa(path)
         return {"path": path, "array": array, "sampling_rate": sampling_rate}
 
     def _decode_example_with_librosa(self, value):

From c9732091c537e46fa1d00d5039463a6f3a2fb7ea Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 16:04:20 +0100
Subject: [PATCH 25/42] Pass raw bytes to torchaudio.load

---
 src/datasets/features/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 63d54cdbd4d..14b7c674fde 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -69,7 +69,7 @@ def decode_example(self, value):
             value = {"path": value, "bytes": None}
         if value["path"].endswith("mp3"):
             if value["bytes"]:
-                path, file = value["path"], BytesIO(value["bytes"])
+                path, file = value["path"], value["bytes"]
                 array, sampling_rate = self._decode_example_with_torchaudio(file)
             else:
                 path = value["path"]

From 7363e9ab922b93e4204893cf753e436843ffc5e6 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Wed, 10 Nov 2021 18:39:01 +0100
Subject: [PATCH 26/42] Revert "Pass raw bytes to torchaudio.load"

This reverts commit c9732091c537e46fa1d00d5039463a6f3a2fb7ea.
---
 src/datasets/features/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 14b7c674fde..63d54cdbd4d 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -69,7 +69,7 @@ def decode_example(self, value):
             value = {"path": value, "bytes": None}
         if value["path"].endswith("mp3"):
             if value["bytes"]:
-                path, file = value["path"], value["bytes"]
+                path, file = value["path"], BytesIO(value["bytes"])
                 array, sampling_rate = self._decode_example_with_torchaudio(file)
             else:
                 path = value["path"]

From 9f61ab8529a1078bb76451421db1e3d5154198fc Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 15 Nov 2021 21:27:43 +0100
Subject: [PATCH 27/42] Pass format to load in _decode_example_with_torchaudio

---
 src/datasets/features/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 63d54cdbd4d..77a425b1d4b 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -120,7 +120,7 @@ def _decode_example_with_torchaudio(self, value):
         except RuntimeError as err:
             raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err
 
-        array, sampling_rate = torchaudio.load(value)
+        array, sampling_rate = torchaudio.load(value, format="mp3")
         if self.sampling_rate and self.sampling_rate != sampling_rate:
             if not hasattr(self, "_resampler"):
                 self._resampler = T.Resample(sampling_rate, self.sampling_rate)

From efa4c2575d2e1595abb69eead9348ac53e24d6f6 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 15 Nov 2021 21:44:32 +0100
Subject: [PATCH 28/42] Fix filename extension in test

---
 tests/features/test_audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index 1dbaae13e9a..aacd42ee353 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -159,7 +159,7 @@ def test_dataset_with_audio_feature_tar_wav(tar_wav_path):
 @require_sox
 @require_torchaudio
 def test_dataset_with_audio_feature_tar_mp3(tar_mp3_path):
-    audio_filename = "test_audio_44100.wav"
+    audio_filename = "test_audio_44100.mp3"
     data = {"audio": []}
     for file_path, file_obj in iter_archive(tar_mp3_path):
         data["audio"].append({"path": file_path, "bytes": file_obj.read()})

From 659fb786e2e5c50419845d7b17b4c75b9718c270 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 13:58:37 +0100
Subject: [PATCH 29/42] Fix Audio tests CI

---
 .github/workflows/test-audio.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml
index fa9764145c7..addf9aa1035 100644
--- a/.github/workflows/test-audio.yml
+++ b/.github/workflows/test-audio.yml
@@ -12,15 +12,15 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
+      - name: Install OS dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install libsndfile1 sox
       - uses: actions/checkout@v2
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
           python-version: "3.6"
-      - name: Install OS dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install libsndfile1 sox
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip

From 2fc997a75ad22812ac3ad99479aea3e081e61977 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 14:10:40 +0100
Subject: [PATCH 30/42] Fix Audio tests CI

---
 .github/workflows/test-audio.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml
index addf9aa1035..4239f21f649 100644
--- a/.github/workflows/test-audio.yml
+++ b/.github/workflows/test-audio.yml
@@ -17,6 +17,8 @@ jobs:
           sudo apt-get update
           sudo apt-get install libsndfile1 sox
       - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
       - name: Set up Python
         uses: actions/setup-python@v2
         with:

From 416d1bf194b213542b6895c2e101ec1b3c92ec4b Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 14:15:00 +0100
Subject: [PATCH 31/42] Fix audio test CI by checking out PR HEAD commit
 instead of merge commit

---
 .github/workflows/test-audio.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/workflows/test-audio.yml b/.github/workflows/test-audio.yml
index 4239f21f649..68e0b8f0b3b 100644
--- a/.github/workflows/test-audio.yml
+++ b/.github/workflows/test-audio.yml
@@ -1,9 +1,6 @@
 name: Test audio
 
 on:
-  push:
-    branches:
-    - master
   pull_request:
     branches:
     - master
@@ -18,7 +15,7 @@ jobs:
           sudo apt-get install libsndfile1 sox
       - uses: actions/checkout@v2
         with:
-          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
       - name: Set up Python
         uses: actions/setup-python@v2
         with:

From 5f162406139733e8bf6ffc9b6a26266da8688d8a Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 14:40:26 +0100
Subject: [PATCH 32/42] Change default Audio storage dtype to string

---
 src/datasets/features/audio.py | 6 +++---
 tests/features/test_audio.py   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index f7762803ef7..51242733b81 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -29,7 +29,7 @@ class Audio:
 
     sampling_rate: Optional[int] = None
     mono: bool = True
-    _storage_dtype: str = "struct"
+    _storage_dtype: str = "string"
     id: Optional[str] = None
     # Automatically constructed
     dtype: ClassVar[str] = "dict"
@@ -50,8 +50,8 @@ def encode_example(self, value):
         Returns:
             :obj:`dict`
         """
-        if isinstance(value, str):
-            self._storage_dtype = "string"
+        if isinstance(value, dict):
+            self._storage_dtype = "struct"
         return value
 
     def decode_example(self, value):
diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py
index 2360713f005..69eb63579d3 100644
--- a/tests/features/test_audio.py
+++ b/tests/features/test_audio.py
@@ -62,7 +62,7 @@ def test_audio_instantiation():
     assert audio.dtype == "dict"
     assert audio.pa_type is None
     assert audio._type == "Audio"
-    assert audio._storage_dtype == "struct"
+    assert audio._storage_dtype == "string"
 
 
 @require_sndfile

From 488b74abb8e1ca72fdd0160a9e23ffc7ed5cd466 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 14:43:14 +0100
Subject: [PATCH 33/42] Rename Audio decode functions

---
 src/datasets/features/audio.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index 51242733b81..b39833bc3c2 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -72,20 +72,20 @@ def decode_example(self, value):
         if value["path"].endswith("mp3"):
             if value["bytes"]:
                 path, file = value["path"], BytesIO(value["bytes"])
-                array, sampling_rate = self._decode_example_with_torchaudio(file)
+                array, sampling_rate = self._decode_mp3(file)
             else:
                 path = value["path"]
-                array, sampling_rate = self._decode_example_with_torchaudio(path)
+                array, sampling_rate = self._decode_mp3(path)
         else:
             if value["bytes"]:
                 path, file = value["path"], BytesIO(value["bytes"])
-                array, sampling_rate = self._decode_example_with_soundfile(file)
+                array, sampling_rate = self._decode_non_mp3_file_like(file)
             else:
                 path = value["path"]
-                array, sampling_rate = self._decode_example_with_librosa(path)
+                array, sampling_rate = self._decode_non_mp3_path_like(path)
         return {"path": path, "array": array, "sampling_rate": sampling_rate}
 
-    def _decode_example_with_librosa(self, value):
+    def _decode_non_mp3_path_like(self, value):
         try:
             import librosa
         except ImportError as err:
@@ -95,7 +95,7 @@ def _decode_example_with_librosa(self, value):
             array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono)
         return array, sampling_rate
 
-    def _decode_example_with_soundfile(self, file):
+    def _decode_non_mp3_file_like(self, file):
         try:
             import librosa
             import soundfile as sf
@@ -111,7 +111,7 @@ def _decode_example_with_soundfile(self, file):
             sampling_rate = self.sampling_rate
         return array, sampling_rate
 
-    def _decode_example_with_torchaudio(self, value):
+    def _decode_mp3(self, value):
         try:
             import torchaudio
             import torchaudio.transforms as T

From 0ae5d44965dfc0eba084b8048a30755a9d189061 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 14:47:55 +0100
Subject: [PATCH 34/42] Refactor Audio decode_example

---
 src/datasets/features/audio.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index b39833bc3c2..f0e39fec4ee 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -67,21 +67,16 @@ def decode_example(self, value):
         Returns:
             dict
         """
-        if isinstance(value, str):
-            value = {"path": value, "bytes": None}
-        if value["path"].endswith("mp3"):
-            if value["bytes"]:
-                path, file = value["path"], BytesIO(value["bytes"])
+        path, file = (value["path"], BytesIO(value["bytes"])) if isinstance(value, dict) else (value, None)
+        if path.endswith("mp3"):
+            if file:
                 array, sampling_rate = self._decode_mp3(file)
             else:
-                path = value["path"]
                 array, sampling_rate = self._decode_mp3(path)
         else:
-            if value["bytes"]:
-                path, file = value["path"], BytesIO(value["bytes"])
+            if file:
                 array, sampling_rate = self._decode_non_mp3_file_like(file)
             else:
-                path = value["path"]
                 array, sampling_rate = self._decode_non_mp3_path_like(path)
         return {"path": path, "array": array, "sampling_rate": sampling_rate}
 

From 4679d8ed615a29bf367b4940f6b0ed07dac61052 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 15:38:54 +0100
Subject: [PATCH 35/42] Force CI re-run


From e178cc71a966175866eab1b08315f680f5825971 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 16:02:59 +0100
Subject: [PATCH 36/42] Refactor and rename

---
 src/datasets/features/audio.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index f0e39fec4ee..f9812b4c306 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -69,10 +69,7 @@ def decode_example(self, value):
         """
         path, file = (value["path"], BytesIO(value["bytes"])) if isinstance(value, dict) else (value, None)
         if path.endswith("mp3"):
-            if file:
-                array, sampling_rate = self._decode_mp3(file)
-            else:
-                array, sampling_rate = self._decode_mp3(path)
+            array, sampling_rate = self._decode_mp3(file if file else path)
         else:
             if file:
                 array, sampling_rate = self._decode_non_mp3_file_like(file)
@@ -80,13 +77,13 @@ def decode_example(self, value):
                 array, sampling_rate = self._decode_non_mp3_path_like(path)
         return {"path": path, "array": array, "sampling_rate": sampling_rate}
 
-    def _decode_non_mp3_path_like(self, value):
+    def _decode_non_mp3_path_like(self, path):
         try:
             import librosa
         except ImportError as err:
             raise ImportError("To support decoding audio files, please install 'librosa'.") from err
 
-        with xopen(value, "rb") as f:
+        with xopen(path, "rb") as f:
             array, sampling_rate = librosa.load(f, sr=self.sampling_rate, mono=self.mono)
         return array, sampling_rate
 
@@ -106,7 +103,7 @@ def _decode_non_mp3_file_like(self, file):
             sampling_rate = self.sampling_rate
         return array, sampling_rate
 
-    def _decode_mp3(self, value):
+    def _decode_mp3(self, path_or_file):
         try:
             import torchaudio
             import torchaudio.transforms as T
@@ -117,7 +114,7 @@ def _decode_mp3(self, value):
         except RuntimeError as err:
             raise ImportError("To support decoding 'mp3' audio files, please install 'sox'.") from err
 
-        array, sampling_rate = torchaudio.load(value, format="mp3")
+        array, sampling_rate = torchaudio.load(path_or_file, format="mp3")
         if self.sampling_rate and self.sampling_rate != sampling_rate:
             if not hasattr(self, "_resampler"):
                 self._resampler = T.Resample(sampling_rate, self.sampling_rate)

From 4c4a6873f6bf347a998d99849feaa80048203dce Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral
 <8515462+albertvillanova@users.noreply.github.com>
Date: Tue, 16 Nov 2021 16:04:20 +0100
Subject: [PATCH 37/42] Fix docstring

---
 src/datasets/features/audio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index f9812b4c306..9be32c1682d 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -48,7 +48,7 @@ def encode_example(self, value):
             value (:obj:`str` or :obj:`dict`): Data passed as input to Audio feature.
 
         Returns:
-            :obj:`dict`
+            :obj:`str` or :obj:`dict`
         """
         if isinstance(value, dict):
             self._storage_dtype = "struct"

From adbcc25dde5583a9aeb1e5e3792d887a2a3677fd Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Wed, 17 Nov 2021 18:23:07 +0100
Subject: [PATCH 38/42] put back the Audio feature

---
 datasets/common_voice/common_voice.py       |  4 ++--
 datasets/librispeech_asr/librispeech_asr.py |  4 ++--
 datasets/openslr/openslr.py                 | 23 +++++++--------------
 datasets/vivos/vivos.py                     |  6 ++----
 4 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py
index 32232058a77..7ceab649186 100644
--- a/datasets/common_voice/common_voice.py
+++ b/datasets/common_voice/common_voice.py
@@ -631,7 +631,7 @@ def _info(self):
             {
                 "client_id": datasets.Value("string"),
                 "path": datasets.Value("string"),
-                "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
+                "audio": datasets.features.Audio(sampling_rate=48_000),
                 "sentence": datasets.Value("string"),
                 "up_votes": datasets.Value("int64"),
                 "down_votes": datasets.Value("int64"),
@@ -739,6 +739,6 @@ def _generate_examples(self, files, filepath, path_to_clips):
                     result = {key: value for key, value in zip(data_fields, field_values)}
 
                     # set audio feature
-                    result["audio"] = {"path": path, "data": f.read()}
+                    result["audio"] = {"path": path, "bytes": f.read()}
 
                     yield id_, result
diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py
index 794b8d426b9..86bbee658c2 100644
--- a/datasets/librispeech_asr/librispeech_asr.py
+++ b/datasets/librispeech_asr/librispeech_asr.py
@@ -102,7 +102,7 @@ def _info(self):
             features=datasets.Features(
                 {
                     "file": datasets.Value("string"),
-                    "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
+                    "audio": datasets.features.Audio(sampling_rate=16_000),
                     "text": datasets.Value("string"),
                     "speaker_id": datasets.Value("int64"),
                     "chapter_id": datasets.Value("int64"),
@@ -170,7 +170,7 @@ def _generate_examples(self, files):
                         )
             if audio_data and len(audio_data) == len(transcripts):
                 for transcript in transcripts:
-                    audio = {"path": transcript["file"], "data": audio_data[transcript["id"]]}
+                    audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}
                     yield key, {"audio": audio, **transcript}
                     key += 1
                 audio_data = {}
diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py
index 9acb8d90d09..8389d25289a 100644
--- a/datasets/openslr/openslr.py
+++ b/datasets/openslr/openslr.py
@@ -535,22 +535,13 @@ class OpenSlr(datasets.GeneratorBasedBuilder):
     ]
 
     def _info(self):
-        if self.config.name in ["SLR32"]:
-            features = datasets.Features(
-                {
-                    "path": datasets.Value("string"),
-                    "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
-                    "sentence": datasets.Value("string"),
-                }
-            )
-        else:
-            features = datasets.Features(
-                {
-                    "path": datasets.Value("string"),
-                    "audio": datasets.features.Audio(sampling_rate=48_000),
-                    "sentence": datasets.Value("string"),
-                }
-            )
+        features = datasets.Features(
+            {
+                "path": datasets.Value("string"),
+                "audio": datasets.features.Audio(sampling_rate=48_000),
+                "sentence": datasets.Value("string"),
+            }
+        )
 
         return datasets.DatasetInfo(
             description=_DESCRIPTION,
diff --git a/datasets/vivos/vivos.py b/datasets/vivos/vivos.py
index c559ce9c4ae..0e596402b77 100644
--- a/datasets/vivos/vivos.py
+++ b/datasets/vivos/vivos.py
@@ -67,7 +67,7 @@ def _info(self):
                 {
                     "speaker_id": datasets.Value("string"),
                     "path": datasets.Value("string"),
-                    "audio": {"path": datasets.Value("string"), "data": datasets.features.Value("binary")},
+                    "audio": datasets.features.Audio(sampling_rate=16_000),
                     "sentence": datasets.Value("string"),
                 }
             ),
@@ -112,8 +112,6 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, prompts_path, path_to_clips, audio_files):
         """Yields examples as (key, example) tuples."""
-        # TODO(QL): use Audio featrue with data bytes instead of string path
-        raise Exception("TODO(QL): use Audio featrue with data bytes instead of string path")
         # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
         # The `key` is here for legacy reason (tfds) and is not important in itself.
         examples = {}
@@ -133,7 +131,7 @@ def _generate_examples(self, prompts_path, path_to_clips, audio_files):
             if path.startswith(path_to_clips):
                 inside_clips_dir = True
                 if path in examples:
-                    audio = {"path": path, "data": f.read()}
+                    audio = {"path": path, "bytes": f.read()}
                     yield id_, {**examples[path], "audio": audio}
                     id_ += 1
             elif inside_clips_dir:

From 25f18069b930984f80630f2a7bab24c7be78da31 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Wed, 17 Nov 2021 18:40:09 +0100
Subject: [PATCH 39/42] fix openslr

---
 datasets/openslr/openslr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/openslr/openslr.py b/datasets/openslr/openslr.py
index 8389d25289a..c76375b22f3 100644
--- a/datasets/openslr/openslr.py
+++ b/datasets/openslr/openslr.py
@@ -619,7 +619,7 @@ def _generate_examples(self, path_to_indexs, path_to_datas, archive_files):
                 for path, f in files:
                     if path.startswith(path_to_data):
                         counter += 1
-                        audio = {"path": path, "data": f.read()}
+                        audio = {"path": path, "bytes": f.read()}
                         yield counter, {"path": path, "audio": audio, "sentence": sentences[path]}
         else:
             for i, path_to_index in enumerate(path_to_indexs):

From 7f674776505b6dabad8faac4abe7a4a781653e7d Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <quentin@huggingface.co>
Date: Fri, 19 Nov 2021 09:56:23 +0000
Subject: [PATCH 40/42] fix common_voice

---
 datasets/common_voice/common_voice.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py
index 7ceab649186..b11a09a759e 100644
--- a/datasets/common_voice/common_voice.py
+++ b/datasets/common_voice/common_voice.py
@@ -713,8 +713,10 @@ def _generate_examples(self, files, filepath, path_to_clips):
         path_idx = data_fields.index("path")
 
         all_field_values = {}
+        metadata_found = False
         for path, f in files:
             if path == filepath:
+                metadata_found = True
                 lines = f.readlines()
                 headline = lines[0].decode("utf-8")
 
@@ -722,13 +724,15 @@ def _generate_examples(self, files, filepath, path_to_clips):
                 assert (
                     column_names == data_fields
                 ), f"The file should have {data_fields} as column names, but has {column_names}"
-                for id_, line in enumerate(lines[1:]):
+                for line in lines[1:]:
                     field_values = line.decode("utf-8").strip().split("\t")
                     # set full path for mp3 audio file
                     audio_path = "/".join([path_to_clips, field_values[path_idx]])
                     all_field_values[audio_path] = field_values
             elif path.startswith(path_to_clips):
-                assert all_field_values, "Found audio clips before the metadata TSV file."
+                assert metadata_found, "Found audio clips before the metadata TSV file."
+                if not all_field_values:
+                    break
                 if path in all_field_values:
                     field_values = all_field_values[path]
 
@@ -741,4 +745,4 @@ def _generate_examples(self, files, filepath, path_to_clips):
                     # set audio feature
                     result["audio"] = {"path": path, "bytes": f.read()}
 
-                    yield id_, result
+                    yield path, result

From 45ed8cdc42d9a2341afbe6bc74765e394601975a Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <quentin@huggingface.co>
Date: Fri, 19 Nov 2021 09:56:30 +0000
Subject: [PATCH 41/42] update infos

---
 datasets/common_voice/dataset_infos.json    | 2 +-
 datasets/librispeech_asr/dataset_infos.json | 2 +-
 datasets/openslr/dataset_infos.json         | 2 +-
 datasets/vivos/dataset_infos.json           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/datasets/common_voice/dataset_infos.json b/datasets/common_voice/dataset_infos.json
index f9266ece149..1e79e044ac3 100644
--- a/datasets/common_voice/dataset_infos.json
+++ b/datasets/common_voice/dataset_infos.json
@@ -1 +1 @@
-{"ab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10802, "num_examples": 22, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4442, "num_examples": 9, "dataset_name": "common_voice"}, "validation": {"name": 
"validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 376182, "num_examples": 752, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 3906, "num_examples": 8, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ab.tar.gz": {"num_bytes": 41038412, "checksum": "801de9c63f740c4d2c821709586921bed216c736e593051306579cf478a54388"}}, "download_size": 41038412, "post_processing_size": null, "dataset_size": 395332, "size_in_bytes": 41433744}, "ar": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ar", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6330858, "num_examples": 14227, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 3306715, "num_examples": 7622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 3330810, "num_examples": 7517, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 7881421, "num_examples": 18283, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2822099, "num_examples": 6333, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ar.tar.gz": {"num_bytes": 1756264615, "checksum": "516b369da8a000c1b98d8f5ee3b90fa12bcc5d5438391fcf01f3d5e78ccdd6fa"}}, "download_size": 1756264615, "post_processing_size": null, "dataset_size": 23671903, "size_in_bytes": 1779936518}, "as": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "as", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 135331, "num_examples": 270, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 54717, "num_examples": 110, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 63580, "num_examples": 124, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 15547, "num_examples": 31, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/as.tar.gz": {"num_bytes": 22226465, "checksum": "d9afd6d28e9c837ff0943a94452fb12ce8a7885b38fdeb25fc2912bbe4977f40"}}, "download_size": 22226465, "post_processing_size": null, "dataset_size": 269175, "size_in_bytes": 22495640}, "br": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "br", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1114817, "num_examples": 2780, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 838823, "num_examples": 2087, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 807978, "num_examples": 1997, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 4446871, "num_examples": 10912, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 260104, "num_examples": 623, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/br.tar.gz": {"num_bytes": 465276982, "checksum": "d323d71337055b794c8fe3dcdf5a0dc03d6bf8f7c8c19f96369884410aef4606"}}, "download_size": 465276982, "post_processing_size": null, "dataset_size": 7468593, "size_in_bytes": 472745575}, "ca": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "ca", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 128917601, "num_examples": 285584, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6886168, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6959066, "num_examples": 15724, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 28903919, "num_examples": 64446, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 8504933, "num_examples": 18846, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ca.tar.gz": {"num_bytes": 20743110341, "checksum": "a27bec66c151ddb21c1736781b3bca972047cc20c02488bad94d2311c40bc6da"}}, "download_size": 20743110341, "post_processing_size": null, "dataset_size": 180171687, "size_in_bytes": 20923282028}, "cnh": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cnh", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 330832, "num_examples": 807, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 307840, "num_examples": 752, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 310074, "num_examples": 756, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1208870, "num_examples": 2934, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 177752, "num_examples": 433, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cnh.tar.gz": {"num_bytes": 161331331, "checksum": "9c27ce17ea8db73e7a2c8715bdb3a45a40792d6d64238cfbb467a81c6b71d71f"}}, "download_size": 161331331, "post_processing_size": null, "dataset_size": 2335368, "size_in_bytes": 163666699}, "cs": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "cs", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2459092, "num_examples": 5655, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1748420, "num_examples": 4144, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1756122, "num_examples": 4118, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3247839, "num_examples": 7475, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 292158, "num_examples": 685, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cs.tar.gz": {"num_bytes": 1271909933, "checksum": "68a1d6f27eb7161fdf28da889e7d37e8c86b7aff73b0b6df52edc8359e30ac56"}}, "download_size": 1271909933, "post_processing_size": null, "dataset_size": 9503631, "size_in_bytes": 1281413564}, "cv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 436012, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 365363, "num_examples": 788, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 388030, "num_examples": 818, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3263709, "num_examples": 6927, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 607952, "num_examples": 1282, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cv.tar.gz": {"num_bytes": 439329081, "checksum": "c3fb84c28a5718f01b91cf1026985b1dcd83bb312d32620f16b5ed4f12fb8c73"}}, "download_size": 439329081, "post_processing_size": null, "dataset_size": 5061066, "size_in_bytes": 444390147}, "cy": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "cy", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3029147, "num_examples": 6839, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2060863, "num_examples": 4820, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2102719, "num_examples": 4776, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 7778447, "num_examples": 17919, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1569654, "num_examples": 3648, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cy.tar.gz": {"num_bytes": 3434474658, "checksum": "269da0cbbb2887d1903c0e17bbb71ea9bcd83506ba928fe75c660cb3e52f9a67"}}, "download_size": 3434474658, "post_processing_size": null, "dataset_size": 16540830, "size_in_bytes": 3451015488}, "de": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "de", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 111735161, "num_examples": 246525, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6785721, "num_examples": 15588, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6850065, "num_examples": 15588, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 4563457, "num_examples": 10095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 14542398, "num_examples": 32789, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/de.tar.gz": {"num_bytes": 23283812097, "checksum": "733e6e367da4b9588b4bb175ac45c6c0ec545e41df5494a7ee4a7e4ff3141ef7"}}, "download_size": 23283812097, "post_processing_size": null, "dataset_size": 144476802, "size_in_bytes": 23428288899}, "dv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "dv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1312675, "num_examples": 2680, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1075889, "num_examples": 2202, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1032265, "num_examples": 2077, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 421053, "num_examples": 840, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/dv.tar.gz": {"num_bytes": 540488041, "checksum": "b2c8617df5e7aebd74d88491913ecc6b94066198e875853b0b3847d13e70f419"}}, "download_size": 540488041, "post_processing_size": null, "dataset_size": 3841882, "size_in_bytes": 544329923}, "el": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "el", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1043636, "num_examples": 2316, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 677742, "num_examples": 1522, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 631379, "num_examples": 1401, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2539987, "num_examples": 5659, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 83583, "num_examples": 185, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/el.tar.gz": {"num_bytes": 381570611, "checksum": "86c67e7bda7658a7087b5a1997d140d57957a05bb413a188610db61807c53ee4"}}, "download_size": 381570611, "post_processing_size": null, "dataset_size": 4976327, "size_in_bytes": 386546938}, "en": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "en", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 250691604, "num_examples": 564337, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6850452, "num_examples": 16164, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6976081, "num_examples": 16164, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 72156747, "num_examples": 169895, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 82557632, "num_examples": 189562, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en.tar.gz": {"num_bytes": 60613063630, "checksum": "0f8fdfc4fe715738be94ee49c4fb63d5f1608d2e6a43a2bed80f6cb871171c36"}}, "download_size": 60613063630, "post_processing_size": null, "dataset_size": 419232516, "size_in_bytes": 61032296146}, "eo": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "eo", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8663844, "num_examples": 19587, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 3843190, "num_examples": 8969, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 3879354, "num_examples": 8987, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1296351, "num_examples": 2946, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2081223, "num_examples": 4736, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eo.tar.gz": {"num_bytes": 2883560869, "checksum": "c19900010aee0f9eb39416406598509b1cdba136a16318e746b1a64f97d7809c"}}, "download_size": 2883560869, "post_processing_size": null, "dataset_size": 19763962, "size_in_bytes": 2903324831}, "es": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "es", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 72689623, "num_examples": 161813, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6544041, "num_examples": 15089, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6567785, "num_examples": 15089, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 62421588, "num_examples": 144791, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 17672664, "num_examples": 40640, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/es.tar.gz": {"num_bytes": 16188844718, "checksum": "276ca393783cd8b208d56b5032b87c13a40fcadde5b3925596e67c15578d0235"}}, "download_size": 16188844718, "post_processing_size": null, "dataset_size": 165895701, "size_in_bytes": 16354740419}, "et": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "et", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1426348, "num_examples": 2966, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1173073, "num_examples": 2509, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1212463, "num_examples": 2507, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 266991, "num_examples": 569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1766673, "num_examples": 3557, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/et.tar.gz": {"num_bytes": 767174465, "checksum": "50a861393e4e7013ab71f1b63bca8c42c26dca1519c15a3b9cdb3cb5b6c561a2"}}, "download_size": 767174465, "post_processing_size": null, "dataset_size": 5845548, "size_in_bytes": 773020013}, "eu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "eu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3389176, "num_examples": 7505, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2247330, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2281644, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 10454269, "num_examples": 23570, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2389658, "num_examples": 5387, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eu.tar.gz": {"num_bytes": 3664586106, "checksum": "55b6eaf7ca7c120faa0b60d71c87189b610412334e6b710fe12c2a79489ab06f"}}, "download_size": 3664586106, "post_processing_size": null, "dataset_size": 20762077, "size_in_bytes": 3685348183}, "fa": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fa", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3246710, "num_examples": 7593, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2271812, "num_examples": 5213, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2263134, "num_examples": 5213, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9773876, "num_examples": 22510, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5329900, "num_examples": 11698, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz": {"num_bytes": 8884585819, "checksum": "5454efe3b2f6d06d51e7177469b7bef9a962adbf7611e3cd21771451112abe6d"}}, "download_size": 8884585819, "post_processing_size": null, "dataset_size": 22885432, "size_in_bytes": 8907471251}, "fi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "fi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 199505, "num_examples": 460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 183540, "num_examples": 428, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 179607, "num_examples": 415, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 64358, "num_examples": 149, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 25781, "num_examples": 59, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fi.tar.gz": {"num_bytes": 49882909, "checksum": "eb26d0904beef5ec08cf53267be7e78b8ba5056fd162057d5b085a7cba51f035"}}, "download_size": 49882909, "post_processing_size": null, "dataset_size": 652791, "size_in_bytes": 50535700}, "fr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 133605567, "num_examples": 298982, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6854610, "num_examples": 15763, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6868568, "num_examples": 15763, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1435580, "num_examples": 3222, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 17776024, "num_examples": 40351, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fr.tar.gz": {"num_bytes": 19130141984, "checksum": "719ef964b55d830a095a602aff311db39b77239e9d600b6af646ec2ed57e5e45"}}, "download_size": 19130141984, "post_processing_size": null, "dataset_size": 166540349, "size_in_bytes": 19296682333}, "fy-NL": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "fy-NL", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1695909, "num_examples": 3927, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1311327, "num_examples": 3020, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1215844, "num_examples": 2790, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9389087, "num_examples": 21569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 451010, "num_examples": 1031, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fy-NL.tar.gz": {"num_bytes": 1237743070, "checksum": "ddee4fc3ce52df2379fa4069090d8f5c853155dc0462eb645f6111e2da627297"}}, "download_size": 1237743070, "post_processing_size": null, "dataset_size": 14063177, "size_in_bytes": 1251806247}, "ga-IE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ga-IE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 236396, "num_examples": 541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 215599, "num_examples": 506, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 212002, "num_examples": 497, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 917017, "num_examples": 2130, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 176661, "num_examples": 409, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ga-IE.tar.gz": {"num_bytes": 156553447, "checksum": "27223fc99af6a45f81190ecb90034806991ff3b9e3aa38a7e97caaabbb0a4ddc"}}, "download_size": 156553447, "post_processing_size": null, "dataset_size": 1757675, "size_in_bytes": 158311122}, "hi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "hi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 73903, "num_examples": 157, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 58773, "num_examples": 127, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 64002, "num_examples": 135, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 67240, "num_examples": 139, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 29139, "num_examples": 60, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hi.tar.gz": {"num_bytes": 21424045, "checksum": "5492393b04dd1307a52d93525a7db08fc392c8ba0df553668945152e434f58c9"}}, "download_size": 21424045, "post_processing_size": null, "dataset_size": 293057, "size_in_bytes": 21717102}, "hsb": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hsb", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 367798, "num_examples": 808, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 173155, "num_examples": 387, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 77478, "num_examples": 172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 28207, "num_examples": 62, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 103211, "num_examples": 227, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hsb.tar.gz": {"num_bytes": 79362060, "checksum": "3dd3d79aaa078ad7955552ebc596e0a8894ffd7a4a88a51b2c8ee80c0e088152"}}, "download_size": 79362060, "post_processing_size": null, "dataset_size": 749849, "size_in_bytes": 80111909}, "hu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "hu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1428176, "num_examples": 3348, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 699721, "num_examples": 1649, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 612969, "num_examples": 1434, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 127337, "num_examples": 295, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 72559, "num_examples": 169, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hu.tar.gz": {"num_bytes": 242758708, "checksum": "61f933155cba6c54c0b76d0ddd2caebd62d69228b7c935382112abe172660953"}}, "download_size": 242758708, "post_processing_size": null, "dataset_size": 2940762, "size_in_bytes": 245699470}, "ia": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ia", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1446791, "num_examples": 3477, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 372192, "num_examples": 899, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 664744, "num_examples": 1601, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 452330, "num_examples": 1095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 79695, "num_examples": 192, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ia.tar.gz": {"num_bytes": 226499645, "checksum": "47a137a805ea8ce01f2cf9277739919a824a9fd13468345dfbd84eddb52c02f1"}}, "download_size": 226499645, "post_processing_size": null, "dataset_size": 3015752, "size_in_bytes": 229515397}, "id": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "id", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 889083, "num_examples": 2130, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 766675, "num_examples": 1844, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 766720, "num_examples": 1835, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2831110, "num_examples": 6782, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 196795, "num_examples": 470, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/id.tar.gz": {"num_bytes": 475918233, "checksum": "71177fa9d2fac29f48db5feabc294f1d6bbcaa0c326b0d1099be66c0b804b245"}}, "download_size": 475918233, "post_processing_size": null, "dataset_size": 5450383, "size_in_bytes": 481368616}, "it": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "it", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 25748596, "num_examples": 58015, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 5629778, "num_examples": 12928, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5651445, "num_examples": 12928, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 6438506, "num_examples": 14549, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5425867, "num_examples": 12189, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/it.tar.gz": {"num_bytes": 5585781573, "checksum": "3a75b1631958af1487ee49b13cd27efc951183737ed515832cf714ed20c97808"}}, "download_size": 5585781573, "post_processing_size": null, "dataset_size": 48894192, "size_in_bytes": 5634675765}, "ja": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "ja", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 317820, "num_examples": 722, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 278459, "num_examples": 632, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 255038, "num_examples": 586, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 389563, "num_examples": 885, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 222566, "num_examples": 504, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ja.tar.gz": {"num_bytes": 152879796, "checksum": "3614cd0d0abac80794351c78183967c83179fab390d7e19cad97758eb85ae558"}}, "download_size": 152879796, "post_processing_size": null, "dataset_size": 1463446, "size_in_bytes": 154343242}, "ka": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ka", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 581587, "num_examples": 1058, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 358380, "num_examples": 656, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 294673, "num_examples": 527, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 24443, "num_examples": 44, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 78770, "num_examples": 139, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ka.tar.gz": {"num_bytes": 104280554, "checksum": "7677df9d650234306a11bf8518be5807e72e7d5fc440d391304d1b99dd5517f5"}}, "download_size": 104280554, "post_processing_size": null, "dataset_size": 1337853, "size_in_bytes": 105618407}, "kab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "kab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 49343008, "num_examples": 120530, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 5936276, "num_examples": 14622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5928674, "num_examples": 14622, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 36104123, "num_examples": 88021, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 7518840, "num_examples": 18134, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/kab.tar.gz": {"num_bytes": 17171606918, "checksum": "d2089107d4f3a84856c457a436a47a883b872022f2085cfad0501469be91fd95"}}, "download_size": 17171606918, "post_processing_size": null, "dataset_size": 104830921, "size_in_bytes": 17276437839}, "ky": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ky", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 927074, "num_examples": 1955, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 700081, "num_examples": 1503, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 711620, "num_examples": 1511, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3410831, "num_examples": 7223, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 437848, "num_examples": 926, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ky.tar.gz": {"num_bytes": 579440853, "checksum": "6efe0ca5384d0419fcf5fda0e0229a1b5eb80d8eeba2d7528a4c3c9f2593206f"}}, "download_size": 579440853, "post_processing_size": null, "dataset_size": 6187454, "size_in_bytes": 585628307}, "lg": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "lg", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 549563, "num_examples": 1250, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 253625, "num_examples": 584, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 168943, "num_examples": 384, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1365647, "num_examples": 3110, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 127043, "num_examples": 290, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lg.tar.gz": {"num_bytes": 208197149, "checksum": "71243c65f638cd7f392fabe22e37cbafbdca4eb5a199210000ae957a88768040"}}, "download_size": 208197149, "post_processing_size": null, "dataset_size": 2464821, "size_in_bytes": 210661970}, "lt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 402862, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 203781, "num_examples": 466, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 106451, "num_examples": 244, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 710428, "num_examples": 1629, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 44360, "num_examples": 102, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lt.tar.gz": {"num_bytes": 135299706, "checksum": "5ad3d93bc308f58a70e6685f71ae035237ef9caa0922232ac76846f7587bb8aa"}}, "download_size": 135299706, "post_processing_size": null, "dataset_size": 1467882, "size_in_bytes": 136767588}, "lv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "lv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1051326, "num_examples": 2552, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 767926, "num_examples": 1882, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 819846, "num_examples": 2002, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 641669, "num_examples": 1560, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 58933, "num_examples": 143, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lv.tar.gz": {"num_bytes": 208307691, "checksum": "8a4350ccf24884ee1012032bfd5a87e0de50d780b1f8450d1cb52afe3f69c671"}}, "download_size": 208307691, "post_processing_size": null, "dataset_size": 3339700, "size_in_bytes": 211647391}, "mn": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "mn", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1088733, "num_examples": 2183, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 912144, "num_examples": 1862, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 912414, "num_examples": 1837, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1628610, "num_examples": 3272, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 332643, "num_examples": 667, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mn.tar.gz": {"num_bytes": 486369317, "checksum": "3aebc40d40eb19263576664a981f4bb8b221abeab78c8154adc3d16875c75ec7"}}, "download_size": 486369317, "post_processing_size": null, "dataset_size": 4874544, "size_in_bytes": 491243861}, "mt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "mt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 884543, "num_examples": 2036, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 690486, "num_examples": 1617, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 651610, "num_examples": 1516, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2464327, "num_examples": 5714, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 136773, "num_examples": 314, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mt.tar.gz": {"num_bytes": 425114242, "checksum": "9d53000d7832d130c4d35fb412bfc092ab8de8e763a5d2a528aebf37f052af03"}}, "download_size": 425114242, "post_processing_size": null, "dataset_size": 4827739, "size_in_bytes": 429941981}, "nl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "nl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4219972, "num_examples": 9460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2457725, "num_examples": 5708, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2200827, "num_examples": 4938, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 11420, "num_examples": 27, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1442237, "num_examples": 3308, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/nl.tar.gz": {"num_bytes": 1741827548, "checksum": "048f823408e3bbd16e63111d1b4caecb0102606c440bbdf3e5b6a6bae1e1e3f1"}}, "download_size": 1741827548, "post_processing_size": null, "dataset_size": 10332181, "size_in_bytes": 1752159729}, "or": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "or", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 196790, "num_examples": 388, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 49231, "num_examples": 98, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 65559, "num_examples": 129, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2191159, "num_examples": 4302, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 30974, "num_examples": 62, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/or.tar.gz": {"num_bytes": 199077358, "checksum": "f3edad30166fe454f4d2b14adeece1434dc4b8eb7b0ece37aac8389b7122218a"}}, "download_size": 199077358, "post_processing_size": null, "dataset_size": 2533713, "size_in_bytes": 201611071}, "pa-IN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pa-IN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 100668, "num_examples": 211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 54307, "num_examples": 116, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 20728, "num_examples": 44, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 670272, "num_examples": 1411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 20354, "num_examples": 43, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pa-IN.tar.gz": {"num_bytes": 69748265, "checksum": "d2e30f28a227ecb8209340c4133edf6489f35f8e3d1eb55ff22b96b12f36952c"}}, "download_size": 69748265, "post_processing_size": null, "dataset_size": 866329, "size_in_bytes": 70614594}, "pl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "pl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3259050, "num_examples": 7468, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2156262, "num_examples": 5153, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2203857, "num_examples": 5153, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5566818, "num_examples": 12848, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1983448, "num_examples": 4601, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pl.tar.gz": {"num_bytes": 3537012341, "checksum": "acbf77d36e083e2bcb7152ffb52ab7d1e3e64d33a3f51f106cdff7feff6279aa"}}, "download_size": 3537012341, "post_processing_size": null, "dataset_size": 15169435, "size_in_bytes": 3552181776}, "pt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2763497, "num_examples": 6514, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1948500, "num_examples": 4641, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1936082, "num_examples": 4592, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3492648, "num_examples": 8390, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 738577, "num_examples": 1740, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pt.tar.gz": {"num_bytes": 1704252567, "checksum": "6700de499f728e0e3f3ed4d7005e5b7db27ba2ddc872b21b0b404c3b4859d84b"}}, "download_size": 1704252567, "post_processing_size": null, "dataset_size": 10879304, "size_in_bytes": 1715131871}, "rm-sursilv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "rm-sursilv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 627518, "num_examples": 1384, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 535630, "num_examples": 1194, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 539772, "num_examples": 1205, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 946574, "num_examples": 2102, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 290484, "num_examples": 639, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-sursilv.tar.gz": {"num_bytes": 275950479, "checksum": "3cfc4971b6ab8958d7c3d784977690fcc04ebd7570ecf788d5948df84a5481a1"}}, "download_size": 275950479, "post_processing_size": null, "dataset_size": 2939978, "size_in_bytes": 278890457}, "rm-vallader": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rm-vallader", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 267837, "num_examples": 574, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 173761, "num_examples": 378, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 163725, "num_examples": 357, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 339277, "num_examples": 727, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 175312, "num_examples": 374, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-vallader.tar.gz": {"num_bytes": 108113989, "checksum": "4fdb7dc5e20862a636ee7975831b39db29012d615f9139edf2d266b878ce43ae"}}, "download_size": 108113989, "post_processing_size": null, "dataset_size": 1119912, "size_in_bytes": 109233901}, "ro": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "ro", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1457000, "num_examples": 3399, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 756861, "num_examples": 1778, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 368157, "num_examples": 858, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 827971, "num_examples": 1945, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 207526, "num_examples": 485, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ro.tar.gz": {"num_bytes": 261978702, "checksum": "450b159e936ef6ff136fcdfad193675caec5b2230d1b6ca24c5cde491ff002cd"}}, "download_size": 261978702, "post_processing_size": null, "dataset_size": 3617515, "size_in_bytes": 265596217}, "ru": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ru", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7918252, "num_examples": 15481, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4035778, "num_examples": 8007, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 4017986, "num_examples": 7963, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5123246, "num_examples": 10247, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1567391, "num_examples": 3056, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ru.tar.gz": {"num_bytes": 3655676916, "checksum": "dcbb460e58d4afc78047c3801c9eb56d940b388eb350ee3da3de5bfe5a74a025"}}, "download_size": 3655676916, "post_processing_size": null, "dataset_size": 22662653, "size_in_bytes": 3678339569}, "rw": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "rw", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 222435182, "num_examples": 515197, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6836125, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6685632, "num_examples": 15032, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9774022, "num_examples": 22923, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 93086051, "num_examples": 206790, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rw.tar.gz": {"num_bytes": 42545189583, "checksum": "cf8a07059b3713022d487f9a6b8f465271f3457c525a8b350f829f87b0132b41"}}, "download_size": 42545189583, "post_processing_size": null, "dataset_size": 338817012, "size_in_bytes": 42884006595}, "sah": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sah", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 733267, "num_examples": 1442, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 379003, "num_examples": 757, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 204118, "num_examples": 405, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 636097, "num_examples": 1275, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 33499, "num_examples": 66, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sah.tar.gz": {"num_bytes": 181245626, "checksum": "dea1a454813c8f90abcbdf427fa922e1b7a116753deeb410af096ce5f0ae2405"}}, "download_size": 181245626, "post_processing_size": null, "dataset_size": 1985984, "size_in_bytes": 183231610}, "sl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "sl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 845619, "num_examples": 2038, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 363066, "num_examples": 881, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 231081, "num_examples": 556, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1033232, "num_examples": 2502, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 37929, "num_examples": 92, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sl.tar.gz": {"num_bytes": 222751292, "checksum": "184cfbfe876a1f1c6317e4e34680c82a940db833afca78203c2929db1768a353"}}, "download_size": 222751292, "post_processing_size": null, "dataset_size": 2510927, "size_in_bytes": 225262219}, "sv-SE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sv-SE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 983262, "num_examples": 2331, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 840358, "num_examples": 2027, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 844026, "num_examples": 2019, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1329608, "num_examples": 3043, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 193364, "num_examples": 462, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sv-SE.tar.gz": {"num_bytes": 421434184, "checksum": "dc8634dafacb33be00f06e376f6c479d53f84f4834952593c8903f1080535213"}}, "download_size": 421434184, "post_processing_size": null, "dataset_size": 4190618, "size_in_bytes": 425624802}, "ta": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "ta", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 957720, "num_examples": 2009, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 846103, "num_examples": 1781, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 858400, "num_examples": 1779, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3584809, "num_examples": 7428, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 284039, "num_examples": 594, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ta.tar.gz": {"num_bytes": 679766097, "checksum": "78560d9d608a63ee75c3fdeb7f96f33cf0d85855ba6294b13e945de066eb46d8"}}, "download_size": 679766097, "post_processing_size": null, "dataset_size": 6531071, "size_in_bytes": 686297168}, "th": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "th", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1389723, "num_examples": 2917, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1029454, "num_examples": 2188, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 909292, "num_examples": 1922, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1268833, "num_examples": 2671, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 222666, "num_examples": 467, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/th.tar.gz": {"num_bytes": 341305736, "checksum": "a3d11043c49d3ea8ffb58dfab117cd831dd62a641e0a26ac60eb43e483534f7a"}}, "download_size": 341305736, "post_processing_size": null, "dataset_size": 4819968, "size_in_bytes": 346125704}, "tr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "tr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 778858, "num_examples": 1831, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 689987, "num_examples": 1647, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 694938, "num_examples": 1647, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 137465, "num_examples": 325, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 730583, "num_examples": 1726, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tr.tar.gz": {"num_bytes": 620848700, "checksum": "b3f266c868b1fe9f76270ba76226b1cdc17f33b3e387e6b44a64d5419f8b9768"}}, "download_size": 620848700, "post_processing_size": null, "dataset_size": 3031831, "size_in_bytes": 623880531}, "tt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "tt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 5048627, "num_examples": 11211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1974398, "num_examples": 4485, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 939118, "num_examples": 2127, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 793843, "num_examples": 1798, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 129728, "num_examples": 287, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tt.tar.gz": {"num_bytes": 777153207, "checksum": "89c8d7a49584de720f1790df39e6f07996e2eecb07f6273f4ba2668e9fe4ad46"}}, "download_size": 777153207, "post_processing_size": null, "dataset_size": 8885714, "size_in_bytes": 786038921}, "uk": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "uk", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1888179, "num_examples": 4035, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1511544, "num_examples": 3235, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1521216, "num_examples": 3236, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3830066, "num_examples": 8161, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 598922, "num_examples": 1255, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/uk.tar.gz": {"num_bytes": 1218559031, "checksum": "f3ca0143cd84f5eacb583187052e69efec21c571a426efee91a765a2284519c2"}}, "download_size": 1218559031, "post_processing_size": null, "dataset_size": 9349927, "size_in_bytes": 1227908958}, "vi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "vi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 92564, "num_examples": 221, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 82035, "num_examples": 198, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 84472, "num_examples": 200, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 366671, "num_examples": 870, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 32664, "num_examples": 78, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vi.tar.gz": {"num_bytes": 51929480, "checksum": "704bce8031932377cc21c017923ff1e96ebd2be9bd520adcf839f7a0f5f03b6e"}}, "download_size": 51929480, "post_processing_size": null, "dataset_size": 658406, "size_in_bytes": 52587886}, "vot": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", 
"config_name": "vot", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1250, "num_examples": 3, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 163377, "num_examples": 411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2419, "num_examples": 6, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vot.tar.gz": {"num_bytes": 7792602, "checksum": "7fb07dd25b0575e8cd811bb8d1e5aebd17fdbca079a4ee50d81e0aaaff50f8b0"}}, "download_size": 7792602, "post_processing_size": null, "dataset_size": 167046, "size_in_bytes": 7959648}, "zh-CN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-CN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8279157, "num_examples": 18541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 3757047, "num_examples": 8760, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 3823707, "num_examples": 8743, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3908115, "num_examples": 8948, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2328784, "num_examples": 5305, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-CN.tar.gz": {"num_bytes": 2184602350, "checksum": "cd8589cac28541f9f996d1954f14c307954f1146ac44a8eadad8e31ebaf1f15e"}}, "download_size": 2184602350, "post_processing_size": null, "dataset_size": 22096810, "size_in_bytes": 2206699160}, "zh-HK": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": 
"common_voice", "config_name": "zh-HK", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3142432, "num_examples": 7506, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 2144145, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 2163111, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 16142369, "num_examples": 38830, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1272392, "num_examples": 2999, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-HK.tar.gz": {"num_bytes": 2774145806, "checksum": "8a525ce4664d6647701449d5e72f7d8658cc3a5fabc72e05c6883994fd3c0134"}}, "download_size": 2774145806, "post_processing_size": null, "dataset_size": 24864449, "size_in_bytes": 2799010255}, "zh-TW": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-TW", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1478055, "num_examples": 3507, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 1184204, "num_examples": 2895, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1204526, "num_examples": 2895, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 9437896, "num_examples": 22477, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1493820, "num_examples": 3584, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-TW.tar.gz": {"num_bytes": 2182836295, "checksum": "67fadf561f8237690d4a4a1d63a9b3ac271b5d05438dc745b7e04282d909460f"}}, "download_size": 2182836295, "post_processing_size": null, "dataset_size": 14798501, "size_in_bytes": 2197634796}}
\ No newline at end of file
+{"ab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1295622, "num_examples": 22, "dataset_name": "common_voice"}, "test": 
{"name": "test", "num_bytes": 411844, "num_examples": 9, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 40023390, "num_examples": 752, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 361626, "num_examples": 8, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ab.tar.gz": {"num_bytes": 41038412, "checksum": "801de9c63f740c4d2c821709586921bed216c736e593051306579cf478a54388"}}, "download_size": 41038412, "post_processing_size": null, "dataset_size": 42092482, "size_in_bytes": 83130894}, "ar": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ar", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 359335168, "num_examples": 14227, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 237546641, "num_examples": 7622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 209606861, "num_examples": 7517, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 515822404, "num_examples": 18283, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 194805036, "num_examples": 6333, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ar.tar.gz": {"num_bytes": 1756264615, "checksum": "516b369da8a000c1b98d8f5ee3b90fa12bcc5d5438391fcf01f3d5e78ccdd6fa"}}, "download_size": 1756264615, "post_processing_size": null, "dataset_size": 1517116110, "size_in_bytes": 3273380725}, "as": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "as", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11442279, "num_examples": 270, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 5071343, "num_examples": 110, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5480156, "num_examples": 124, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 886145, "num_examples": 31, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/as.tar.gz": {"num_bytes": 22226465, "checksum": "d9afd6d28e9c837ff0943a94452fb12ce8a7885b38fdeb25fc2912bbe4977f40"}}, "download_size": 22226465, "post_processing_size": null, "dataset_size": 22879923, "size_in_bytes": 45106388}, "br": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "br", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62238289, "num_examples": 2780, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 54461339, "num_examples": 2087, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 46995570, "num_examples": 1997, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 269858143, "num_examples": 10912, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 20861017, "num_examples": 623, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/br.tar.gz": {"num_bytes": 465276982, "checksum": "d323d71337055b794c8fe3dcdf5a0dc03d6bf8f7c8c19f96369884410aef4606"}}, "download_size": 465276982, "post_processing_size": null, "dataset_size": 454414358, "size_in_bytes": 919691340}, "ca": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ca", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12966939466, "num_examples": 285584, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 745761890, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 716442038, "num_examples": 15724, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2693542910, "num_examples": 64446, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 850402888, "num_examples": 18846, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ca.tar.gz": {"num_bytes": 20743110341, "checksum": "a27bec66c151ddb21c1736781b3bca972047cc20c02488bad94d2311c40bc6da"}}, "download_size": 20743110341, "post_processing_size": null, "dataset_size": 17973089192, "size_in_bytes": 38716199533}, "cnh": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cnh", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 18866674, "num_examples": 807, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 24675321, "num_examples": 752, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 22162315, "num_examples": 756, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 84878963, "num_examples": 2934, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 13642724, "num_examples": 433, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cnh.tar.gz": {"num_bytes": 161331331, "checksum": "9c27ce17ea8db73e7a2c8715bdb3a45a40792d6d64238cfbb467a81c6b71d71f"}}, "download_size": 161331331, "post_processing_size": null, "dataset_size": 164225997, "size_in_bytes": 325557328}, "cs": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cs", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 215205282, "num_examples": 5655, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 148499476, "num_examples": 4144, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 148312130, "num_examples": 4118, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 282225475, "num_examples": 7475, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 24717823, "num_examples": 685, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cs.tar.gz": {"num_bytes": 1271909933, "checksum": "68a1d6f27eb7161fdf28da889e7d37e8c86b7aff73b0b6df52edc8359e30ac56"}}, "download_size": 1271909933, "post_processing_size": null, "dataset_size": 818960186, "size_in_bytes": 2090870119}, "cv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 31649510, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 32513061, "num_examples": 788, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 28429779, "num_examples": 818, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 288294623, "num_examples": 6927, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 57923138, "num_examples": 1282, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cv.tar.gz": {"num_bytes": 439329081, "checksum": "c3fb84c28a5718f01b91cf1026985b1dcd83bb312d32620f16b5ed4f12fb8c73"}}, "download_size": 439329081, "post_processing_size": null, "dataset_size": 438810111, "size_in_bytes": 878139192}, "cy": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "cy", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 271642649, "num_examples": 6839, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 206865596, "num_examples": 4820, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 201813388, "num_examples": 4776, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 688469886, "num_examples": 17919, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 146874576, "num_examples": 3648, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/cy.tar.gz": {"num_bytes": 3434474658, "checksum": "269da0cbbb2887d1903c0e17bbb71ea9bcd83506ba928fe75c660cb3e52f9a67"}}, "download_size": 3434474658, "post_processing_size": null, "dataset_size": 1515666095, "size_in_bytes": 4950140753}, "de": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "de", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11463160619, "num_examples": 246525, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 744617681, "num_examples": 15588, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 729559862, "num_examples": 15588, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 464513461, "num_examples": 10095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1440604803, "num_examples": 32789, "dataset_name": 
"common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/de.tar.gz": {"num_bytes": 23283812097, "checksum": "733e6e367da4b9588b4bb175ac45c6c0ec545e41df5494a7ee4a7e4ff3141ef7"}}, "download_size": 23283812097, "post_processing_size": null, "dataset_size": 14842456426, "size_in_bytes": 38126268523}, "dv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, 
"task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "dv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 118576140, "num_examples": 2680, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 94281409, "num_examples": 2202, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 94117088, "num_examples": 2077, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 37694847, "num_examples": 840, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/dv.tar.gz": {"num_bytes": 540488041, "checksum": "b2c8617df5e7aebd74d88491913ecc6b94066198e875853b0b3847d13e70f419"}}, "download_size": 540488041, "post_processing_size": null, "dataset_size": 344669484, "size_in_bytes": 885157525}, "el": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "el", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 80759076, "num_examples": 2316, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 53820491, "num_examples": 1522, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 44818565, "num_examples": 1401, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 186861175, "num_examples": 5659, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 6023769, "num_examples": 185, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/el.tar.gz": {"num_bytes": 381570611, "checksum": "86c67e7bda7658a7087b5a1997d140d57957a05bb413a188610db61807c53ee4"}}, "download_size": 381570611, "post_processing_size": null, "dataset_size": 372283076, "size_in_bytes": 753853687}, "en": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "en", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 26088826658, "num_examples": 564337, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 758718688, "num_examples": 16164, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 795638801, "num_examples": 16164, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5796244022, "num_examples": 169895, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 9122973965, "num_examples": 189562, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/en.tar.gz": {"num_bytes": 60613063630, "checksum": "0f8fdfc4fe715738be94ee49c4fb63d5f1608d2e6a43a2bed80f6cb871171c36"}}, "download_size": 60613063630, "post_processing_size": null, "dataset_size": 42562402134, "size_in_bytes": 103175465764}, "eo": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "eo", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 993655930, "num_examples": 19587, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 420153812, "num_examples": 8969, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 391427586, "num_examples": 8987, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 142476819, "num_examples": 2946, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 238105462, "num_examples": 4736, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eo.tar.gz": {"num_bytes": 2883560869, "checksum": "c19900010aee0f9eb39416406598509b1cdba136a16318e746b1a64f97d7809c"}}, "download_size": 2883560869, "post_processing_size": null, "dataset_size": 2185819609, "size_in_bytes": 5069380478}, "es": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "es", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6918333205, "num_examples": 161813, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 754049291, "num_examples": 15089, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 735558084, "num_examples": 15089, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5528972205, "num_examples": 144791, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1664876264, "num_examples": 40640, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/es.tar.gz": {"num_bytes": 16188844718, "checksum": "276ca393783cd8b208d56b5032b87c13a40fcadde5b3925596e67c15578d0235"}}, "download_size": 16188844718, "post_processing_size": null, "dataset_size": 15601789049, "size_in_bytes": 31790633767}, "et": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "et", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 161124199, "num_examples": 2966, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 133183135, "num_examples": 2509, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 137604813, "num_examples": 2507, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 30339130, "num_examples": 569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 193019544, "num_examples": 3557, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/et.tar.gz": {"num_bytes": 767174465, "checksum": "50a861393e4e7013ab71f1b63bca8c42c26dca1519c15a3b9cdb3cb5b6c561a2"}}, "download_size": 767174465, "post_processing_size": null, "dataset_size": 655270821, "size_in_bytes": 1422445286}, "eu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "eu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 317322801, "num_examples": 7505, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 238866501, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 228150083, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 988079897, "num_examples": 23570, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 208553909, "num_examples": 5387, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/eu.tar.gz": {"num_bytes": 3664586106, "checksum": "55b6eaf7ca7c120faa0b60d71c87189b610412334e6b710fe12c2a79489ab06f"}}, "download_size": 3664586106, "post_processing_size": null, "dataset_size": 1980973191, "size_in_bytes": 5645559297}, "fa": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fa", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 239255087, "num_examples": 7593, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 217939210, "num_examples": 5213, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 196558067, "num_examples": 5213, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 737017546, "num_examples": 22510, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 499570226, "num_examples": 11698, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fa.tar.gz": {"num_bytes": 8884585819, "checksum": "5454efe3b2f6d06d51e7177469b7bef9a962adbf7611e3cd21771451112abe6d"}}, "download_size": 8884585819, "post_processing_size": null, "dataset_size": 1890340136, "size_in_bytes": 10774925955}, "fi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 16017393, "num_examples": 460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 16117529, "num_examples": 428, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 15471757, "num_examples": 415, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 5836400, "num_examples": 149, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2228215, "num_examples": 59, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fi.tar.gz": {"num_bytes": 49882909, "checksum": "eb26d0904beef5ec08cf53267be7e78b8ba5056fd162057d5b085a7cba51f035"}}, "download_size": 49882909, "post_processing_size": null, "dataset_size": 55671294, "size_in_bytes": 105554203}, "fr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 12439892070, "num_examples": 298982, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 733943163, "num_examples": 15763, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 703801114, "num_examples": 15763, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 117998889, "num_examples": 3222, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1794149368, "num_examples": 40351, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fr.tar.gz": {"num_bytes": 19130141984, "checksum": "719ef964b55d830a095a602aff311db39b77239e9d600b6af646ec2ed57e5e45"}}, "download_size": 19130141984, "post_processing_size": null, "dataset_size": 15789784604, "size_in_bytes": 34919926588}, "fy-NL": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": 
[{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "fy-NL", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 159116360, "num_examples": 3927, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 126913262, "num_examples": 3020, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 112288554, "num_examples": 2790, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 893887467, "num_examples": 21569, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 38985422, "num_examples": 1031, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/fy-NL.tar.gz": {"num_bytes": 1237743070, "checksum": "ddee4fc3ce52df2379fa4069090d8f5c853155dc0462eb645f6111e2da627297"}}, "download_size": 1237743070, "post_processing_size": null, "dataset_size": 1331191065, "size_in_bytes": 2568934135}, "ga-IE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ga-IE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 15396820, "num_examples": 541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 16611739, "num_examples": 506, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 14897739, "num_examples": 497, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 61948768, "num_examples": 2130, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 10993268, "num_examples": 409, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ga-IE.tar.gz": {"num_bytes": 156553447, "checksum": "27223fc99af6a45f81190ecb90034806991ff3b9e3aa38a7e97caaabbb0a4ddc"}}, "download_size": 156553447, "post_processing_size": null, "dataset_size": 119848334, "size_in_bytes": 276401781}, "hi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4860737, "num_examples": 157, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4728043, "num_examples": 127, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5569352, "num_examples": 135, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 4176110, "num_examples": 139, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2801051, "num_examples": 60, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hi.tar.gz": {"num_bytes": 21424045, "checksum": "5492393b04dd1307a52d93525a7db08fc392c8ba0df553668945152e434f58c9"}}, "download_size": 21424045, "post_processing_size": null, "dataset_size": 22135293, "size_in_bytes": 43559338}, "hsb": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hsb", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43049910, "num_examples": 808, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 20929094, "num_examples": 387, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 8769458, "num_examples": 172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 3173841, "num_examples": 62, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5589972, "num_examples": 227, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hsb.tar.gz": {"num_bytes": 79362060, "checksum": "3dd3d79aaa078ad7955552ebc596e0a8894ffd7a4a88a51b2c8ee80c0e088152"}}, "download_size": 79362060, "post_processing_size": null, "dataset_size": 81512275, "size_in_bytes": 160874335}, "hu": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "hu", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 126163153, "num_examples": 3348, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 57056435, "num_examples": 1649, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 50306925, "num_examples": 1434, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 12051094, "num_examples": 295, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 5881521, "num_examples": 169, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/hu.tar.gz": {"num_bytes": 242758708, "checksum": "61f933155cba6c54c0b76d0ddd2caebd62d69228b7c935382112abe172660953"}}, "download_size": 242758708, "post_processing_size": null, "dataset_size": 251459128, "size_in_bytes": 494217836}, "ia": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ia", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 96577153, "num_examples": 3477, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 33204678, "num_examples": 899, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 67436779, "num_examples": 1601, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 30937041, "num_examples": 1095, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 6769573, "num_examples": 192, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ia.tar.gz": {"num_bytes": 226499645, "checksum": "47a137a805ea8ce01f2cf9277739919a824a9fd13468345dfbd84eddb52c02f1"}}, "download_size": 226499645, "post_processing_size": null, "dataset_size": 234925224, "size_in_bytes": 461424869}, "id": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "id", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 63515863, "num_examples": 2130, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 60711104, "num_examples": 1844, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 56963520, "num_examples": 1835, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 206578628, "num_examples": 6782, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 16566129, "num_examples": 470, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/id.tar.gz": {"num_bytes": 475918233, "checksum": "71177fa9d2fac29f48db5feabc294f1d6bbcaa0c326b0d1099be66c0b804b245"}}, "download_size": 475918233, "post_processing_size": null, "dataset_size": 404335244, "size_in_bytes": 880253477}, "it": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "it", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2555546829, "num_examples": 58015, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 656285877, "num_examples": 12928, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 621955330, "num_examples": 12928, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 671213467, "num_examples": 14549, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 564610354, "num_examples": 12189, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/it.tar.gz": {"num_bytes": 5585781573, "checksum": "3a75b1631958af1487ee49b13cd27efc951183737ed515832cf714ed20c97808"}}, "download_size": 5585781573, "post_processing_size": null, "dataset_size": 5069611857, "size_in_bytes": 10655393430}, "ja": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ja", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 27600264, "num_examples": 722, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 26475556, "num_examples": 632, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 22098940, "num_examples": 586, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 34588931, "num_examples": 885, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 17819020, "num_examples": 504, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ja.tar.gz": {"num_bytes": 152879796, "checksum": "3614cd0d0abac80794351c78183967c83179fab390d7e19cad97758eb85ae558"}}, "download_size": 152879796, "post_processing_size": null, "dataset_size": 128582711, "size_in_bytes": 281462507}, "ka": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ka", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 47790695, "num_examples": 1058, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 30301524, "num_examples": 656, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 24951079, "num_examples": 527, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2144603, "num_examples": 44, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 7004160, "num_examples": 139, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ka.tar.gz": {"num_bytes": 104280554, "checksum": "7677df9d650234306a11bf8518be5807e72e7d5fc440d391304d1b99dd5517f5"}}, "download_size": 104280554, "post_processing_size": null, "dataset_size": 112192061, "size_in_bytes": 216472615}, "kab": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "kab", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3219289101, "num_examples": 120530, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 446453041, "num_examples": 14622, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 414159937, "num_examples": 14622, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 2282481767, "num_examples": 88021, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 581587104, "num_examples": 18134, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/kab.tar.gz": {"num_bytes": 17171606918, "checksum": "d2089107d4f3a84856c457a436a47a883b872022f2085cfad0501469be91fd95"}}, "download_size": 17171606918, "post_processing_size": null, "dataset_size": 6943970950, "size_in_bytes": 24115577868}, "ky": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ky", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 75460488, "num_examples": 1955, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 57116561, "num_examples": 1503, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 61393867, "num_examples": 1511, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 258081579, "num_examples": 7223, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 41007711, "num_examples": 926, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ky.tar.gz": {"num_bytes": 579440853, "checksum": "6efe0ca5384d0419fcf5fda0e0229a1b5eb80d8eeba2d7528a4c3c9f2593206f"}}, "download_size": 579440853, "post_processing_size": null, "dataset_size": 493060206, "size_in_bytes": 1072501059}, "lg": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lg", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 46910479, "num_examples": 1250, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 26951803, "num_examples": 584, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 16709367, "num_examples": 384, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 111180838, "num_examples": 3110, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 14069959, "num_examples": 290, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lg.tar.gz": {"num_bytes": 208197149, "checksum": "71243c65f638cd7f392fabe22e37cbafbdca4eb5a199210000ae957a88768040"}}, "download_size": 208197149, "post_processing_size": null, "dataset_size": 215822446, "size_in_bytes": 424019595}, "lt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 34605356, "num_examples": 931, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 19940391, "num_examples": 466, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 10462851, "num_examples": 244, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 71150206, "num_examples": 1629, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 4414780, "num_examples": 102, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lt.tar.gz": {"num_bytes": 135299706, "checksum": "5ad3d93bc308f58a70e6685f71ae035237ef9caa0922232ac76846f7587bb8aa"}}, "download_size": 135299706, "post_processing_size": null, "dataset_size": 140573584, "size_in_bytes": 275873290}, "lv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "lv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 67269173, "num_examples": 2552, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 56937435, "num_examples": 1882, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 55289058, "num_examples": 2002, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 40259801, "num_examples": 1560, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 4383319, "num_examples": 143, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/lv.tar.gz": {"num_bytes": 208307691, "checksum": "8a4350ccf24884ee1012032bfd5a87e0de50d780b1f8450d1cb52afe3f69c671"}}, "download_size": 208307691, "post_processing_size": null, "dataset_size": 224138786, "size_in_bytes": 432446477}, "mn": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "mn", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 89913910, "num_examples": 2183, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 86737041, "num_examples": 1862, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 82343275, "num_examples": 1837, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 146365394, "num_examples": 3272, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 31764232, "num_examples": 667, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mn.tar.gz": {"num_bytes": 486369317, "checksum": "3aebc40d40eb19263576664a981f4bb8b221abeab78c8154adc3d16875c75ec7"}}, "download_size": 486369317, "post_processing_size": null, "dataset_size": 437123852, "size_in_bytes": 923493169}, "mt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "mt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 73850815, "num_examples": 2036, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 66520195, "num_examples": 1617, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 56412066, "num_examples": 1516, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 220666971, "num_examples": 5714, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 12328068, "num_examples": 314, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/mt.tar.gz": {"num_bytes": 425114242, "checksum": "9d53000d7832d130c4d35fb412bfc092ab8de8e763a5d2a528aebf37f052af03"}}, "download_size": 425114242, "post_processing_size": null, "dataset_size": 429778115, "size_in_bytes": 854892357}, "nl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "nl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 321946148, "num_examples": 9460, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 205287443, "num_examples": 5708, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 186095353, "num_examples": 4938, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 801418, "num_examples": 27, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 115133112, "num_examples": 3308, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/nl.tar.gz": {"num_bytes": 1741827548, "checksum": "048f823408e3bbd16e63111d1b4caecb0102606c440bbdf3e5b6a6bae1e1e3f1"}}, "download_size": 1741827548, "post_processing_size": null, "dataset_size": 829263474, "size_in_bytes": 2571091022}, "or": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "or", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 16067910, "num_examples": 388, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4270651, "num_examples": 98, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 5485937, "num_examples": 129, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 177775963, "num_examples": 4302, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2701922, "num_examples": 62, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/or.tar.gz": {"num_bytes": 199077358, "checksum": "f3edad30166fe454f4d2b14adeece1434dc4b8eb7b0ece37aac8389b7122218a"}}, "download_size": 199077358, "post_processing_size": null, "dataset_size": 206302383, "size_in_bytes": 405379741}, "pa-IN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pa-IN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7572499, "num_examples": 211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 4375532, "num_examples": 116, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 1702492, "num_examples": 44, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 56683312, "num_examples": 1411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 1690766, "num_examples": 43, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pa-IN.tar.gz": {"num_bytes": 69748265, "checksum": "d2e30f28a227ecb8209340c4133edf6489f35f8e3d1eb55ff22b96b12f36952c"}}, "download_size": 69748265, "post_processing_size": null, "dataset_size": 72024601, "size_in_bytes": 141772866}, "pl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 273394509, "num_examples": 7468, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 205047541, "num_examples": 5153, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 195917307, "num_examples": 5153, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 442144781, "num_examples": 12848, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 180801918, "num_examples": 4601, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pl.tar.gz": {"num_bytes": 3537012341, "checksum": "acbf77d36e083e2bcb7152ffb52ab7d1e3e64d33a3f51f106cdff7feff6279aa"}}, "download_size": 3537012341, "post_processing_size": null, "dataset_size": 1297306056, "size_in_bytes": 4834318397}, "pt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "pt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 231451724, "num_examples": 6514, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 180108694, "num_examples": 4641, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 165966139, "num_examples": 4592, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 283497435, "num_examples": 8390, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 67948392, "num_examples": 1740, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/pt.tar.gz": {"num_bytes": 1704252567, "checksum": "6700de499f728e0e3f3ed4d7005e5b7db27ba2ddc872b21b0b404c3b4859d84b"}}, "download_size": 1704252567, "post_processing_size": null, "dataset_size": 928972384, "size_in_bytes": 2633224951}, "rm-sursilv": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": 
[{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rm-sursilv", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62396326, "num_examples": 1384, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 51707733, "num_examples": 1194, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 52114252, "num_examples": 1205, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 93351293, "num_examples": 2102, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 30593270, "num_examples": 639, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-sursilv.tar.gz": {"num_bytes": 275950479, "checksum": "3cfc4971b6ab8958d7c3d784977690fcc04ebd7570ecf788d5948df84a5481a1"}}, "download_size": 275950479, "post_processing_size": null, "dataset_size": 290162874, "size_in_bytes": 566113353}, "rm-vallader": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rm-vallader", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29528457, "num_examples": 574, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 18805466, "num_examples": 378, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 17012341, "num_examples": 357, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 36890435, "num_examples": 727, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 9356204, "num_examples": 374, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rm-vallader.tar.gz": {"num_bytes": 108113989, "checksum": "4fdb7dc5e20862a636ee7975831b39db29012d615f9139edf2d266b878ce43ae"}}, "download_size": 108113989, "post_processing_size": null, "dataset_size": 111592903, "size_in_bytes": 219706892}, "ro": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": 
[{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ro", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 107235430, "num_examples": 3399, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 60106568, "num_examples": 1778, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 30358457, "num_examples": 858, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 65805210, "num_examples": 1945, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 11108104, "num_examples": 485, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ro.tar.gz": {"num_bytes": 261978702, "checksum": "450b159e936ef6ff136fcdfad193675caec5b2230d1b6ca24c5cde491ff002cd"}}, "download_size": 261978702, "post_processing_size": null, "dataset_size": 274613769, "size_in_bytes": 536592471}, "ru": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ru", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 686168722, "num_examples": 15481, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 385349488, "num_examples": 8007, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 361164462, "num_examples": 7963, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 450644862, "num_examples": 10247, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 145739451, "num_examples": 3056, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ru.tar.gz": {"num_bytes": 3655676916, "checksum": "dcbb460e58d4afc78047c3801c9eb56d940b388eb350ee3da3de5bfe5a74a025"}}, "download_size": 3655676916, "post_processing_size": null, "dataset_size": 2029066985, "size_in_bytes": 5684743901}, "rw": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "rw", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 21645788973, "num_examples": 515197, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 707959382, "num_examples": 15724, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 698662384, "num_examples": 15032, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 923146896, "num_examples": 22923, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 7969286423, "num_examples": 206790, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/rw.tar.gz": {"num_bytes": 42545189583, "checksum": "cf8a07059b3713022d487f9a6b8f465271f3457c525a8b350f829f87b0132b41"}}, "download_size": 42545189583, "post_processing_size": null, "dataset_size": 31944844058, "size_in_bytes": 74490033641}, "sah": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sah", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 68286985, "num_examples": 1442, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 38534020, "num_examples": 757, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 17900397, "num_examples": 405, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 62594222, "num_examples": 1275, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 3594160, "num_examples": 66, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sah.tar.gz": {"num_bytes": 181245626, "checksum": "dea1a454813c8f90abcbdf427fa922e1b7a116753deeb410af096ce5f0ae2405"}}, "download_size": 181245626, "post_processing_size": null, "dataset_size": 190909784, "size_in_bytes": 372155410}, "sl": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sl", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 66122967, "num_examples": 2038, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 26872195, "num_examples": 881, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 16353097, "num_examples": 556, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 79268518, "num_examples": 2502, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 3048301, "num_examples": 92, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sl.tar.gz": {"num_bytes": 222751292, "checksum": "184cfbfe876a1f1c6317e4e34680c82a940db833afca78203c2929db1768a353"}}, "download_size": 222751292, "post_processing_size": null, "dataset_size": 191665078, "size_in_bytes": 414416370}, "sv-SE": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "sv-SE", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62727263, "num_examples": 2331, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 59127381, "num_examples": 2027, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 53846355, "num_examples": 2019, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 109970049, "num_examples": 3043, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 13462567, "num_examples": 462, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/sv-SE.tar.gz": {"num_bytes": 421434184, "checksum": "dc8634dafacb33be00f06e376f6c479d53f84f4834952593c8903f1080535213"}}, "download_size": 421434184, "post_processing_size": null, "dataset_size": 299133615, "size_in_bytes": 720567799}, "ta": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "ta", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 69052658, "num_examples": 2009, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 67616865, "num_examples": 1781, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 63248009, "num_examples": 1779, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 246650792, "num_examples": 7428, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 23587453, "num_examples": 594, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/ta.tar.gz": {"num_bytes": 679766097, "checksum": "78560d9d608a63ee75c3fdeb7f96f33cf0d85855ba6294b13e945de066eb46d8"}}, "download_size": 679766097, "post_processing_size": null, "dataset_size": 470155777, "size_in_bytes": 1149921874}, "th": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "th", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 100435725, "num_examples": 2917, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 82030679, "num_examples": 2188, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 63237632, "num_examples": 1922, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 95235301, "num_examples": 2671, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 18247080, "num_examples": 467, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/th.tar.gz": {"num_bytes": 341305736, "checksum": "a3d11043c49d3ea8ffb58dfab117cd831dd62a641e0a26ac60eb43e483534f7a"}}, "download_size": 341305736, "post_processing_size": null, "dataset_size": 359186417, "size_in_bytes": 700492153}, "tr": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "tr", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 57879052, "num_examples": 1831, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 60268059, "num_examples": 1647, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 54914798, "num_examples": 1647, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 10954154, "num_examples": 325, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 59288266, "num_examples": 1726, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tr.tar.gz": {"num_bytes": 620848700, "checksum": "b3f266c868b1fe9f76270ba76226b1cdc17f33b3e387e6b44a64d5419f8b9768"}}, "download_size": 620848700, "post_processing_size": null, "dataset_size": 243304329, "size_in_bytes": 864153029}, "tt": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "tt", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 348132697, "num_examples": 11211, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 135120057, "num_examples": 4485, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 61690964, "num_examples": 2127, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 62158038, "num_examples": 1798, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 10403128, "num_examples": 287, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/tt.tar.gz": {"num_bytes": 777153207, "checksum": "89c8d7a49584de720f1790df39e6f07996e2eecb07f6273f4ba2668e9fe4ad46"}}, "download_size": 777153207, "post_processing_size": null, "dataset_size": 617504884, "size_in_bytes": 1394658091}, "uk": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "uk", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 161925063, "num_examples": 4035, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 138422211, "num_examples": 3235, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 135483169, "num_examples": 3236, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 327979131, "num_examples": 8161, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 55745301, "num_examples": 1255, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/uk.tar.gz": {"num_bytes": 1218559031, "checksum": "f3ca0143cd84f5eacb583187052e69efec21c571a426efee91a765a2284519c2"}}, "download_size": 1218559031, "post_processing_size": null, "dataset_size": 819554875, "size_in_bytes": 2038113906}, "vi": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "vi", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6244454, "num_examples": 221, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 6656365, "num_examples": 198, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 6531856, "num_examples": 200, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 31315434, "num_examples": 870, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 2981661, "num_examples": 78, "dataset_name": "common_voice"}}, "download_checksums": 
{"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vi.tar.gz": {"num_bytes": 51929480, "checksum": "704bce8031932377cc21c017923ff1e96ebd2be9bd520adcf839f7a0f5f03b6e"}}, "download_size": 51929480, "post_processing_size": null, "dataset_size": 53729770, "size_in_bytes": 105659250}, "vot": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": 
"automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "vot", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 146467, "num_examples": 3, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 0, "num_examples": 0, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 7963322, "num_examples": 411, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 107949, "num_examples": 6, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/vot.tar.gz": {"num_bytes": 7792602, "checksum": "7fb07dd25b0575e8cd811bb8d1e5aebd17fdbca079a4ee50d81e0aaaff50f8b0"}}, "download_size": 7792602, "post_processing_size": null, "dataset_size": 8217738, "size_in_bytes": 16010340}, "zh-CN": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-CN", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 793667379, "num_examples": 18541, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 420202544, "num_examples": 8760, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 396096323, "num_examples": 8743, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 381264783, "num_examples": 8948, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 266234479, "num_examples": 5305, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-CN.tar.gz": {"num_bytes": 2184602350, "checksum": "cd8589cac28541f9f996d1954f14c307954f1146ac44a8eadad8e31ebaf1f15e"}}, "download_size": 2184602350, "post_processing_size": null, "dataset_size": 2257465508, "size_in_bytes": 4442067858}, "zh-HK": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": 
[{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-HK", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 221459521, "num_examples": 7506, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 217627041, "num_examples": 5172, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 196071110, "num_examples": 5172, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 1319233252, "num_examples": 38830, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 124170969, "num_examples": 2999, "dataset_name": "common_voice"}}, "download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-HK.tar.gz": {"num_bytes": 2774145806, "checksum": "8a525ce4664d6647701449d5e72f7d8658cc3a5fabc72e05c6883994fd3c0134"}}, "download_size": 2774145806, "post_processing_size": null, "dataset_size": 2078561893, "size_in_bytes": 4852707699}, "zh-TW": {"description": "Common Voice is Mozilla's initiative to help teach machines how real people speak.\nThe dataset currently consists of 7,335 validated hours of speech in 60 languages, but we\u2019re always adding more voices and languages.\n", "citation": "@inproceedings{commonvoice:2020,\n  author = {Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. 
and Weber, G.},\n  title = {Common Voice: A Massively-Multilingual Speech Corpus},\n  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)},\n  pages = {4211--4215},\n  year = 2020\n}\n", "homepage": "https://commonvoice.mozilla.org/en/datasets", "license": "https://github.com/common-voice/common-voice/blob/main/LICENSE", "features": {"client_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}, "up_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "down_votes": {"dtype": "int64", "id": null, "_type": "Value"}, "age": {"dtype": "string", "id": null, "_type": "Value"}, "gender": {"dtype": "string", "id": null, "_type": "Value"}, "accent": {"dtype": "string", "id": null, "_type": "Value"}, "locale": {"dtype": "string", "id": null, "_type": "Value"}, "segment": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "common_voice", "config_name": "zh-TW", "version": {"version_str": "6.1.0", "description": "", "major": 6, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 97323787, "num_examples": 3507, "dataset_name": "common_voice"}, "test": {"name": "test", "num_bytes": 85512325, "num_examples": 2895, "dataset_name": "common_voice"}, "validation": {"name": "validation", "num_bytes": 80402637, "num_examples": 2895, "dataset_name": "common_voice"}, "other": {"name": "other", "num_bytes": 623801957, "num_examples": 22477, "dataset_name": "common_voice"}, "invalidated": {"name": "invalidated", "num_bytes": 100241443, "num_examples": 3584, "dataset_name": "common_voice"}}, 
"download_checksums": {"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/zh-TW.tar.gz": {"num_bytes": 2182836295, "checksum": "67fadf561f8237690d4a4a1d63a9b3ac271b5d05438dc745b7e04282d909460f"}}, "download_size": 2182836295, "post_processing_size": null, "dataset_size": 987282149, "size_in_bytes": 3170118444}}
\ No newline at end of file
diff --git a/datasets/librispeech_asr/dataset_infos.json b/datasets/librispeech_asr/dataset_infos.json
index 737f150eba8..3ff72c90ec3 100644
--- a/datasets/librispeech_asr/dataset_infos.json
+++ b/datasets/librispeech_asr/dataset_infos.json
@@ -1 +1 @@
-{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", 
"num_bytes": 11823891, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 43049490, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 894510, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 868614, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 56636505, "size_in_bytes": 30178014159}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 59561081, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 907644, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 934838, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": 
"d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 61403563, "size_in_bytes": 31297968940}}
\ No newline at end of file
+{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", 
"description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 6619683041, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 23898214592, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 359572231, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 367705423, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 31245175287, "size_in_bytes": 61366552941}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 31810256902, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 337283304, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 352396474, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": 
{"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 32499936680, "size_in_bytes": 63736502057}}
\ No newline at end of file
diff --git a/datasets/openslr/dataset_infos.json b/datasets/openslr/dataset_infos.json
index cd269515fb2..e3de4f34dbb 100644
--- a/datasets/openslr/dataset_infos.json
+++ b/datasets/openslr/dataset_infos.json
@@ -1 +1 @@
-{"SLR41": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR41", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2363510, "num_examples": 5822, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/41/jv_id_female.zip": {"num_bytes": 967179448, "checksum": "6fd795a441b3ddd62d6131d4bbd9231151af89f5d9ce5ac7d8ecb370a49576c7"}, "https://openslr.org/resources/41/jv_id_male.zip": {"num_bytes": 923612912, "checksum": "6ee23916b7489420a538e7032f58d7be088a615fb67ec3e7043414d436bb5c1a"}}, "download_size": 1890792360, "post_processing_size": null, "dataset_size": 2363510, "size_in_bytes": 1893155870}, "SLR42": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR42", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1397844, "num_examples": 2906, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/42/km_kh_male.zip": {"num_bytes": 866086951, "checksum": "c0ec9c0494c57f04cf1f2d8d2668d517598375f24e34de07272ecd637c332591"}}, "download_size": 866086951, "post_processing_size": null, "dataset_size": 1397844, "size_in_bytes": 867484795}, "SLR43": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR43", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1052597, "num_examples": 2064, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/43/ne_np_female.zip": {"num_bytes": 800375645, "checksum": "3f355b543e1fad7af5e63116db871fac8e0a2d2f1a2c8f6ebc742270819da101"}}, "download_size": 800375645, "post_processing_size": null, "dataset_size": 1052597, "size_in_bytes": 801428242}, "SLR44": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR44", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1733125, "num_examples": 4213, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/44/su_id_female.zip": {"num_bytes": 861425671, "checksum": "aa75bdef23b7bf0b980431d68df6bb32f695f3be365eb379d4c22516d2d11c5a"}, "https://openslr.org/resources/44/su_id_male.zip": {"num_bytes": 610827081, "checksum": "cabed03a45d4ce0f76e2de4d34b82d6876cd00d5ad6a5349629359028460652d"}}, "download_size": 1472252752, "post_processing_size": null, "dataset_size": 1733125, "size_in_bytes": 1473985877}, "SLR63": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR63", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1973791, "num_examples": 4126, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/63/ml_in_female.zip": {"num_bytes": 710218411, "checksum": "e82d70717d20304f20f635d248c8cb1fd0c9c888e35b9105c8306fc76498a67e"}, "https://openslr.org/resources/63/ml_in_male.zip": {"num_bytes": 635657888, "checksum": "d1a6de4f58f53b973596ff1c69a64afea70f899b044397ce37465c626eee2ab9"}}, "download_size": 1345876299, "post_processing_size": null, "dataset_size": 1973791, "size_in_bytes": 1347850090}, "SLR64": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR64", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 794097, "num_examples": 1569, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/64/mr_in_female.zip": {"num_bytes": 712155683, "checksum": "42b770ee87c95379b55e187b17dccb9fbacb05d0e8292430ffe16a7483948fe5"}}, "download_size": 712155683, "post_processing_size": null, "dataset_size": 794097, "size_in_bytes": 712949780}, "SLR65": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR65", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2092011, "num_examples": 4284, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/65/ta_in_female.zip": {"num_bytes": 769504014, "checksum": "fe00da10ae12ecd6dbe1afcc5abe365d44ad9036fb017cbd73bcfed71e0f8c81"}, "https://openslr.org/resources/65/ta_in_male.zip": {"num_bytes": 603800641, "checksum": "80e546e954939c92a0cd732446418b583b61da9f538f83b00cbd445cbebd4395"}}, "download_size": 1373304655, "post_processing_size": null, "dataset_size": 2092011, "size_in_bytes": 1375396666}, "SLR66": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR66", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1852199, "num_examples": 4448, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/66/te_in_female.zip": {"num_bytes": 505680804, "checksum": "3aa3c22d6fad33ed68951f4934ae47349ee76b77220d8261ec3bda8c24bf42b2"}, "https://openslr.org/resources/66/te_in_male.zip": {"num_bytes": 529447066, "checksum": "f8a0f239d39088b6702a2186681e2874328e9fcd9bfa6a0dd9e1dc5695be3185"}}, "download_size": 1035127870, "post_processing_size": null, "dataset_size": 1852199, "size_in_bytes": 1036980069}, "SLR69": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR69", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1603279, "num_examples": 4240, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/69/ca_es_female.zip": {"num_bytes": 1043934596, "checksum": "2ec39de70550a1cdb93aee960967125fb652b8d26b8de4f6e8658c62847c3f11"}, "https://openslr.org/resources/69/ca_es_male.zip": {"num_bytes": 804724947, "checksum": "8b412ffaa65cd85692c6eab038fc085a8ae5613c6eed38c097a65946c2ee9146"}}, "download_size": 1848659543, "post_processing_size": null, "dataset_size": 1603279, "size_in_bytes": 1850262822}, "SLR35": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR35", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 71645434, "num_examples": 185076, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/35/asr_javanese_0.zip": {"num_bytes": 1197540348, "checksum": "a871c8b71ff8fa9d95955447ca0c388e8c6f925aecfce92e1880bda2da113bcb"}, "https://openslr.org/resources/35/asr_javanese_1.zip": {"num_bytes": 1172552957, "checksum": "8024b18acc265bd502f2c36930ab41bd9a8a9cbc67d3db340698df1f6799eeef"}, "https://openslr.org/resources/35/asr_javanese_2.zip": {"num_bytes": 1187099390, "checksum": "c1605da9f74b0951533bcd9bb66a868dc4552929a6e3597d1f6b66c8436cd87e"}, "https://openslr.org/resources/35/asr_javanese_3.zip": {"num_bytes": 1178721705, "checksum": "f813cfa6ea5db1a2c7af65d62dd4d2edc932e67990570f0e5418675c0c9443d3"}, 
"https://openslr.org/resources/35/asr_javanese_4.zip": {"num_bytes": 1174850803, "checksum": "506af733d9c1f02372e83e997c924fac5a8141a7920d1ab345bd607e26438f0c"}, "https://openslr.org/resources/35/asr_javanese_5.zip": {"num_bytes": 1178642105, "checksum": "5300df2d2fd95033632fe7d3d77042804c92bf4f9983f11e707c20e358e45a91"}, "https://openslr.org/resources/35/asr_javanese_6.zip": {"num_bytes": 1197026293, "checksum": "a487e12f9d3fd1d3e6d8a8c2b58363813d6121e6a84937ec0d27601fea2654db"}, "https://openslr.org/resources/35/asr_javanese_7.zip": {"num_bytes": 1197789186, "checksum": "944ce7e3463f2e0d6024f8a1768e161a64dd4ab7cf8a96b7924fb8666ae2142e"}, "https://openslr.org/resources/35/asr_javanese_8.zip": {"num_bytes": 1185807385, "checksum": "cb598b81bd681dc51965c912bf4aabc4af6eb9b57d5a7cb0998ed121cec63dcd"}, "https://openslr.org/resources/35/asr_javanese_9.zip": {"num_bytes": 1160028499, "checksum": "7ee9de72360a59dc2a3cd3570627565a638d7a47f0f95ce4c14545bc9b6690b2"}, "https://openslr.org/resources/35/asr_javanese_a.zip": {"num_bytes": 1176016135, "checksum": "1fd1e4b06ed5d18614ef7ce414e7e0b6c105d6f5d87b3a6210fcedc4cc6f35cd"}, "https://openslr.org/resources/35/asr_javanese_b.zip": {"num_bytes": 1176960512, "checksum": "036bb70c60e8ba4b9be090dcd717e1da8744dd1cfdfab1eb4a4cd29d7755b938"}, "https://openslr.org/resources/35/asr_javanese_c.zip": {"num_bytes": 1178017086, "checksum": "a46d7b1ad184a4c2ac9099c8399f18fb8b14dd9ab4172a61f8abe3e464f7b2b9"}, "https://openslr.org/resources/35/asr_javanese_d.zip": {"num_bytes": 1199910382, "checksum": "9f3058916fe721f92a4d1a6c2794d82920b7c88ed780ef06fe69f8e448d0ddb6"}, "https://openslr.org/resources/35/asr_javanese_e.zip": {"num_bytes": 1175431904, "checksum": "d9234d3331fb11c082bc17f3b54c13dfa183c4cb13e35c030f7a1dbbe4c819cd"}, "https://openslr.org/resources/35/asr_javanese_f.zip": {"num_bytes": 1163711036, "checksum": "1bedbc295e4d1592e5730da8f0774fe360fe146d193b9c9815a8025072dd0b70"}}, "download_size": 18900105726, "post_processing_size": 
null, "dataset_size": 71645434, "size_in_bytes": 18971751160}, "SLR36": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR36", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 86668853, "num_examples": 219156, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/36/asr_sundanese_0.zip": {"num_bytes": 1433294860, "checksum": "947a0ac86008b88130f7c8f1b27d4a0f93886f653cf65b5948c0532cd0097c0d"}, "https://openslr.org/resources/36/asr_sundanese_1.zip": {"num_bytes": 1445470477, "checksum": "365f052dd9d977343002289ea1f29dea466f1243e5edf22dfb933e3fa93a6d87"}, "https://openslr.org/resources/36/asr_sundanese_2.zip": {"num_bytes": 1431289018, "checksum": "f9b9ee2a925d4fd934be3ebe09545ffb3f294f1e6d1380e837054fdf4ce8cff2"}, "https://openslr.org/resources/36/asr_sundanese_3.zip": {"num_bytes": 1446805642, "checksum": "ba3cc0e8e351a5456269c72edf7a3b50cf820941f93d7eed0e8f02a3b1b0a89f"}, 
"https://openslr.org/resources/36/asr_sundanese_4.zip": {"num_bytes": 1449187658, "checksum": "a6ca66e2537bd55dfaea4e716d847c70aead58c217184ab37afbd4065cca9262"}, "https://openslr.org/resources/36/asr_sundanese_5.zip": {"num_bytes": 1425741894, "checksum": "31bb8a9981b45855ab0b7c634c89040fe99b122455750a6ab956393dc9dec0d8"}, "https://openslr.org/resources/36/asr_sundanese_6.zip": {"num_bytes": 1415730042, "checksum": "3f23d6c4c67dc6f39a8ebb2af43e2efedb57028abb85eb519394f2d9ef8b3a21"}, "https://openslr.org/resources/36/asr_sundanese_7.zip": {"num_bytes": 1436967650, "checksum": "bce8f33b6ed62978915dfc601957162e9eece8bc3190cd2d548d7679409a3d77"}, "https://openslr.org/resources/36/asr_sundanese_8.zip": {"num_bytes": 1436421462, "checksum": "755e0af77d0bd6d4aa7895b2ab9fbf792c57efc49c8cec21d3d728fe3374b621"}, "https://openslr.org/resources/36/asr_sundanese_9.zip": {"num_bytes": 1434660332, "checksum": "5d426d2c99eb91ffd3db193d510e288133c426556430fe2e70e08f58815f5a31"}, "https://openslr.org/resources/36/asr_sundanese_a.zip": {"num_bytes": 1436753516, "checksum": "e032537b62aa8a8abe660bca418ac2e26a93bdc7a357b948a301bde286952fa5"}, "https://openslr.org/resources/36/asr_sundanese_b.zip": {"num_bytes": 1435014221, "checksum": "e999e83fde37ec973b1a1822aaa8769488c2a95058a3448661ac94c319881549"}, "https://openslr.org/resources/36/asr_sundanese_c.zip": {"num_bytes": 1429102490, "checksum": "275ac684fe7b8bf012dc251ddb91496e2d95c2c257ec87ab0847efa379e96787"}, "https://openslr.org/resources/36/asr_sundanese_d.zip": {"num_bytes": 1432973082, "checksum": "34ae64f8a29ddef2e05ca5ce8122b461a737d58d796dbe577a4e8a4a05c6b2ce"}, "https://openslr.org/resources/36/asr_sundanese_e.zip": {"num_bytes": 1443609656, "checksum": "25e36087063e0cc5e54cf04e5a4e065b19e0c1bc9cbc07a9f98635941b53bfea"}, "https://openslr.org/resources/36/asr_sundanese_f.zip": {"num_bytes": 1463531929, "checksum": "3d1410c31cc70994f82b9555967fa4c8d682aee288cc85b05b9c4e6352a49f14"}}, "download_size": 22996553929, 
"post_processing_size": null, "dataset_size": 86668853, "size_in_bytes": 23083222782}, "SLR70": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR70", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1304770, "num_examples": 3359, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/70/en_ng_female.zip": {"num_bytes": 759856787, "checksum": "e840afea824c9075db8c7d574e993837c6a4861fd0ff0275c4cc223aa00a785c"}, "https://openslr.org/resources/70/en_ng_male.zip": {"num_bytes": 454098409, "checksum": "f619d09d5ffdf0d4044ef1d57585eeaa50c0cbf08844782a9dd08f56ea9e567f"}}, "download_size": 1213955196, "post_processing_size": null, "dataset_size": 1304770, "size_in_bytes": 1215259966}, "SLR71": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR71", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1630901, "num_examples": 4374, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/71/es_cl_female.zip": {"num_bytes": 585615697, "checksum": "23593f3dac085d26f99df38159c1ab0ae2c23f5c97ad869292496abc6e171bc6"}, "https://openslr.org/resources/71/es_cl_male.zip": {"num_bytes": 859750206, "checksum": "ace2cbd6df28e94fdd636ba1263b72b557722b0d2abcf4c6e072011ac870cbee"}}, "download_size": 1445365903, "post_processing_size": null, "dataset_size": 1630901, "size_in_bytes": 1446996804}, "SLR72": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR72", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1825435, "num_examples": 4903, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/72/es_co_female.zip": {"num_bytes": 801960444, "checksum": "03721aa7b6b7fe1dd309a0c545cbef4898fac99ed811f4e1769b2fc16bb7eb70"}, "https://openslr.org/resources/72/es_co_male.zip": {"num_bytes": 810070088, "checksum": "2e72abf283adf3f52c28d9f4d59709d4a24fa57243dc696a99dfbc1b8e534c9a"}}, "download_size": 1612030532, "post_processing_size": null, "dataset_size": 1825435, "size_in_bytes": 1613855967}, "SLR73": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR73", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2027542, "num_examples": 5447, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/73/es_pe_female.zip": {"num_bytes": 913983951, "checksum": "0bcb138a6a4657fa52ec6ec129807dc2476d9a89184ea2ab4f588bbbddc12062"}, "https://openslr.org/resources/73/es_pe_male.zip": {"num_bytes": 1026322863, "checksum": "8baf41802bc59f7d170ee091d8676db725903efdcfeda12d699a31a746ae50bf"}}, "download_size": 1940306814, "post_processing_size": null, "dataset_size": 2027542, "size_in_bytes": 1942334356}, "SLR74": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR74", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 230997, "num_examples": 617, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/74/es_pr_female.zip": {"num_bytes": 214181314, "checksum": "0ff2f4ed63fbbc4305140bb88c71ca9a72b18c6686a755534b47ae28dce2861d"}}, "download_size": 214181314, "post_processing_size": null, "dataset_size": 230997, "size_in_bytes": 214412311}, "SLR75": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR75", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1252119, "num_examples": 3357, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/75/es_ve_female.zip": {"num_bytes": 517000277, "checksum": "4600baead7519afaa5f6b33cf3f4b2373e7f1902aa72841fc38582660b07fe31"}, "https://openslr.org/resources/75/es_ve_male.zip": {"num_bytes": 526316727, "checksum": "3cf8703b1b61de1bf964e26f0a2c7f0ec637b1a85eafd982e98de9301558b289"}}, "download_size": 1043317004, "post_processing_size": null, "dataset_size": 1252119, "size_in_bytes": 1044569123}, "SLR76": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR76", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2682483, "num_examples": 7136, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/76/eu_es_female.zip": {"num_bytes": 1622676657, "checksum": "b3eaa91f2be198c8455f46e802f671e33cba5d95909e58e0b59cb6638f5b4947"}, "https://openslr.org/resources/76/eu_es_male.zip": {"num_bytes": 1418448856, "checksum": "787bcb8369d3797a6b34b0e2d420f5255e12e6c6a385cd4e72ddde59c6018227"}}, "download_size": 3041125513, "post_processing_size": null, "dataset_size": 2682483, "size_in_bytes": 3043807996}, "SLR77": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR77", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2159694, "num_examples": 5587, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/77/gl_es_female.zip": {"num_bytes": 1656677564, "checksum": "e2cda7ef8d5f57b5f3086473d5297e6bb73757f0c446409245f407d7612c5060"}, "https://openslr.org/resources/77/gl_es_male.zip": {"num_bytes": 551314211, "checksum": "b768ed0b77fb4e88adf795dedcc872c53a4348ee8d11eb8efb4571fff94688be"}}, "download_size": 2207991775, "post_processing_size": null, "dataset_size": 2159694, "size_in_bytes": 2210151469}, "SLR78": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR78", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2077670, "num_examples": 4272, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/78/gu_in_female.zip": {"num_bytes": 917450036, "checksum": "bbda0815e0d2e01ad9310768e0e2be9efb612a9c56c66c4ab2f32b817da5c786"}, "https://openslr.org/resources/78/gu_in_male.zip": {"num_bytes": 825772066, "checksum": "ce474d1686104b3bd274a2d5192459cb4dee6e0c9bbcf3de1bb3b39c6ab89caf"}}, "download_size": 1743222102, "post_processing_size": null, "dataset_size": 2077670, "size_in_bytes": 1745299772}, "SLR79": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR79", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2130895, "num_examples": 4400, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/79/kn_in_female.zip": {"num_bytes": 980825420, "checksum": "182a147e5747ad4f4ac50a5e7e1ee3683e1c2c1d9105963365d151d664466b62"}, "https://openslr.org/resources/79/kn_in_male.zip": {"num_bytes": 840093695, "checksum": "38e3c0c51f792a3655cc8f4747b339df8ec4b1031a0fff590c1a1af6a8bbbcdf"}}, "download_size": 1820919115, "post_processing_size": null, "dataset_size": 2130895, "size_in_bytes": 1823050010}, "SLR80": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR80", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1282403, "num_examples": 2530, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/80/my_mm_female.zip": {"num_bytes": 948181015, "checksum": "a7cdcaa5e06864e02fa18fc0fe9595feadf332d6a63aadc01ce51a24969a2708"}}, "download_size": 948181015, "post_processing_size": null, "dataset_size": 1282403, "size_in_bytes": 949463418}, "SLR86": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR86", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 1341639, "num_examples": 3583, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/86/yo_ng_female.zip": {"num_bytes": 462033045, "checksum": "8875ebc839e57a3318ba1ce37d98c35da46d4f99f9f777f83fcf074257804060"}, "https://openslr.org/resources/86/yo_ng_male.zip": {"num_bytes": 445032517, "checksum": "58519b27f6954c446d0e7221b227a6f342b9c5ea66bf02af40c1616e086afc4c"}}, "download_size": 907065562, "post_processing_size": null, "dataset_size": 1341639, "size_in_bytes": 908407201}, "SLR32": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR32", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3958024, "num_examples": 9821, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/32/af_za.tar.gz": {"num_bytes": 950827926, "checksum": "b702a68486bf16cbf302d6e0808ea2e966f3dfa720ea0d6ce36d881aa266978f"}, "https://openslr.org/resources/32/st_za.tar.gz": {"num_bytes": 724425648, "checksum": "509202bcf6fae3b24508cfdbc3a6c886b29b4c3d822adbf6c40b21d98ada3fcf"}, "https://openslr.org/resources/32/tn_za.tar.gz": {"num_bytes": 729406193, "checksum": "3e6a522d2fafa071ec1d484cb79336ff36008a5d5d34e1444984e5df8312eb6f"}, "https://openslr.org/resources/32/xh_za.tar.gz": {"num_bytes": 907498093, "checksum": "712336c82637cbfb4304766dd7c0889bac1664945aed08bafb49eac29ae756c3"}}, "download_size": 3312157860, "post_processing_size": null, "dataset_size": 
3958024, "size_in_bytes": 3316115884}, "SLR52": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR52", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 75447705, "num_examples": 185293, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/52/asr_sinhala_0.zip": {"num_bytes": 915237858, "checksum": "41bcd4cf6edde39e49bf8ca6b54c32e1403609759ff9edea2a2696ef7aa8fff5"}, "https://openslr.org/resources/52/asr_sinhala_1.zip": {"num_bytes": 908852134, "checksum": "7a4dd3279254f06ba8d1e864d2aa68eec1e6740cfc2b718d2bc060b878871e74"}, "https://openslr.org/resources/52/asr_sinhala_2.zip": {"num_bytes": 913568157, "checksum": "746b5ee016e09868016851ff2148000570b6cb6b9acde5d16527f20053d1cd14"}, "https://openslr.org/resources/52/asr_sinhala_3.zip": {"num_bytes": 901325452, "checksum": "a167e6bd9c0b64e105cc57528a455a4653303336b85731273039487d9f94afda"}, "https://openslr.org/resources/52/asr_sinhala_4.zip": 
{"num_bytes": 922493671, "checksum": "f17fc798ea085e876500095e8dd357d1088303598d190642978c353d51d2b94b"}, "https://openslr.org/resources/52/asr_sinhala_5.zip": {"num_bytes": 922505332, "checksum": "8285340d15064caa1da0635d50471c8de24d33e3d1ae7af3c63e4a23d3ba25fe"}, "https://openslr.org/resources/52/asr_sinhala_6.zip": {"num_bytes": 914729823, "checksum": "a511dc329dfc493c9e25d1315ab95da93a8a4b751e032c1848eeeb8655608403"}, "https://openslr.org/resources/52/asr_sinhala_7.zip": {"num_bytes": 911992962, "checksum": "8180736327c3147bac912c329fe3a571a61ecb6d4da7d4584acb0d34ab204fa5"}, "https://openslr.org/resources/52/asr_sinhala_8.zip": {"num_bytes": 924344925, "checksum": "fdf333751c254f8dc7b649fd1a48cf47ae8e855e369a182d88bee3325ae8a99d"}, "https://openslr.org/resources/52/asr_sinhala_9.zip": {"num_bytes": 920427318, "checksum": "288f4a7ea055b3963ad7d6a6e6e6189672715a42d0a1b6e99a1a8ba0fe67a9c6"}, "https://openslr.org/resources/52/asr_sinhala_a.zip": {"num_bytes": 901532849, "checksum": "da36de6739ce5b8c835c3c232d5122b883a88442ec3f91a534154b2a9177d0ec"}, "https://openslr.org/resources/52/asr_sinhala_b.zip": {"num_bytes": 924132571, "checksum": "4b5dd26de34b27e9cc88842e992626694fd329f23493f40c748d556c61395d2a"}, "https://openslr.org/resources/52/asr_sinhala_c.zip": {"num_bytes": 938991415, "checksum": "f6db1cece623fafe866a56b9f7100976823b32f968036b72a9a634138e87e92d"}, "https://openslr.org/resources/52/asr_sinhala_d.zip": {"num_bytes": 911368918, "checksum": "8ecc58c745998b05b21c8af05fdc741d437a654a8babba16c4970ad981074e2c"}, "https://openslr.org/resources/52/asr_sinhala_e.zip": {"num_bytes": 927771260, "checksum": "f5cbfd3c8d1c5bf6fe7a1c1ee606101368512a852856fb2d01f4dde7869f605a"}, "https://openslr.org/resources/52/asr_sinhala_f.zip": {"num_bytes": 917209429, "checksum": "65782dee2ba4256bab123835ef2277a3fd1116f20f403a2c4ff5ace3ac45714c"}}, "download_size": 14676484074, "post_processing_size": null, "dataset_size": 75447705, "size_in_bytes": 14751931779}, "SLR53": 
{"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR53", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 85804462, "num_examples": 218703, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/53/asr_bengali_0.zip": {"num_bytes": 919838172, "checksum": "c1bbeadbcffae8a40d8e54f25c6c3dea922951a322cc7875a18f52dec127741a"}, "https://openslr.org/resources/53/asr_bengali_1.zip": {"num_bytes": 906161405, "checksum": "b6af5d30439d25a5df20efd85bfa2e900ee962e3afb91fe88a65cdbb0689cf84"}, "https://openslr.org/resources/53/asr_bengali_2.zip": {"num_bytes": 921562897, "checksum": "ac0b50d5ad38d5295c16b7eb62901b273bd6df55dea7b1a8495e69c1a50c0986"}, "https://openslr.org/resources/53/asr_bengali_3.zip": {"num_bytes": 918817316, "checksum": "444760953dc4e006cd6e38ea647b611c7be93a07a78b1a6b83974fe3ebba6b65"}, "https://openslr.org/resources/53/asr_bengali_4.zip": 
{"num_bytes": 908199672, "checksum": "975a1b690ccfe0609ba50738666758ad92c3683416d1cf7771972496adb4313f"}, "https://openslr.org/resources/53/asr_bengali_5.zip": {"num_bytes": 932042725, "checksum": "21dec790c4f96771a28347ed4430c74d3f3bff046684f4522c2301f7029f632d"}, "https://openslr.org/resources/53/asr_bengali_6.zip": {"num_bytes": 900826997, "checksum": "b0f93fb831bb36c75a6f4c0731bfb991f8b6529bc3b16ee0bede3e7108a7679e"}, "https://openslr.org/resources/53/asr_bengali_7.zip": {"num_bytes": 927750265, "checksum": "647cbcfb9c92930f4625dbc107f4218cdd37f8e3494df23d42917640da22938c"}, "https://openslr.org/resources/53/asr_bengali_8.zip": {"num_bytes": 927268934, "checksum": "73168b982a0665fb4f1104eaafeb3ddc01780b39978649e01ce6ab7850a86de1"}, "https://openslr.org/resources/53/asr_bengali_9.zip": {"num_bytes": 906382286, "checksum": "25f678604ffe93fc986cc402dc4a4329f36eb44ab627c645c4957dbf8e85917c"}, "https://openslr.org/resources/53/asr_bengali_a.zip": {"num_bytes": 900283300, "checksum": "daf0fc69dbd041fd254e96df1732359666ace7c9aea9d5c64c03ab8add3a00c4"}, "https://openslr.org/resources/53/asr_bengali_b.zip": {"num_bytes": 910050386, "checksum": "2d6fc0f464130bc3761546ac0e8b085921d5f1c9afbf886b9c1fa95f9755fd26"}, "https://openslr.org/resources/53/asr_bengali_c.zip": {"num_bytes": 897120616, "checksum": "116e8e63882f548410a3b835d2d3b6a11e6a05969374d173b9c01a8ba7112abd"}, "https://openslr.org/resources/53/asr_bengali_d.zip": {"num_bytes": 914366610, "checksum": "aa155d8e0688d032229ad7a5e4c713e696d1ea531feae83ae3230e526f1db7a6"}, "https://openslr.org/resources/53/asr_bengali_e.zip": {"num_bytes": 922936447, "checksum": "2f6f97591adde2b469f29b601ba33bfc3e8049681594fe31be8a55204c70ae15"}, "https://openslr.org/resources/53/asr_bengali_f.zip": {"num_bytes": 917202893, "checksum": "42542ec7d434bd6a34b30c01fa24de206fb2d2e56afea745a14867a8c0eaa32c"}}, "download_size": 14630810921, "post_processing_size": null, "dataset_size": 85804462, "size_in_bytes": 14716615383}, "SLR54": 
{"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR54", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 61097744, "num_examples": 157905, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/54/asr_nepali_0.zip": {"num_bytes": 589002210, "checksum": "6c783a5a731c7a9c2cac678823a2ee7866db1acbad7f9a199bce3bf7a64e22b6"}, "https://openslr.org/resources/54/asr_nepali_1.zip": {"num_bytes": 582088242, "checksum": "661865704f3d9adacd74f8c98cd0f6a6e869902c6441efb96c761573dd1d2f05"}, "https://openslr.org/resources/54/asr_nepali_2.zip": {"num_bytes": 589401540, "checksum": "a2b4c373d7ebe5f2d491bf73c2324e6f5645d724df58fd765c71a3a86e7ab6d4"}, "https://openslr.org/resources/54/asr_nepali_3.zip": {"num_bytes": 574596426, "checksum": "6a925d4448f98694185d50cacfa380fad128b47ebf9d5519526b83dd6586348d"}, "https://openslr.org/resources/54/asr_nepali_4.zip": 
{"num_bytes": 583746586, "checksum": "7315f69b392690c22db32b3c2f14b82b1f64215c5a21d697c421d6d220a55bf0"}, "https://openslr.org/resources/54/asr_nepali_5.zip": {"num_bytes": 572967016, "checksum": "3891b332a9fc55e4fb0579bf67431989e92ab05b9715c0e9673cf356e878e0df"}, "https://openslr.org/resources/54/asr_nepali_6.zip": {"num_bytes": 588104006, "checksum": "78c321a8f55a5aa0c56feb791826a2751087cc87a36b27bba56ac6b124eac73f"}, "https://openslr.org/resources/54/asr_nepali_7.zip": {"num_bytes": 588410232, "checksum": "8b05b8b4aedfc9829cf33cd65ab3c1474eb8f738078b414d40b61f08782064ec"}, "https://openslr.org/resources/54/asr_nepali_8.zip": {"num_bytes": 585192213, "checksum": "0125cfc7c54e44bd4ac01d5558130a752cad26aa7055df753c65b400ece2c9f8"}, "https://openslr.org/resources/54/asr_nepali_9.zip": {"num_bytes": 578834881, "checksum": "6c68e80fe7c58a33aeb91b5b9bc37a99f9374a8f629e2a109bddba51d1712b12"}, "https://openslr.org/resources/54/asr_nepali_a.zip": {"num_bytes": 587798317, "checksum": "03b7bf7b6ace01a677e2a0dd079053ea29abf45743f197761190f3f52678e6df"}, "https://openslr.org/resources/54/asr_nepali_b.zip": {"num_bytes": 584397714, "checksum": "9a98d93ae91e75c6928d9222b387105e99030b8b81df9ada57c87f6b317c0853"}, "https://openslr.org/resources/54/asr_nepali_c.zip": {"num_bytes": 579440365, "checksum": "8bac1a046a86fc3684bfec2e5af1b1e0916ec5c2f1be5ccb1fb4778ecd7bb357"}, "https://openslr.org/resources/54/asr_nepali_d.zip": {"num_bytes": 588470094, "checksum": "9aad327fd72efcc009d060a8299aa70ca1757f1ec32fe3280d53e449ef75e5c3"}, "https://openslr.org/resources/54/asr_nepali_e.zip": {"num_bytes": 578091869, "checksum": "4ba73ada7cf482611b3ad3e17a77685b1ac872e5840953c07a1c6c2b10a83e4a"}, "https://openslr.org/resources/54/asr_nepali_f.zip": {"num_bytes": 577705651, "checksum": "062f4908802ab0d57362da1dfea4898898f6d21ba09596c1e271c2cda47297c6"}}, "download_size": 9328247362, "post_processing_size": null, "dataset_size": 61097744, "size_in_bytes": 9389345106}, "SLR83": {"description": 
"OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR83", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7098985, "num_examples": 17877, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/83/irish_english_male.zip": {"num_bytes": 164531638, "checksum": "2e5dbae4cc27e0e24e21f21c8e7464d219feb703f5fee3e567de6561a05024ed"}, "https://openslr.org/resources/83/midlands_english_female.zip": {"num_bytes": 103085118, "checksum": "aa1083a319e52d658b85c162905ec27cdf2ac6d5645b4caeab05a385a2c8a37f"}, "https://openslr.org/resources/83/midlands_english_male.zip": {"num_bytes": 166833961, "checksum": "8192c7a0626eb742f9999e63162289f8f9a86c9cb49ef68298dc7f624acaebcf"}, "https://openslr.org/resources/83/northern_english_female.zip": {"num_bytes": 314983063, "checksum": "22b6229d08481e7605b028185dc55dccd0611db428854f2d485d9ff34395a65c"}, 
"https://openslr.org/resources/83/northern_english_male.zip": {"num_bytes": 817772034, "checksum": "b627d500d1b2e3c4921fb6d91338ead7b972f67c1c2f0babb300e0ef844c7248"}, "https://openslr.org/resources/83/scottish_english_female.zip": {"num_bytes": 351443880, "checksum": "2dbe5545a7ab87112c7730086586f738ec4f42171f7738628ba084ed4ba15ccb"}, "https://openslr.org/resources/83/scottish_english_male.zip": {"num_bytes": 620254118, "checksum": "c7d2d9cd581c48a8323f6cc3886d879e2e7aca5931d98228e07d07b350d9f9a9"}, "https://openslr.org/resources/83/southern_english_female.zip": {"num_bytes": 1636701939, "checksum": "e0a2e8e64b9efdbd7bae5cdf33ac8b81db495b499c9d40da0a7d7842e42b1e76"}, "https://openslr.org/resources/83/southern_english_male.zip": {"num_bytes": 1700955740, "checksum": "788b1c59fb5713b0e1efebc02b7aa1b55182b21955493b299b9941c70a878cad"}, "https://openslr.org/resources/83/welsh_english_female.zip": {"num_bytes": 595683538, "checksum": "3c2465b9618e33f42c7d2ee753b54ae593714e758e236efcdd56c14c5bd89f1d"}, "https://openslr.org/resources/83/welsh_english_male.zip": {"num_bytes": 757645790, "checksum": "eaf8de0f8872bb647d5c159bb33713cfd58966bd59d733f5f399793778ea5058"}}, "download_size": 7229890819, "post_processing_size": null, "dataset_size": 7098985, "size_in_bytes": 7236989804}}
\ No newline at end of file
+{"SLR41": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR41", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2423902, "num_examples": 5822, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/41/jv_id_female.zip": {"num_bytes": 967179448, "checksum": "6fd795a441b3ddd62d6131d4bbd9231151af89f5d9ce5ac7d8ecb370a49576c7"}, "https://openslr.org/resources/41/jv_id_male.zip": {"num_bytes": 923612912, "checksum": "6ee23916b7489420a538e7032f58d7be088a615fb67ec3e7043414d436bb5c1a"}}, "download_size": 1890792360, "post_processing_size": null, "dataset_size": 2423902, "size_in_bytes": 1893216262}, "SLR42": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR42", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1427984, "num_examples": 2906, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/42/km_kh_male.zip": {"num_bytes": 866086951, "checksum": "c0ec9c0494c57f04cf1f2d8d2668d517598375f24e34de07272ecd637c332591"}}, "download_size": 866086951, "post_processing_size": null, "dataset_size": 1427984, "size_in_bytes": 867514935}, "SLR43": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR43", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1074005, "num_examples": 2064, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/43/ne_np_female.zip": {"num_bytes": 800375645, "checksum": "3f355b543e1fad7af5e63116db871fac8e0a2d2f1a2c8f6ebc742270819da101"}}, "download_size": 800375645, "post_processing_size": null, "dataset_size": 1074005, "size_in_bytes": 801449650}, "SLR44": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR44", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1776827, "num_examples": 4213, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/44/su_id_female.zip": {"num_bytes": 861425671, "checksum": "aa75bdef23b7bf0b980431d68df6bb32f695f3be365eb379d4c22516d2d11c5a"}, "https://openslr.org/resources/44/su_id_male.zip": {"num_bytes": 610827081, "checksum": "cabed03a45d4ce0f76e2de4d34b82d6876cd00d5ad6a5349629359028460652d"}}, "download_size": 1472252752, "post_processing_size": null, "dataset_size": 1776827, "size_in_bytes": 1474029579}, "SLR63": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR63", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2016587, "num_examples": 4126, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/63/ml_in_female.zip": {"num_bytes": 710218411, "checksum": "e82d70717d20304f20f635d248c8cb1fd0c9c888e35b9105c8306fc76498a67e"}, "https://openslr.org/resources/63/ml_in_male.zip": {"num_bytes": 635657888, "checksum": "d1a6de4f58f53b973596ff1c69a64afea70f899b044397ce37465c626eee2ab9"}}, "download_size": 1345876299, "post_processing_size": null, "dataset_size": 2016587, "size_in_bytes": 1347892886}, "SLR64": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR64", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 810375, "num_examples": 1569, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/64/mr_in_female.zip": {"num_bytes": 712155683, "checksum": "42b770ee87c95379b55e187b17dccb9fbacb05d0e8292430ffe16a7483948fe5"}}, "download_size": 712155683, "post_processing_size": null, "dataset_size": 810375, "size_in_bytes": 712966058}, "SLR65": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR65", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2136447, "num_examples": 4284, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/65/ta_in_female.zip": {"num_bytes": 769504014, "checksum": "fe00da10ae12ecd6dbe1afcc5abe365d44ad9036fb017cbd73bcfed71e0f8c81"}, "https://openslr.org/resources/65/ta_in_male.zip": {"num_bytes": 603800641, "checksum": "80e546e954939c92a0cd732446418b583b61da9f538f83b00cbd445cbebd4395"}}, "download_size": 1373304655, "post_processing_size": null, "dataset_size": 2136447, "size_in_bytes": 1375441102}, "SLR66": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR66", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1898335, "num_examples": 4448, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/66/te_in_female.zip": {"num_bytes": 505680804, "checksum": "3aa3c22d6fad33ed68951f4934ae47349ee76b77220d8261ec3bda8c24bf42b2"}, "https://openslr.org/resources/66/te_in_male.zip": {"num_bytes": 529447066, "checksum": "f8a0f239d39088b6702a2186681e2874328e9fcd9bfa6a0dd9e1dc5695be3185"}}, "download_size": 1035127870, "post_processing_size": null, "dataset_size": 1898335, "size_in_bytes": 1037026205}, "SLR69": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR69", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1647263, "num_examples": 4240, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/69/ca_es_female.zip": {"num_bytes": 1043934596, "checksum": "2ec39de70550a1cdb93aee960967125fb652b8d26b8de4f6e8658c62847c3f11"}, "https://openslr.org/resources/69/ca_es_male.zip": {"num_bytes": 804724947, "checksum": "8b412ffaa65cd85692c6eab038fc085a8ae5613c6eed38c097a65946c2ee9146"}}, "download_size": 1848659543, "post_processing_size": null, "dataset_size": 1647263, "size_in_bytes": 1850306806}, "SLR35": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR35", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 73565374, "num_examples": 185076, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/35/asr_javanese_0.zip": {"num_bytes": 1197540348, "checksum": "a871c8b71ff8fa9d95955447ca0c388e8c6f925aecfce92e1880bda2da113bcb"}, "https://openslr.org/resources/35/asr_javanese_1.zip": {"num_bytes": 1172552957, "checksum": "8024b18acc265bd502f2c36930ab41bd9a8a9cbc67d3db340698df1f6799eeef"}, "https://openslr.org/resources/35/asr_javanese_2.zip": {"num_bytes": 1187099390, "checksum": "c1605da9f74b0951533bcd9bb66a868dc4552929a6e3597d1f6b66c8436cd87e"}, "https://openslr.org/resources/35/asr_javanese_3.zip": {"num_bytes": 1178721705, "checksum": "f813cfa6ea5db1a2c7af65d62dd4d2edc932e67990570f0e5418675c0c9443d3"}, "https://openslr.org/resources/35/asr_javanese_4.zip": {"num_bytes": 1174850803, "checksum": "506af733d9c1f02372e83e997c924fac5a8141a7920d1ab345bd607e26438f0c"}, "https://openslr.org/resources/35/asr_javanese_5.zip": {"num_bytes": 1178642105, "checksum": "5300df2d2fd95033632fe7d3d77042804c92bf4f9983f11e707c20e358e45a91"}, "https://openslr.org/resources/35/asr_javanese_6.zip": {"num_bytes": 1197026293, "checksum": "a487e12f9d3fd1d3e6d8a8c2b58363813d6121e6a84937ec0d27601fea2654db"}, "https://openslr.org/resources/35/asr_javanese_7.zip": {"num_bytes": 1197789186, "checksum": 
"944ce7e3463f2e0d6024f8a1768e161a64dd4ab7cf8a96b7924fb8666ae2142e"}, "https://openslr.org/resources/35/asr_javanese_8.zip": {"num_bytes": 1185807385, "checksum": "cb598b81bd681dc51965c912bf4aabc4af6eb9b57d5a7cb0998ed121cec63dcd"}, "https://openslr.org/resources/35/asr_javanese_9.zip": {"num_bytes": 1160028499, "checksum": "7ee9de72360a59dc2a3cd3570627565a638d7a47f0f95ce4c14545bc9b6690b2"}, "https://openslr.org/resources/35/asr_javanese_a.zip": {"num_bytes": 1176016135, "checksum": "1fd1e4b06ed5d18614ef7ce414e7e0b6c105d6f5d87b3a6210fcedc4cc6f35cd"}, "https://openslr.org/resources/35/asr_javanese_b.zip": {"num_bytes": 1176960512, "checksum": "036bb70c60e8ba4b9be090dcd717e1da8744dd1cfdfab1eb4a4cd29d7755b938"}, "https://openslr.org/resources/35/asr_javanese_c.zip": {"num_bytes": 1178017086, "checksum": "a46d7b1ad184a4c2ac9099c8399f18fb8b14dd9ab4172a61f8abe3e464f7b2b9"}, "https://openslr.org/resources/35/asr_javanese_d.zip": {"num_bytes": 1199910382, "checksum": "9f3058916fe721f92a4d1a6c2794d82920b7c88ed780ef06fe69f8e448d0ddb6"}, "https://openslr.org/resources/35/asr_javanese_e.zip": {"num_bytes": 1175431904, "checksum": "d9234d3331fb11c082bc17f3b54c13dfa183c4cb13e35c030f7a1dbbe4c819cd"}, "https://openslr.org/resources/35/asr_javanese_f.zip": {"num_bytes": 1163711036, "checksum": "1bedbc295e4d1592e5730da8f0774fe360fe146d193b9c9815a8025072dd0b70"}}, "download_size": 18900105726, "post_processing_size": null, "dataset_size": 73565374, "size_in_bytes": 18973671100}, "SLR36": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR36", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 88942337, "num_examples": 219156, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/36/asr_sundanese_0.zip": {"num_bytes": 1433294860, "checksum": "947a0ac86008b88130f7c8f1b27d4a0f93886f653cf65b5948c0532cd0097c0d"}, "https://openslr.org/resources/36/asr_sundanese_1.zip": {"num_bytes": 1445470477, "checksum": "365f052dd9d977343002289ea1f29dea466f1243e5edf22dfb933e3fa93a6d87"}, "https://openslr.org/resources/36/asr_sundanese_2.zip": {"num_bytes": 1431289018, "checksum": "f9b9ee2a925d4fd934be3ebe09545ffb3f294f1e6d1380e837054fdf4ce8cff2"}, "https://openslr.org/resources/36/asr_sundanese_3.zip": {"num_bytes": 1446805642, "checksum": "ba3cc0e8e351a5456269c72edf7a3b50cf820941f93d7eed0e8f02a3b1b0a89f"}, "https://openslr.org/resources/36/asr_sundanese_4.zip": {"num_bytes": 1449187658, "checksum": "a6ca66e2537bd55dfaea4e716d847c70aead58c217184ab37afbd4065cca9262"}, "https://openslr.org/resources/36/asr_sundanese_5.zip": {"num_bytes": 1425741894, "checksum": "31bb8a9981b45855ab0b7c634c89040fe99b122455750a6ab956393dc9dec0d8"}, "https://openslr.org/resources/36/asr_sundanese_6.zip": {"num_bytes": 1415730042, "checksum": "3f23d6c4c67dc6f39a8ebb2af43e2efedb57028abb85eb519394f2d9ef8b3a21"}, "https://openslr.org/resources/36/asr_sundanese_7.zip": {"num_bytes": 1436967650, "checksum": 
"bce8f33b6ed62978915dfc601957162e9eece8bc3190cd2d548d7679409a3d77"}, "https://openslr.org/resources/36/asr_sundanese_8.zip": {"num_bytes": 1436421462, "checksum": "755e0af77d0bd6d4aa7895b2ab9fbf792c57efc49c8cec21d3d728fe3374b621"}, "https://openslr.org/resources/36/asr_sundanese_9.zip": {"num_bytes": 1434660332, "checksum": "5d426d2c99eb91ffd3db193d510e288133c426556430fe2e70e08f58815f5a31"}, "https://openslr.org/resources/36/asr_sundanese_a.zip": {"num_bytes": 1436753516, "checksum": "e032537b62aa8a8abe660bca418ac2e26a93bdc7a357b948a301bde286952fa5"}, "https://openslr.org/resources/36/asr_sundanese_b.zip": {"num_bytes": 1435014221, "checksum": "e999e83fde37ec973b1a1822aaa8769488c2a95058a3448661ac94c319881549"}, "https://openslr.org/resources/36/asr_sundanese_c.zip": {"num_bytes": 1429102490, "checksum": "275ac684fe7b8bf012dc251ddb91496e2d95c2c257ec87ab0847efa379e96787"}, "https://openslr.org/resources/36/asr_sundanese_d.zip": {"num_bytes": 1432973082, "checksum": "34ae64f8a29ddef2e05ca5ce8122b461a737d58d796dbe577a4e8a4a05c6b2ce"}, "https://openslr.org/resources/36/asr_sundanese_e.zip": {"num_bytes": 1443609656, "checksum": "25e36087063e0cc5e54cf04e5a4e065b19e0c1bc9cbc07a9f98635941b53bfea"}, "https://openslr.org/resources/36/asr_sundanese_f.zip": {"num_bytes": 1463531929, "checksum": "3d1410c31cc70994f82b9555967fa4c8d682aee288cc85b05b9c4e6352a49f14"}}, "download_size": 22996553929, "post_processing_size": null, "dataset_size": 88942337, "size_in_bytes": 23085496266}, "SLR70": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR70", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1339608, "num_examples": 3359, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/70/en_ng_female.zip": {"num_bytes": 759856787, "checksum": "e840afea824c9075db8c7d574e993837c6a4861fd0ff0275c4cc223aa00a785c"}, "https://openslr.org/resources/70/en_ng_male.zip": {"num_bytes": 454098409, "checksum": "f619d09d5ffdf0d4044ef1d57585eeaa50c0cbf08844782a9dd08f56ea9e567f"}}, "download_size": 1213955196, "post_processing_size": null, "dataset_size": 1339608, "size_in_bytes": 1215294804}, "SLR71": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR71", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1676273, "num_examples": 4374, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/71/es_cl_female.zip": {"num_bytes": 585615697, "checksum": "23593f3dac085d26f99df38159c1ab0ae2c23f5c97ad869292496abc6e171bc6"}, "https://openslr.org/resources/71/es_cl_male.zip": {"num_bytes": 859750206, "checksum": "ace2cbd6df28e94fdd636ba1263b72b557722b0d2abcf4c6e072011ac870cbee"}}, "download_size": 1445365903, "post_processing_size": null, "dataset_size": 1676273, "size_in_bytes": 1447042176}, "SLR72": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR72", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1876301, "num_examples": 4903, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/72/es_co_female.zip": {"num_bytes": 801960444, "checksum": "03721aa7b6b7fe1dd309a0c545cbef4898fac99ed811f4e1769b2fc16bb7eb70"}, "https://openslr.org/resources/72/es_co_male.zip": {"num_bytes": 810070088, "checksum": "2e72abf283adf3f52c28d9f4d59709d4a24fa57243dc696a99dfbc1b8e534c9a"}}, "download_size": 1612030532, "post_processing_size": null, "dataset_size": 1876301, "size_in_bytes": 1613906833}, "SLR73": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR73", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2084052, "num_examples": 5447, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/73/es_pe_female.zip": {"num_bytes": 913983951, "checksum": "0bcb138a6a4657fa52ec6ec129807dc2476d9a89184ea2ab4f588bbbddc12062"}, "https://openslr.org/resources/73/es_pe_male.zip": {"num_bytes": 1026322863, "checksum": "8baf41802bc59f7d170ee091d8676db725903efdcfeda12d699a31a746ae50bf"}}, "download_size": 1940306814, "post_processing_size": null, "dataset_size": 2084052, "size_in_bytes": 1942390866}, "SLR74": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR74", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 237395, "num_examples": 617, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/74/es_pr_female.zip": {"num_bytes": 214181314, "checksum": "0ff2f4ed63fbbc4305140bb88c71ca9a72b18c6686a755534b47ae28dce2861d"}}, "download_size": 214181314, "post_processing_size": null, "dataset_size": 237395, "size_in_bytes": 214418709}, "SLR75": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR75", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1286937, "num_examples": 3357, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/75/es_ve_female.zip": {"num_bytes": 517000277, "checksum": "4600baead7519afaa5f6b33cf3f4b2373e7f1902aa72841fc38582660b07fe31"}, "https://openslr.org/resources/75/es_ve_male.zip": {"num_bytes": 526316727, "checksum": "3cf8703b1b61de1bf964e26f0a2c7f0ec637b1a85eafd982e98de9301558b289"}}, "download_size": 1043317004, "post_processing_size": null, "dataset_size": 1286937, "size_in_bytes": 1044603941}, "SLR76": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR76", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2756507, "num_examples": 7136, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/76/eu_es_female.zip": {"num_bytes": 1622676657, "checksum": "b3eaa91f2be198c8455f46e802f671e33cba5d95909e58e0b59cb6638f5b4947"}, "https://openslr.org/resources/76/eu_es_male.zip": {"num_bytes": 1418448856, "checksum": "787bcb8369d3797a6b34b0e2d420f5255e12e6c6a385cd4e72ddde59c6018227"}}, "download_size": 3041125513, "post_processing_size": null, "dataset_size": 2756507, "size_in_bytes": 3043882020}, "SLR77": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR77", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2217652, "num_examples": 5587, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/77/gl_es_female.zip": {"num_bytes": 1656677564, "checksum": "e2cda7ef8d5f57b5f3086473d5297e6bb73757f0c446409245f407d7612c5060"}, "https://openslr.org/resources/77/gl_es_male.zip": {"num_bytes": 551314211, "checksum": "b768ed0b77fb4e88adf795dedcc872c53a4348ee8d11eb8efb4571fff94688be"}}, "download_size": 2207991775, "post_processing_size": null, "dataset_size": 2217652, "size_in_bytes": 2210209427}, "SLR78": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR78", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2121986, "num_examples": 4272, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/78/gu_in_female.zip": {"num_bytes": 917450036, "checksum": "bbda0815e0d2e01ad9310768e0e2be9efb612a9c56c66c4ab2f32b817da5c786"}, "https://openslr.org/resources/78/gu_in_male.zip": {"num_bytes": 825772066, "checksum": "ce474d1686104b3bd274a2d5192459cb4dee6e0c9bbcf3de1bb3b39c6ab89caf"}}, "download_size": 1743222102, "post_processing_size": null, "dataset_size": 2121986, "size_in_bytes": 1745344088}, "SLR79": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR79", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2176539, "num_examples": 4400, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/79/kn_in_female.zip": {"num_bytes": 980825420, "checksum": "182a147e5747ad4f4ac50a5e7e1ee3683e1c2c1d9105963365d151d664466b62"}, "https://openslr.org/resources/79/kn_in_male.zip": {"num_bytes": 840093695, "checksum": "38e3c0c51f792a3655cc8f4747b339df8ec4b1031a0fff590c1a1af6a8bbbcdf"}}, "download_size": 1820919115, "post_processing_size": null, "dataset_size": 2176539, "size_in_bytes": 1823095654}, "SLR80": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR80", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1308651, "num_examples": 2530, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/80/my_mm_female.zip": {"num_bytes": 948181015, "checksum": "a7cdcaa5e06864e02fa18fc0fe9595feadf332d6a63aadc01ce51a24969a2708"}}, "download_size": 948181015, "post_processing_size": null, "dataset_size": 1308651, "size_in_bytes": 949489666}, "SLR86": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR86", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 1378801, "num_examples": 3583, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/86/yo_ng_female.zip": {"num_bytes": 462033045, "checksum": "8875ebc839e57a3318ba1ce37d98c35da46d4f99f9f777f83fcf074257804060"}, "https://openslr.org/resources/86/yo_ng_male.zip": {"num_bytes": 445032517, "checksum": "58519b27f6954c446d0e7221b227a6f342b9c5ea66bf02af40c1616e086afc4c"}}, "download_size": 907065562, "post_processing_size": null, "dataset_size": 1378801, "size_in_bytes": 908444363}, "SLR32": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. 
Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR32", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4544052380, "num_examples": 9821, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/32/af_za.tar.gz": {"num_bytes": 950827926, "checksum": "b702a68486bf16cbf302d6e0808ea2e966f3dfa720ea0d6ce36d881aa266978f"}, "https://openslr.org/resources/32/st_za.tar.gz": {"num_bytes": 724425648, "checksum": "509202bcf6fae3b24508cfdbc3a6c886b29b4c3d822adbf6c40b21d98ada3fcf"}, "https://openslr.org/resources/32/tn_za.tar.gz": {"num_bytes": 729406193, "checksum": "3e6a522d2fafa071ec1d484cb79336ff36008a5d5d34e1444984e5df8312eb6f"}, "https://openslr.org/resources/32/xh_za.tar.gz": {"num_bytes": 907498093, "checksum": "712336c82637cbfb4304766dd7c0889bac1664945aed08bafb49eac29ae756c3"}, "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/af_za/line_index.tsv": {"num_bytes": 218947, "checksum": "c4d096cb50a037ce8c3a41a198615083d93c3bbbd6f1cfdb52c3ebfa5de09340"}, "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/st_za/line_index.tsv": {"num_bytes": 154784, "checksum": "04cd7e8db7eae8ad9044fa8ac79f3e48fd3a64d045cd907ff005fd82f1ca6a82"}, "https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/tn_za/line_index.tsv": {"num_bytes": 174447, "checksum": "c621270b3ee70d515bbce846e1b64135dc4554f62cf3528d9550a1512f5841f1"}, 
"https://s3.amazonaws.com/datasets.huggingface.co/openslr/SLR32/xh_za/line_index.tsv": {"num_bytes": 178725, "checksum": "6a356aac4e698561302574f62be30029536ac057e009633f0af8de68513d874a"}}, "download_size": 3312884763, "post_processing_size": null, "dataset_size": 4544052380, "size_in_bytes": 7856937143}, "SLR52": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and 
Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = {Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = 
{979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR52", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 77369899, "num_examples": 185293, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/52/asr_sinhala_0.zip": {"num_bytes": 915237858, "checksum": "41bcd4cf6edde39e49bf8ca6b54c32e1403609759ff9edea2a2696ef7aa8fff5"}, "https://openslr.org/resources/52/asr_sinhala_1.zip": {"num_bytes": 908852134, "checksum": "7a4dd3279254f06ba8d1e864d2aa68eec1e6740cfc2b718d2bc060b878871e74"}, "https://openslr.org/resources/52/asr_sinhala_2.zip": {"num_bytes": 913568157, "checksum": "746b5ee016e09868016851ff2148000570b6cb6b9acde5d16527f20053d1cd14"}, "https://openslr.org/resources/52/asr_sinhala_3.zip": {"num_bytes": 901325452, "checksum": 
"a167e6bd9c0b64e105cc57528a455a4653303336b85731273039487d9f94afda"}, "https://openslr.org/resources/52/asr_sinhala_4.zip": {"num_bytes": 922493671, "checksum": "f17fc798ea085e876500095e8dd357d1088303598d190642978c353d51d2b94b"}, "https://openslr.org/resources/52/asr_sinhala_5.zip": {"num_bytes": 922505332, "checksum": "8285340d15064caa1da0635d50471c8de24d33e3d1ae7af3c63e4a23d3ba25fe"}, "https://openslr.org/resources/52/asr_sinhala_6.zip": {"num_bytes": 914729823, "checksum": "a511dc329dfc493c9e25d1315ab95da93a8a4b751e032c1848eeeb8655608403"}, "https://openslr.org/resources/52/asr_sinhala_7.zip": {"num_bytes": 911992962, "checksum": "8180736327c3147bac912c329fe3a571a61ecb6d4da7d4584acb0d34ab204fa5"}, "https://openslr.org/resources/52/asr_sinhala_8.zip": {"num_bytes": 924344925, "checksum": "fdf333751c254f8dc7b649fd1a48cf47ae8e855e369a182d88bee3325ae8a99d"}, "https://openslr.org/resources/52/asr_sinhala_9.zip": {"num_bytes": 920427318, "checksum": "288f4a7ea055b3963ad7d6a6e6e6189672715a42d0a1b6e99a1a8ba0fe67a9c6"}, "https://openslr.org/resources/52/asr_sinhala_a.zip": {"num_bytes": 901532849, "checksum": "da36de6739ce5b8c835c3c232d5122b883a88442ec3f91a534154b2a9177d0ec"}, "https://openslr.org/resources/52/asr_sinhala_b.zip": {"num_bytes": 924132571, "checksum": "4b5dd26de34b27e9cc88842e992626694fd329f23493f40c748d556c61395d2a"}, "https://openslr.org/resources/52/asr_sinhala_c.zip": {"num_bytes": 938991415, "checksum": "f6db1cece623fafe866a56b9f7100976823b32f968036b72a9a634138e87e92d"}, "https://openslr.org/resources/52/asr_sinhala_d.zip": {"num_bytes": 911368918, "checksum": "8ecc58c745998b05b21c8af05fdc741d437a654a8babba16c4970ad981074e2c"}, "https://openslr.org/resources/52/asr_sinhala_e.zip": {"num_bytes": 927771260, "checksum": "f5cbfd3c8d1c5bf6fe7a1c1ee606101368512a852856fb2d01f4dde7869f605a"}, "https://openslr.org/resources/52/asr_sinhala_f.zip": {"num_bytes": 917209429, "checksum": "65782dee2ba4256bab123835ef2277a3fd1116f20f403a2c4ff5ace3ac45714c"}}, 
"download_size": 14676484074, "post_processing_size": null, "dataset_size": 77369899, "size_in_bytes": 14753853973}, "SLR53": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR53", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 88073248, "num_examples": 218703, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/53/asr_bengali_0.zip": {"num_bytes": 919838172, "checksum": "c1bbeadbcffae8a40d8e54f25c6c3dea922951a322cc7875a18f52dec127741a"}, "https://openslr.org/resources/53/asr_bengali_1.zip": {"num_bytes": 906161405, "checksum": "b6af5d30439d25a5df20efd85bfa2e900ee962e3afb91fe88a65cdbb0689cf84"}, "https://openslr.org/resources/53/asr_bengali_2.zip": {"num_bytes": 921562897, "checksum": "ac0b50d5ad38d5295c16b7eb62901b273bd6df55dea7b1a8495e69c1a50c0986"}, "https://openslr.org/resources/53/asr_bengali_3.zip": {"num_bytes": 918817316, "checksum": "444760953dc4e006cd6e38ea647b611c7be93a07a78b1a6b83974fe3ebba6b65"}, "https://openslr.org/resources/53/asr_bengali_4.zip": {"num_bytes": 908199672, "checksum": "975a1b690ccfe0609ba50738666758ad92c3683416d1cf7771972496adb4313f"}, "https://openslr.org/resources/53/asr_bengali_5.zip": {"num_bytes": 932042725, "checksum": "21dec790c4f96771a28347ed4430c74d3f3bff046684f4522c2301f7029f632d"}, "https://openslr.org/resources/53/asr_bengali_6.zip": {"num_bytes": 900826997, "checksum": "b0f93fb831bb36c75a6f4c0731bfb991f8b6529bc3b16ee0bede3e7108a7679e"}, "https://openslr.org/resources/53/asr_bengali_7.zip": {"num_bytes": 927750265, "checksum": 
"647cbcfb9c92930f4625dbc107f4218cdd37f8e3494df23d42917640da22938c"}, "https://openslr.org/resources/53/asr_bengali_8.zip": {"num_bytes": 927268934, "checksum": "73168b982a0665fb4f1104eaafeb3ddc01780b39978649e01ce6ab7850a86de1"}, "https://openslr.org/resources/53/asr_bengali_9.zip": {"num_bytes": 906382286, "checksum": "25f678604ffe93fc986cc402dc4a4329f36eb44ab627c645c4957dbf8e85917c"}, "https://openslr.org/resources/53/asr_bengali_a.zip": {"num_bytes": 900283300, "checksum": "daf0fc69dbd041fd254e96df1732359666ace7c9aea9d5c64c03ab8add3a00c4"}, "https://openslr.org/resources/53/asr_bengali_b.zip": {"num_bytes": 910050386, "checksum": "2d6fc0f464130bc3761546ac0e8b085921d5f1c9afbf886b9c1fa95f9755fd26"}, "https://openslr.org/resources/53/asr_bengali_c.zip": {"num_bytes": 897120616, "checksum": "116e8e63882f548410a3b835d2d3b6a11e6a05969374d173b9c01a8ba7112abd"}, "https://openslr.org/resources/53/asr_bengali_d.zip": {"num_bytes": 914366610, "checksum": "aa155d8e0688d032229ad7a5e4c713e696d1ea531feae83ae3230e526f1db7a6"}, "https://openslr.org/resources/53/asr_bengali_e.zip": {"num_bytes": 922936447, "checksum": "2f6f97591adde2b469f29b601ba33bfc3e8049681594fe31be8a55204c70ae15"}, "https://openslr.org/resources/53/asr_bengali_f.zip": {"num_bytes": 917202893, "checksum": "42542ec7d434bd6a34b30c01fa24de206fb2d2e56afea745a14867a8c0eaa32c"}}, "download_size": 14630810921, "post_processing_size": null, "dataset_size": 88073248, "size_in_bytes": 14718884169}, "SLR54": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": 
"https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR54", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 62735822, "num_examples": 157905, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/54/asr_nepali_0.zip": {"num_bytes": 589002210, "checksum": "6c783a5a731c7a9c2cac678823a2ee7866db1acbad7f9a199bce3bf7a64e22b6"}, "https://openslr.org/resources/54/asr_nepali_1.zip": {"num_bytes": 582088242, "checksum": "661865704f3d9adacd74f8c98cd0f6a6e869902c6441efb96c761573dd1d2f05"}, "https://openslr.org/resources/54/asr_nepali_2.zip": {"num_bytes": 589401540, "checksum": "a2b4c373d7ebe5f2d491bf73c2324e6f5645d724df58fd765c71a3a86e7ab6d4"}, "https://openslr.org/resources/54/asr_nepali_3.zip": {"num_bytes": 574596426, "checksum": "6a925d4448f98694185d50cacfa380fad128b47ebf9d5519526b83dd6586348d"}, "https://openslr.org/resources/54/asr_nepali_4.zip": {"num_bytes": 583746586, "checksum": "7315f69b392690c22db32b3c2f14b82b1f64215c5a21d697c421d6d220a55bf0"}, "https://openslr.org/resources/54/asr_nepali_5.zip": {"num_bytes": 572967016, "checksum": "3891b332a9fc55e4fb0579bf67431989e92ab05b9715c0e9673cf356e878e0df"}, "https://openslr.org/resources/54/asr_nepali_6.zip": {"num_bytes": 588104006, "checksum": "78c321a8f55a5aa0c56feb791826a2751087cc87a36b27bba56ac6b124eac73f"}, "https://openslr.org/resources/54/asr_nepali_7.zip": {"num_bytes": 588410232, "checksum": 
"8b05b8b4aedfc9829cf33cd65ab3c1474eb8f738078b414d40b61f08782064ec"}, "https://openslr.org/resources/54/asr_nepali_8.zip": {"num_bytes": 585192213, "checksum": "0125cfc7c54e44bd4ac01d5558130a752cad26aa7055df753c65b400ece2c9f8"}, "https://openslr.org/resources/54/asr_nepali_9.zip": {"num_bytes": 578834881, "checksum": "6c68e80fe7c58a33aeb91b5b9bc37a99f9374a8f629e2a109bddba51d1712b12"}, "https://openslr.org/resources/54/asr_nepali_a.zip": {"num_bytes": 587798317, "checksum": "03b7bf7b6ace01a677e2a0dd079053ea29abf45743f197761190f3f52678e6df"}, "https://openslr.org/resources/54/asr_nepali_b.zip": {"num_bytes": 584397714, "checksum": "9a98d93ae91e75c6928d9222b387105e99030b8b81df9ada57c87f6b317c0853"}, "https://openslr.org/resources/54/asr_nepali_c.zip": {"num_bytes": 579440365, "checksum": "8bac1a046a86fc3684bfec2e5af1b1e0916ec5c2f1be5ccb1fb4778ecd7bb357"}, "https://openslr.org/resources/54/asr_nepali_d.zip": {"num_bytes": 588470094, "checksum": "9aad327fd72efcc009d060a8299aa70ca1757f1ec32fe3280d53e449ef75e5c3"}, "https://openslr.org/resources/54/asr_nepali_e.zip": {"num_bytes": 578091869, "checksum": "4ba73ada7cf482611b3ad3e17a77685b1ac872e5840953c07a1c6c2b10a83e4a"}, "https://openslr.org/resources/54/asr_nepali_f.zip": {"num_bytes": 577705651, "checksum": "062f4908802ab0d57362da1dfea4898898f6d21ba09596c1e271c2cda47297c6"}}, "download_size": 9328247362, "post_processing_size": null, "dataset_size": 62735822, "size_in_bytes": 9390983184}, "SLR83": {"description": "OpenSLR is a site devoted to hosting speech and language resources, such as training corpora for speech recognition,\nand software related to speech recognition. 
We intend to be a convenient place for anyone to put resources that\nthey have created, so that they can be downloaded publicly.\n", "citation": "SLR32:\n@inproceedings{van-niekerk-etal-2017,\n    title = {{Rapid development of TTS corpora for four South African languages}},\n    author = {Daniel van Niekerk and Charl van Heerden and Marelie Davel and Neil Kleynhans and Oddur Kjartansson\n    and Martin Jansche and Linne Ha},\n    booktitle = {Proc. Interspeech 2017},\n    pages = {2178--2182},\n    address = {Stockholm, Sweden},\n    month = aug,\n    year  = {2017},\n    URL   = {http://dx.doi.org/10.21437/Interspeech.2017-1139}\n}\n\nSLR35, SLR36, SLR52, SLR53, SLR54:\n@inproceedings{kjartansson-etal-sltu2018,\n    title = {{Crowd-Sourced Speech Corpora for Javanese, Sundanese,  Sinhala, Nepali, and Bangladeshi Bengali}},\n    author = {Oddur Kjartansson and Supheakmungkol Sarin and Knot Pipatsrisawat and Martin Jansche and Linne Ha},\n    booktitle = {Proc. The 6th Intl. Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {52--55},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-11},\n}\n\nSLR41, SLR42, SLR43, SLR44:\n@inproceedings{kjartansson-etal-tts-sltu2018,\n    title = {{A Step-by-Step Process for Building TTS Voices Using Open Source Data and Framework for Bangla, Javanese,\n    Khmer, Nepali, Sinhala, and Sundanese}},\n    author = {Keshan Sodimana and Knot Pipatsrisawat and Linne Ha and Martin Jansche and Oddur Kjartansson and Pasindu\n    De Silva and Supheakmungkol Sarin},\n    booktitle = {Proc. The 6th Intl. 
Workshop on Spoken Language Technologies for Under-Resourced Languages (SLTU)},\n    year  = {2018},\n    address = {Gurugram, India},\n    month = aug,\n    pages = {66--70},\n    URL   = {https://dx.doi.org/10.21437/SLTU.2018-14}\n}\n\nSLR63, SLR64, SLR65, SLR66, SLR78, SLR79:\n@inproceedings{he-etal-2020-open,\n  title = {{Open-source Multi-speaker Speech Corpora for Building Gujarati, Kannada, Malayalam, Marathi, Tamil and\n  Telugu Speech Synthesis Systems}},\n  author = {He, Fei and Chu, Shan-Hui Cathy and Kjartansson, Oddur and Rivera, Clara and Katanova, Anna and Gutkin,\n  Alexander and Demirsahin, Isin and Johny, Cibu and Jansche, Martin and Sarin, Supheakmungkol and Pipatsrisawat, Knot},\n  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n  month = may,\n  year = {2020},\n  address = {Marseille, France},\n  publisher = {European Language Resources Association (ELRA)},\n  pages = {6494--6503},\n  url = {https://www.aclweb.org/anthology/2020.lrec-1.800},\n  ISBN = \"{979-10-95546-34-4},\n}\n\nSLR69, SLR76, SLR77:\n@inproceedings{kjartansson-etal-2020-open,\n    title = {{Open-Source High Quality Speech Datasets for Basque, Catalan and Galician}},\n    author = {Kjartansson, Oddur and Gutkin, Alexander and Butryna, Alena and Demirsahin, Isin and Rivera, Clara},\n    booktitle = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages\n    (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n    year = {2020},\n    pages = {21--27},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.sltu-1.3},\n    ISBN = {979-10-95546-35-1},\n}\n\nSLR71, SLR71, SLR72, SLR73, SLR74, SLR75:\n@inproceedings{guevara-rukoz-etal-2020-crowdsourcing,\n    title = {{Crowdsourcing Latin American Spanish for Low-Resource Text-to-Speech}},\n    author = 
{Guevara-Rukoz, Adriana and Demirsahin, Isin and He, Fei and Chu, Shan-Hui Cathy and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Gutkin, Alexander and Butryna, Alena and Kjartansson, Oddur},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    year = {2020},\n    month = may,\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.801},\n    pages = {6504--6513},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR83\n@inproceedings{demirsahin-etal-2020-open,\n    title = {{Open-source Multi-speaker Corpora of the English Accents in the British Isles}},\n    author = {Demirsahin, Isin and Kjartansson, Oddur and Gutkin, Alexander and Rivera, Clara},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = {6532--6541},\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.804},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR80\n@inproceedings{oo-etal-2020-burmese,\n    title = {{Burmese Speech Corpus, Finite-State Text Normalization and Pronunciation Grammars with an Application\n    to Text-to-Speech}},\n    author = {Oo, Yin May and Wattanavekin, Theeraphol and Li, Chenfang and De Silva, Pasindu and Sarin,\n    Supheakmungkol and Pipatsrisawat, Knot and Jansche, Martin and Kjartansson, Oddur and Gutkin, Alexander},\n    booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference (LREC)},\n    month = may,\n    year = {2020},\n    pages = \"6328--6339\",\n    address = {Marseille, France},\n    publisher = {European Language Resources Association (ELRA)},\n    url = {https://www.aclweb.org/anthology/2020.lrec-1.777},\n    ISBN = {979-10-95546-34-4},\n}\n\nSLR86\n@inproceedings{gutkin-et-al-yoruba2020,\n    
title = {{Developing an Open-Source Corpus of Yoruba Speech}},\n    author = {Alexander Gutkin and I\u015f\u0131n Demir\u015fahin and Oddur Kjartansson and Clara Rivera and K\u00f3\u0323l\u00e1 T\u00fab\u00f2\u0323s\u00fan},\n    booktitle = {Proceedings of Interspeech 2020},\n    pages = {404--408},\n    month = {October},\n    year = {2020},\n    address = {Shanghai, China},\n    publisher = {International Speech and Communication Association (ISCA)},\n    doi = {10.21437/Interspeech.2020-1096},\n    url = {https://dx.doi.org/10.21437/Interspeech.2020-1096},\n}\n", "homepage": "https://openslr.org/", "license": "", "features": {"path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "_storage_dtype": "string", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "path", "transcription_column": "sentence"}], "builder_name": "open_slr", "config_name": "SLR83", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 7098985, "num_examples": 17877, "dataset_name": "open_slr"}}, "download_checksums": {"https://openslr.org/resources/83/irish_english_male.zip": {"num_bytes": 164531638, "checksum": "2e5dbae4cc27e0e24e21f21c8e7464d219feb703f5fee3e567de6561a05024ed"}, "https://openslr.org/resources/83/midlands_english_female.zip": {"num_bytes": 103085118, "checksum": "aa1083a319e52d658b85c162905ec27cdf2ac6d5645b4caeab05a385a2c8a37f"}, "https://openslr.org/resources/83/midlands_english_male.zip": {"num_bytes": 166833961, "checksum": "8192c7a0626eb742f9999e63162289f8f9a86c9cb49ef68298dc7f624acaebcf"}, "https://openslr.org/resources/83/northern_english_female.zip": {"num_bytes": 314983063, "checksum": 
"22b6229d08481e7605b028185dc55dccd0611db428854f2d485d9ff34395a65c"}, "https://openslr.org/resources/83/northern_english_male.zip": {"num_bytes": 817772034, "checksum": "b627d500d1b2e3c4921fb6d91338ead7b972f67c1c2f0babb300e0ef844c7248"}, "https://openslr.org/resources/83/scottish_english_female.zip": {"num_bytes": 351443880, "checksum": "2dbe5545a7ab87112c7730086586f738ec4f42171f7738628ba084ed4ba15ccb"}, "https://openslr.org/resources/83/scottish_english_male.zip": {"num_bytes": 620254118, "checksum": "c7d2d9cd581c48a8323f6cc3886d879e2e7aca5931d98228e07d07b350d9f9a9"}, "https://openslr.org/resources/83/southern_english_female.zip": {"num_bytes": 1636701939, "checksum": "e0a2e8e64b9efdbd7bae5cdf33ac8b81db495b499c9d40da0a7d7842e42b1e76"}, "https://openslr.org/resources/83/southern_english_male.zip": {"num_bytes": 1700955740, "checksum": "788b1c59fb5713b0e1efebc02b7aa1b55182b21955493b299b9941c70a878cad"}, "https://openslr.org/resources/83/welsh_english_female.zip": {"num_bytes": 595683538, "checksum": "3c2465b9618e33f42c7d2ee753b54ae593714e758e236efcdd56c14c5bd89f1d"}, "https://openslr.org/resources/83/welsh_english_male.zip": {"num_bytes": 757645790, "checksum": "eaf8de0f8872bb647d5c159bb33713cfd58966bd59d733f5f399793778ea5058"}}, "download_size": 7229890819, "post_processing_size": null, "dataset_size": 7098985, "size_in_bytes": 7236989804}}
\ No newline at end of file
diff --git a/datasets/vivos/dataset_infos.json b/datasets/vivos/dataset_infos.json
index 0b87dd10d94..df8ed1e44dd 100644
--- a/datasets/vivos/dataset_infos.json
+++ b/datasets/vivos/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "VIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for\nVietnamese Automatic Speech Recognition task.\nThe corpus was prepared by AILAB, a computer science lab of VNUHCM - University of Science, with Prof. Vu Hai Quan is the head of.\nWe publish this corpus in hope to attract more scientists to solve Vietnamese speech recognition problems.\n", "citation": "@InProceedings{vivos:2016,\nAddress = {Ho Chi Minh, Vietnam}\ntitle = {VIVOS: 15 hours of recording speech prepared for Vietnamese Automatic Speech Recognition},\nauthor={Prof. Vu Hai Quan},\nyear={2016}\n}\n", "homepage": "https://ailab.hcmus.edu.vn/vivos", "license": "cc-by-sa-4.0", "features": {"speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "vivos_dataset", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3186233, "num_examples": 11660, "dataset_name": "vivos_dataset"}, "test": {"name": "test", "num_bytes": 193258, "num_examples": 760, "dataset_name": "vivos_dataset"}}, "download_checksums": {"https://ailab.hcmus.edu.vn/assets/vivos.tar.gz": {"num_bytes": 1474408300, "checksum": "147477f7a7702cbafc2ee3808d1c142989d0dbc8d9fce8e07d5f329d5119e4ca"}}, "download_size": 1474408300, "post_processing_size": null, "dataset_size": 3379491, "size_in_bytes": 1477787791}}
\ No newline at end of file
+{"default": {"description": "VIVOS is a free Vietnamese speech corpus consisting of 15 hours of recording speech prepared for\nVietnamese Automatic Speech Recognition task.\nThe corpus was prepared by AILAB, a computer science lab of VNUHCM - University of Science, with Prof. Vu Hai Quan is the head of.\nWe publish this corpus in hope to attract more scientists to solve Vietnamese speech recognition problems.\n", "citation": "@InProceedings{vivos:2016,\nAddress = {Ho Chi Minh, Vietnam}\ntitle = {VIVOS: 15 hours of recording speech prepared for Vietnamese Automatic Speech Recognition},\nauthor={Prof. Vu Hai Quan},\nyear={2016}\n}\n", "homepage": "https://ailab.hcmus.edu.vn/vivos", "license": "cc-by-sa-4.0", "features": {"speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "sentence": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "vivos_dataset", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1722000675, "num_examples": 11660, "dataset_name": "vivos_dataset"}, "test": {"name": "test", "num_bytes": 86120132, "num_examples": 760, "dataset_name": "vivos_dataset"}}, "download_checksums": {"https://s3.amazonaws.com/datasets.huggingface.co/vivos/train/prompts.txt": {"num_bytes": 1075754, "checksum": "d6c6fcbe258d80d0f63e0f87d414b805f6ae11f41d40cdba5454152c3d6f14c0"}, "https://s3.amazonaws.com/datasets.huggingface.co/vivos/test/prompts.txt": {"num_bytes": 56446, "checksum": "ed27898d081eaa41b1e7e38451eb85f7ca06138896b471691510e7bab1187c2e"}, "https://ailab.hcmus.edu.vn/assets/vivos.tar.gz": {"num_bytes": 1474408300, "checksum": 
"147477f7a7702cbafc2ee3808d1c142989d0dbc8d9fce8e07d5f329d5119e4ca"}}, "download_size": 1475540500, "post_processing_size": null, "dataset_size": 1808120807, "size_in_bytes": 3283661307}}
\ No newline at end of file

From 63d0d47bc627abbd20f8c59f117f32682c23ca7d Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <lhoest.q@gmail.com>
Date: Fri, 19 Nov 2021 15:44:19 +0100
Subject: [PATCH 42/42] fix dummy data

---
 .../dummy/ab/6.1.0/dummy_data.zip             | Bin 14443 -> 11523 bytes
 .../dummy/ar/6.1.0/dummy_data.zip             | Bin 4298 -> 0 bytes
 .../dummy/as/6.1.0/dummy_data.zip             | Bin 4097 -> 0 bytes
 .../dummy/br/6.1.0/dummy_data.zip             | Bin 4057 -> 0 bytes
 .../dummy/ca/6.1.0/dummy_data.zip             | Bin 4163 -> 0 bytes
 .../dummy/cnh/6.1.0/dummy_data.zip            | Bin 3886 -> 0 bytes
 .../dummy/cs/6.1.0/dummy_data.zip             | Bin 3928 -> 0 bytes
 .../dummy/cv/6.1.0/dummy_data.zip             | Bin 4456 -> 0 bytes
 .../dummy/cy/6.1.0/dummy_data.zip             | Bin 4182 -> 0 bytes
 .../dummy/de/6.1.0/dummy_data.zip             | Bin 4117 -> 0 bytes
 .../dummy/dv/6.1.0/dummy_data.zip             | Bin 4109 -> 0 bytes
 .../dummy/el/6.1.0/dummy_data.zip             | Bin 4428 -> 0 bytes
 .../dummy/en/6.1.0/dummy_data.zip             | Bin 4014 -> 0 bytes
 .../dummy/eo/6.1.0/dummy_data.zip             | Bin 3952 -> 0 bytes
 .../dummy/es/6.1.0/dummy_data.zip             | Bin 4007 -> 0 bytes
 .../dummy/et/6.1.0/dummy_data.zip             | Bin 4524 -> 0 bytes
 .../dummy/eu/6.1.0/dummy_data.zip             | Bin 4174 -> 0 bytes
 .../dummy/fa/6.1.0/dummy_data.zip             | Bin 4186 -> 0 bytes
 .../dummy/fi/6.1.0/dummy_data.zip             | Bin 3864 -> 0 bytes
 .../dummy/fr/6.1.0/dummy_data.zip             | Bin 4085 -> 0 bytes
 .../dummy/fy-NL/6.1.0/dummy_data.zip          | Bin 3982 -> 0 bytes
 .../dummy/ga-IE/6.1.0/dummy_data.zip          | Bin 4096 -> 0 bytes
 .../dummy/hi/6.1.0/dummy_data.zip             | Bin 4303 -> 0 bytes
 .../dummy/hsb/6.1.0/dummy_data.zip            | Bin 4363 -> 0 bytes
 .../dummy/hu/6.1.0/dummy_data.zip             | Bin 4258 -> 0 bytes
 .../dummy/ia/6.1.0/dummy_data.zip             | Bin 3767 -> 0 bytes
 .../dummy/id/6.1.0/dummy_data.zip             | Bin 4029 -> 0 bytes
 .../dummy/it/6.1.0/dummy_data.zip             | Bin 3995 -> 0 bytes
 .../dummy/ja/6.1.0/dummy_data.zip             | Bin 4274 -> 0 bytes
 .../dummy/ka/6.1.0/dummy_data.zip             | Bin 4434 -> 0 bytes
 .../dummy/kab/6.1.0/dummy_data.zip            | Bin 3902 -> 0 bytes
 .../dummy/ky/6.1.0/dummy_data.zip             | Bin 4390 -> 0 bytes
 .../dummy/lg/6.1.0/dummy_data.zip             | Bin 4074 -> 0 bytes
 .../dummy/lt/6.1.0/dummy_data.zip             | Bin 3824 -> 0 bytes
 .../dummy/lv/6.1.0/dummy_data.zip             | Bin 3862 -> 0 bytes
 .../dummy/mn/6.1.0/dummy_data.zip             | Bin 4768 -> 0 bytes
 .../dummy/mt/6.1.0/dummy_data.zip             | Bin 4094 -> 0 bytes
 .../dummy/nl/6.1.0/dummy_data.zip             | Bin 4075 -> 0 bytes
 .../dummy/or/6.1.0/dummy_data.zip             | Bin 4316 -> 0 bytes
 .../dummy/pa-IN/6.1.0/dummy_data.zip          | Bin 4416 -> 0 bytes
 .../dummy/pl/6.1.0/dummy_data.zip             | Bin 4103 -> 0 bytes
 .../dummy/pt/6.1.0/dummy_data.zip             | Bin 4130 -> 0 bytes
 .../dummy/rm-sursilv/6.1.0/dummy_data.zip     | Bin 4041 -> 0 bytes
 .../dummy/rm-vallader/6.1.0/dummy_data.zip    | Bin 4384 -> 0 bytes
 .../dummy/ro/6.1.0/dummy_data.zip             | Bin 3976 -> 0 bytes
 .../dummy/ru/6.1.0/dummy_data.zip             | Bin 4811 -> 0 bytes
 .../dummy/rw/6.1.0/dummy_data.zip             | Bin 4150 -> 0 bytes
 .../dummy/sah/6.1.0/dummy_data.zip            | Bin 4569 -> 0 bytes
 .../dummy/sl/6.1.0/dummy_data.zip             | Bin 3947 -> 0 bytes
 .../dummy/sv-SE/6.1.0/dummy_data.zip          | Bin 3993 -> 0 bytes
 .../dummy/ta/6.1.0/dummy_data.zip             | Bin 4385 -> 0 bytes
 .../dummy/th/6.1.0/dummy_data.zip             | Bin 4467 -> 0 bytes
 .../dummy/tr/6.1.0/dummy_data.zip             | Bin 4222 -> 0 bytes
 .../dummy/tt/6.1.0/dummy_data.zip             | Bin 4270 -> 0 bytes
 .../dummy/uk/6.1.0/dummy_data.zip             | Bin 4550 -> 0 bytes
 .../dummy/vi/6.1.0/dummy_data.zip             | Bin 4094 -> 0 bytes
 .../dummy/vot/6.1.0/dummy_data.zip            | Bin 3339 -> 0 bytes
 .../dummy/zh-CN/6.1.0/dummy_data.zip          | Bin 4204 -> 0 bytes
 .../dummy/zh-HK/6.1.0/dummy_data.zip          | Bin 4210 -> 0 bytes
 .../dummy/zh-TW/6.1.0/dummy_data.zip          | Bin 4052 -> 0 bytes
 .../openslr/dummy/SLR32/0.0.0/dummy_data.zip  | Bin 4125 -> 12652 bytes
 datasets/vivos/dummy/1.1.0/dummy_data.zip     | Bin 1884 -> 14710 bytes
 62 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 datasets/common_voice/dummy/ar/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/as/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/br/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ca/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/cnh/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/cs/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/cv/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/cy/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/de/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/dv/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/el/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/en/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/eo/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/es/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/et/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/eu/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/fa/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/fi/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/fr/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/fy-NL/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ga-IE/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/hi/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/hsb/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/hu/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ia/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/id/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/it/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ja/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ka/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/kab/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ky/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/lg/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/lt/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/lv/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/mn/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/mt/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/nl/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/or/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/pa-IN/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/pl/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/pt/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/rm-sursilv/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/rm-vallader/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ro/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ru/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/rw/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/sah/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/sl/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/sv-SE/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/ta/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/th/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/tr/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/tt/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/uk/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/vi/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/vot/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/zh-CN/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/zh-HK/6.1.0/dummy_data.zip
 delete mode 100644 datasets/common_voice/dummy/zh-TW/6.1.0/dummy_data.zip

diff --git a/datasets/common_voice/dummy/ab/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ab/6.1.0/dummy_data.zip
index 26e2c823b65c8ee650fd1b86a5c67363b0199f94..282b285dd46de9cecfd66000b7dba230dd1f591d 100644
GIT binary patch
literal 11523
zcmcgy3pAA5`yQ8EB9|JBLm0QH41+KchSVUEOG#mbBDqV5Tp}tV3MENRh&YtXNhyt7
zP9nKf2ss^5a;scQ^8dEZmoZKKhU&l9+P-fsYwh>h&-?8C?ftxCV#vgdM1b}v(A#Y0
z<-b212r-1iepgq2TL(KIJ9!K9B?#8n#YuKUB}sPRx!;ci!N5GqfI!R)U@|ih8-i`l
zK+mAT_@IGoZH@4{o6RicF}AZ~&KYt98kTBEdm^9x9^x+Iu+gTi!`xe=OhOprP)sX$
zgcwFvim{9gDex>3*3}8SuqiG^R{Kq8MmR6JKJ8puPLicbTtaqIO7b}-q{D7YOY(@r
z`Gnfhj=Otq_jH;sm8!S#K_E;FS=k&_`5#+D2m#7j{OF)rBA8JK1adABnDu71W<Ks-
zjx+ENL9j!h4xUu-2a+}$RA?h3gXK3w{q?tE?G5cppKjA#M_mNULL?ciNt^PU_hZD_
zL@Mq78sl_JbvR^T(0@^{?c&hkzc)(1>vTXRYatx2Wtobx@Z42byK5z$$jA8cDc>t^
zON|S68NH()_nLA=yA3Wq<>#Zc6;s!>s6K9OtY>G^$%8`U4-3yG?YDE4YiE9P;8kJa
z^{VmU54VkC&eXlTSR8TuM0{pRN^H`ToPoFIcj7Z|h{n9~A8*Q%Id+ZrAHLMVaaE)q
zPP=sUZi8S)0LL9xAGN5X&boVi`sMZsG@zG1!n*cFyOrH=VwV>spN~*Qf`1Av*}moT
zl8rW=CDXRkfz#WNJ=_v@nsqBCZu4Ai7$~Yb-To(HwJ5Wtz%|zrz7>Z9z;L;&40^2s
zX&@6#N!JBrfRrG<X=lO@%8@9gl4f-TiHdeo=B23uA)dn%eA*8XX5C`tAxvsV!kO`B
z_!O-BzG#Ibs$6XigaP3Af)IK^3{Wi^$+=pw_m#7E_wv~9EvJIP$|))+D#&3K<*?W<
zt4S2nj2P6xlS(_Y9PFS{qa2+P<}v~nEs!aAQiU2wE-<c8;5jm+6Sum+?@iy)GIwJ3
zFmn)ths52u`zKAt2^tI!%P$Q}1|8wrM0g(%>_jSLGF$oA=4-bl_o(+ujYUNH6<q0E
ze?Ch1%65$L^M(z`;UdmNoN4_MQO4NJ%u)@>1Ny2h@;SScgZG|)S!-ca(l+GVu4UVs
zZD3?Pd9dW*RKR55^k3>#udf%3`K#Hg-?ds&p}?t2Jbm+w3iO8I`Jl<HMovA;EC1X}
zp{jx*#v^n0`&G}E`sPP633nTxh~>XDLA-TC-sUOvzdKgf>^=$QncIv_Pre(yV}}Yx
zVR9UcE~JGK!`~Fyb%}gr2=6#o6&a!v%wOM!;`RtvFIvvbX}|I|0}x_228mkyjSwl5
z00f}+9U*=<-_!~Pc>wDC{R&m0&<{}T3Oe;;x9f{YFF`0tPzO&c<;;pyg(3)8!V2^U
znXA(*C<@Ricv3|hNX8p5VUb6=gfe+N5-!*7OjjjDjxd*nJmAcVix2(h5Ld$vKjet!
z;IlU6V3&ULdl7}r(}#zb=-b4%O8WEi=AJLT62-?W(Rx-hhG+tbRC1Afm8Ea3iZb@d
z3yD;8f|0XS&87E|{ara+<4yiT0#o?sdHZ?$OW6yyKRNrP#A5XGC&kIOOOxpvrxnhI
z^60RASf*UNGtE~pJWfe+fSYS<I4;)-lBj$YdFH)uTe;LeB8U3Z3SAWyIlZ^($^I#~
zvLcqh2_CHtha`G!-`Z*SI}fMX(E`zlqqp)Ll6ayCdvbRq-E<9kuADR?b)hCULiKIy
zAd#E9^%fhC*GkQquU7N&q)m&yktk&ffJD{jD^V!n)G7s8`2SJqb_xLjmD0P*L7?sT
zIeU2jcRYdc_Cx*kx9XYIYIF83QxF0)Va9oB)yK=u*$w04?K|`7KY3o0*bPN`{%y3n
zhB`>AJgmdsfyXrQ`qC+XW^RxFn}qQ>a6EUkEnC&L{J0-aEZOFmUe?1nIn6lP(Kk0D
zIp0MJ*0_6gnv`Ux)bJnLUzaFl5_!)}V6@csQGDg+yh)d-=%>-81l0!;iTWx-j~-sx
zSJ$@mPqCvlCsstCZqXFqjErX7BOZpmZ)vhvC%%5K+l?zEL%am9<#5(6<x>O{ftZSx
zV%&O!mEAP`u?q1m+A>|Ag`tfJO^6p$W04Y<az>50wx`{07i-aMCgW__w;8oZIDC-f
zej&}Bpz-NeZ2O3F{J4Kj+J{x5#_5IC)>nGdN9>*nYrQ?t89Q#q5-reEDwg6EH5R^G
zGygbW%JqrM#NvccI~CgBiX5Mq2zWgB4n6d_dWsQjv|4K}l#4<t1}UJOoO~!j3sW)e
zJk4vb^;`~}282*ij4^05-b9*E%_zpNZAOmbNiwK^*kZU^rbfS|R(~}{R?gT+Uw^f}
z1s5BOY+!y+fOu3)r+>}j^-MEsRb~@wCpck(xy;V8D1BCfkD&Qu=-)0K9DTnEaCeL*
z@hT)f(5W`dx-C20$O=54d^ew5X6}1VZOmD`z12igjLm7XBk^_?^Oigl3-c^)bTfCr
z(n*yL6j|^O*^$8qgAR{fSc6b?Zx7+C0(|iq$>lYSH-vk)wQy#Tvhp@on~aCNCz_Xs
zJGm6ZbFR!`4Y4jmXYBEm-o(i+5T=ukG&+IZBw3(f;9M`&ba_ppcdpo0^V4M0nq5^P
zLXwT9W@oVMRy8^v&W1HvaZ)m?&;@ag;fL#zG2to$)dp4>r!w`VjrN-QxaM@{?;RME
z`!^uIn*9XM>`kEB%X?ZME(Z@AmiTs|)@>Z*OAzh)>mlOyq{Y9F55;I3z8QH|9DZkH
zf4JR%rq0@NdAH>sDz9Wz9plTsR2+P-uTzybyTvWCnX#!jN8;1K7G`y;Gvy*zKYf&T
zx^O)xzir~CF;{R03*o9U5!Kfqs<6)^CC~2g1O{O!neu#3d#fbYj#+dadfL}n-}3CF
z&a`*nNL%T^t?r`Y>w{z?k*f3#wO}A>>DHTL-jJw4Qfa3f2m!aCt(jKlf4`)6UI&s+
z^gorwc5&u_<k9BbvGW^6Q*Nrx(B$RmSG13#x6fBegFUqgk~8R3l6H)*-Ma(Jqy6Om
zwrAXg!y=DeS~g>q8f&l<dR_@`9V|&<H~Um_@)ma>Inm9^MtAUDZ}F)q4!l-{s5%d2
zC$5CuW>)WCIP<US+gyoTQAm))Y<*GdxVcwczDv#~PXUch*UM01kj2XGu4PvGXIla;
zbEByl7w@`_+R-RoLNYt!eJ2A2E<SOz*i+&@FLQCH?d3>O9hG=<^GZREM^Xfv-Ao(w
zT*I0w34^<$3?&kBi?WtU>AXVjwLg7;S*+1Z)0V3%Rr(L67>j5Lfz!A%e@187zFwZz
z(VgKU#8om{JlLgiNd1|{&Jx#yw&T_v+(iM^uQEDI20W{u#TGBWQ0gKon%AG_(*H5j
zx}_`7(YIP|+_kBAw0B2(#L}r(@2{p^%hP#>k3k?e>a8Hk$LOhX>$xOotnb&w#vR^p
ze9`{;`Xi{zc!TtN!lxRyOKrb6PP8tFFT5Tuk?SpUDi?#i#QitHVNYAnqnbpAz%g{d
zN|ZT~3IB4s&MEiO2}O;R51pD?b?<XrZ{oL<^*TR^{B!loy$X$*!UqPLR`uyue-!+e
z%;7_3lyt9<lCQYK@_A;L-`mm=S^zBwpo!T9L8%t7Al5EuLC_S=?_UwOZh;l?+3KJk
z>c36?jjeF<x0^iWBm<%8&*Mz&?&IX>HD^s!LhAu^sx?s;X9VtU5l>zs*CDb$CA8H$
zi%&<l6O2rY7>e1D>{oUojxJxtPdK^qLKuG@F^QvFvYgQqvGu&_dDXMyXZ*PwQ+xR3
z%-8AW$XrkkG3IY|wy0R#`uggpl`Y5yqydK^Df(!Fla1h-$iB1uTSR0rsV4DTvO_rX
zu9B8<DPLa7d^&v93%0w<BF-e-$L+pCDz^?>rm%@f#vk?!ThWYYy0^4ntlQMgUgo-i
zG}$g!?y@D}HR|++<a<t6<ydP4H)O0lmaVR;8Yi22QrSp9{+ee2v8F5HT}6e=xye-b
zshqCx{`?rltGdhb0)@2NhF&^qjrNsz=B?ZJ&&K|e?p@Jr<gD{OwQFOwk6|=S|Cnn3
zHv#>@RU^1JZ8BPyY^kDKFhC5i8oH$R>_BYrQA7aY@C&)@@fTaVK9p5z1wC88t$)0t
zip#p`#gwOSLyc{N=FARfb~Pt&3#3_)wC`(nVJ8#Xnf?8u)hGp`9c<si;KlIeWYTWc
z@A+fVUWDeaLjOMQ?B;8?&l&Ox9UbOKw+NCg=-*0ruP_#p?tNLG5aRo$#O6%F_hx^n
zZraT4yP}mV997|W(lLx3UAvlb`R;Rq|ES&2xzek1a_y0j5r$}$A&at==4rOd`ET|Z
ziLGi+J9&H`@m1(8t2&zl>vqO9lx^1}sIw|YI)-@fDtoAZt?!n#_W83Vt+(^<uXjCx
zB!@NLXOS@Gi?TvBkUboO{ymp!C4H#?dtWRX!MAE_Xwq5FMO=Z+*-J%8FG3^PPAeMw
z$vfm{G@ir>gcIAMWFkh|wsm3!kjzqoBi8M{JvUE8p4qQt^TcYgMOxu1{o!?;J*v^|
zNy=rFOqD@x&$(ESr8V}9*z?{j6x4bVaz{pF_<->gE`0)>JHpRh9aO4$NY>(*Zosw&
zer8qB$)icS*UR6!SlA<0E7OjARBURycc-c#aR-sNzvjB6#k=f05yIusi4z$)UL8J-
zl1iAhhbku@^!Z-&8J`|wIAEq`@866XYFg{s*yCqr<tQ{Ld#GBp)h&~(RUf|Y#A|tQ
zQi)xp7IqL4IY{#NMUG$4#{45C{C=rFn~{3`=8rX;UrsEEpHg0bdtwowx^e2$uk<fB
zPJ4G(S9dpCUw3DFM_W5DTSa9BMI|gw8RP1qG;=?6heQu|D)E0=g3y-_0ntCxW^;i2
zO~9~UfjL9K;64vv)OYlsWNeo$7Kc+%#^O}J0t5GUxX%L^{?{@#158y(Q3dxE*i8r+
z+~)yo>u&%C7@O5t4+ISE^8jY@tAQyhD&p4Q6x6;l_7nmJ_jv##{Ays3#&B3=oYGfd
ze?!3FJ`Z4KzX6!C3JyC9tP=tT_jv$Y{~Le-#%48k7Xk+Nc>pu~O~BTCP31!f7~JUq
z#=wYV`hKbNzkL`nfCq0B#icn_N}7ewF9XA+Ps*YqD1E|hAXFcET~Hy^#BeqQs7_K=
z%RqG!Zlq9lhB?^hn(IHPRZ>TwtagE_8{9~t3D6ho>m(>>qV|sAU^G7Xa4;|^tpXYx
zYzn`IY0h-MTV9#T(GSaMvjl=#4rP@GSj})Fg(lG%aW1n%kP^RBUV|qK<N-ocG7HK`
za3h5ZFQU!tcV|saIY0sE{vVXnstCa-2JlZH6clOwZg9HlL$j=d@&sk!0^Ek-Mhewn
zp*!eY;(t&tq9h(frz9R+-rz<Gy~8?B;^$W?fq{U4)MqG4@eSaE*yyH#-m1rJ27n(y
zX}ZtNKp*{~X}jw*svz*hf%HLiN;7<J1|k+OBm=a8{6PHx2=Lx1ZCc>H!;KW`!a;Wi
zX!)@;oOntSKx|6?6d1v9BZXQmSqKTVVEhjXKrSeaIbhbpjTE|$i*5>NI&d_sTS^W<
za7x1rcz<w%_~%&&2Q+ceK?0@IGj}<d@X}2JZA*uSSx3zRr2{i}IfyP>2n)1<&_M&G
z7XS<&xRF8^LFXkpKL$U#^wY3^sL7zbqyyIhZlq9V{)Le7BRKzq64)7lujb&M3pY~e
z5Cn%#N`BrapkxX51!%*H68JT+=yC}tVZmQC+OW{jk3oKdt@(Kbl(1k=fHthsUjU14
zhkz0m><G|?WhVG**qXh&!QKFPQeBZKVZnX?ZCG!97S?>+0ZLe~8$cUY#2-J;7Tw+e
zB`nwrpbbkz=;vY4?F>-Df}H@`uu6UbEV_LGN?5QDKpU2c@XxaaVA16Ym>4dCE*ju>
Qq@oCB=%n!q3XVYhANM2+D*ylh

literal 14443
zcmeHOc|6qX_a9r58oQ7%vQ4r~WJyK{gUFigV#r`@*-27aQkHC`MG_%<iMVA=wv>{s
zYp-NWNM#H8ekNtsYVJS3dw=(Z*T;)LoO3?U`#k5I=ly<0Pn(K*6A1G)l<RKr?U(OA
z=t0~dJ0DlqW7c*y-Zm1(Msy&m>pS$3n8(MT9z;PsK?MR)PXa&m){KbXv^6mA+kcmn
z69mFM4ERRG1JLf@49TUyk(lR~hJF_f#(?mek&l>YK7Egd75Lwcpj%gL4I0NT9fVSV
zK)jS75G#H(-kvsSH*s$--)|vlo`)l~foC<OHEYc=mReLW#)gzZ=qxo#mN)6GwD#^R
z?J9ny?K#hF$2E8(W;X?2<VWxZj_Sns$XP`AQ&UlWN?BqarfsA1jkIU}>#3M{7aPrc
zP}fY}{jsh7bt)%&k}YJ_W$MLWU6Tj#cNNphss-$EgIfJ%>MV&u*|j&J^D|iWJ5`QN
zmA~y=FsHe|qaPx3Mg#R{uGTBYFkgL>ddf>2QEc#2ht;9&y3<XcHd>8K2AZM4@h_P+
zKQKUWh8aUVT`$_Ul)5VADNwldmcPEs+Pg`?$sstSwEVGlj&VQxv1Hk?%cWHzpX01`
zYgL1qllUh_?d6LX;@;3H>G|7VyBKvosc7Y$Xl{{&d_H=n`1#!1yE|J=x-;g=wG>))
z1{R<b^jfr(ldXk_6e`It&hu2*BoG3U02ezj{<7QHt|k)(I_R(9>qxUTP}*~&niqhN
z8-Q;sksW*l<F1oNu%#J;AngaQzzDdqrcl!DTW0Z()5X30BJ*dub7ZZnBmEg;YOEsE
zvYy9^!DFE_@9&3i{uIvHc+|aLuQVsAkv+(#IZ;qAyw+{&%meEe@pWGd7M+)){)&2l
zkbSx%QB!8>#q(^J=5FTOeCHaYg`+NZ!1)icMo~KO2g907^){)*KXY=spPjC)yo1T~
zLzb=d1q25InG6x6JbZ?h&LFi@2Kf<UnyR^hqMHhWi06~rAjmI>=9qQuNvZ7N>wve{
z$Xn5w==6lyEr>A;3Na+?TPly~nMTLY9cxTk5arcPEpD*P9!Z_H>E%}X=+_@JXSgA1
z>(B$fB+s+6q4w}Qk!(pt^BKsJgrz?udp`0+&d&#QPJV(+eQj8##NvioZ(Q8CiPqcK
z#sv*j1N2esw!thq+pkc(2WuCiG#0L2#oiflwy?NGjJyU-rkfh3ZmS>M8~Mv`&K
z_fQ;*k?%FRSu}r7m!1IMM#sgz#L6sc{Q^B>qbvqUJ3|2TqD&u04JQ|Ldh+R{U8ghc
z36AEELCW<3zRH=Gs~airbB~yGY|c#2D$r=qn>+OxX;dBR=v)}Td1o%|Da#5-rh|vj
z!Oe7AgH=*lb)sPh1qvm#(9Z<hGWH~T<?|gjx>#e-Xj^}3n?S39;T0I2d83LuTDvhT
zR!~F~QW)DB8q$1OJXB_)LCZYz!Ywr+9VY{C*WA~4oF-<)-Uh@s&_&A|z7Lceu2ovd
zIQc=l)OUbGQDc%Vfp_4~=b*|(<F}nb=l5&BpYAOQt(x`;wV8mc?46Tv+rChjomn5j
zmUFY@WbJ6bEK^R0TX;KVTS@MYr3rm%1@kM_Jb6oBgdA@aow(CIU#1H_*|!0ar;Fqm
zZQ+%4aZf6+37HoMX$vI1a_H$2fZ0&<DneF#(VC`L7gbig0;jtlOq9RAS5h=tqr+35
z+9!7slNYOH6mi|wz^S9}bRmEs8bELx5rW_I0%f;`<6I&f$w-_QsQ4#N7BI&BFPlty
z84w{noz5MHU{bA(uz5Al1h>8omx9YiWpWEP^|5Y-v`gz$DSLiYPEN98GB4(6c9(Vp
zC)gXBe;!1$i;f<ulNT;V2#6mZe1bYO!Y?r(W>p{wfu*WtN>M;zQ2Qs;Qg2KW<Zo#h
z7=oD;HTFkws3I=YQ8qbhNrKt<A$)%!T|ABW9nGp)c~xZMjg0Cz>0byUtn8`O)m($y
z>JXE*XSH`E<loC;5mXstbvkm<kD9O56K)M2NEYIvI&U1cW9vovE4QPvtlw=<>3f~&
zybWABQ!+WQFG!(x-yfx}$F1iq`xx#8G>m2Tmri&z^v0BIzwyACm$zWNz<K=3Ez6F9
zK$LHT*qm!y$;^mFY8dnK*yp^Iy9Fwrl+S}$HPnQW66e+A7}T5-_8k~kg~f)bN7DE_
zdv=B+Ls=`emit1hnV?zP9MZBdzPKoKN4}THg?w?=n+%T;b`IS`FB%i=0%susJ2{My
zRLa9E&5rpuqj&F1TIh!>HGj@^EmPL77(u@bzrAbNNwO8r?KjaTI;z?5h4XC<y>|_z
zz|lv75|64jeEn7(R<i(89dNUNohS>uQC{9_34o~!H0ufQA4>qkyxanPpbP|(Y{y5v
z9Z{Zm82}>AfaX{o-A9s)VK0YS6+t;m3Ej`o{3}sC2pxla6eSy3>9YR-o!c(Tj)>cN
zBbdDad5QkDKsBWY=<s#f>#|qpt{elSl84yEj1*OKMQ%u+(q->L8$a6A^)7E|X9sHw
zs}{X>depfDM=Q=f;iFgC^?9J;$$IhnIj86g^3qwr(izOu7ehq{H$G$uyOPi(Z=aoB
zZ5hlWc@SBn9O4ly+zx81Wgh2yZD4prq)1Ds#wK4Z!xZt3<D&ZITE{#w+9#aqnL8tL
z6l7&%p~-R5I-2o!JqnSH17V*YJrcRLn0$0OcOZ29&iUPWsw@S8+myPehEYm0qop1N
ziY{+7#!Fw@Ms2Lgx<2$|Z_NG(@qGqd%RO%sAPcVhPL8B3Myb}A%BU7jAVce?Zp!ug
z#hg3`3P6Mmisj4=>JKbb)G3|lJzz3E_oyCh**3WB;oH(^-2$h;7HZ5EjiHeXJ`Nz1
z1|YPZXp5rVh)XqQr_KWp;ouvq<HQXEaB6~jpF)0aOKiWz`Pq<5_TV7~U*RrrD90oJ
zI8-nl<jF3|?e^C=-^krp$sSRO+k58JG)0umlySvQqZDiDJMSHI_(XeB;v!v;W2ei_
zo2~p5|A=j=FoPo$Xm^LBPI=i@JlDKCT5h@j`qk2|$~#R5T%%cQf?Jz5?9gR9YtGS9
z<BmG<_FA&J(9J?v6JHdFP4w{TOIJN;z=7>K%slCXr^7d1+^y>`VRt99HBNqOD6;#k
zNZ53@Nk42WE43i!v}KR)P+4^N6(1?9m*$&{Q;J13KPdJO$wu{DlCG$usyosB3QQZ3
z(mFJKgsH5UQ)%#2l?cxVKiy^d)Okq$G&@7Xi3ji?sBwg9fXP#T!+M~Vwh4`R9<war
zc8^iIWm9vGZ`sVNUzQVTfn*wQED|vOlvBWi$e5Xr&dl}f^KKQ865ktCxA=6_H_dx)
zWtPIvQ0~aFc8;mGy{@f8{)XnLZIjTT2Hq~WTQy40LKUOmNnjCu3k**^MT*c90HQ1a
zq8#{$5`$R|%ukqy08CVzj@t#;<})wHpp`5}9$&q##tq7~9lENdqlDaY&|*~3^l4gk
zCZ9lhzrg)1*CQ;8G`Sb=^*3ZTKP^M;G4ZJ$Qt#8MR<NC!yc1AmJ%6}kIlOO0xWFv`
zn8`v`EMjMe{h>c?^z%!Zt<Wl+o${Vx!O>TwMi0WbS&NjuZ}Hm6_`<9O9`^3wCS?b3
zrN0}*KW)@hVYf@ZF5Rx$#7!(5-r~U&CdJt$G6!a%P9fENX_RKP;E36tiLbVPLRPOm
zXu9^FHMc&>aCqjE+|0<d9Q@E=n_P5R_DMta)A1o1J{iu<u-Y1~vo|B(1~ag3xffcQ
z+0-G^cnap>%o7*E$mM)Zo#n_D;~Q~lTSK;3$lQ~2*nZ1VEW&L60Zt9K$A}Rp@7LTQ
zt!&M`p)Lb|%62ZktvKc*afWv><*OGL!(gS7CEuh#5Et)aOm-FGOK$p?+tZSdN3v5E
zr@s2dj~_od_S|6Kq_EA3=R>%4`pcSv4o@lGOJ<2>6}vwAfLFd9k5_wWHvyb1_;BJi
z&|CmT%(IT<zY)n70Fw5QNcL1<W10aQlYm;l?}S$Y7%>k4l&-NBZV50(^^`v2CDE+F
zsyrdCm<Duu?}ZKi;_2D@T_|TDv3eKzAtIuMer*eM%o2?fU+blIq=bCp6e~?IV{-n>
z>YOvU_yF_m&V-k_VwRD21B#Z8he6Ll#pqqB`8;o3zRfsXYuHB95b4BEVdTJFyNfNZ
zVDuraVZvRgu{+(hD5RD+O*$PH8#@YBSef>0H=>cz?vA>}8_O4R6OY-qCzSAIXsSD#
zMbiBtq3=+erDqGdl*)cOx6XD?klz{U)6EA2S@Pq)coxc}ZC~h}v#jF$V!7Bkb;kEq
z@zJVo!<ClG&C4ZI;lA(6d>w_KO;nz69gq)vz-k~;SaH?0M2fwTBbCObHq2xr9If?m
z2qB{$ondRA_G+J9bKQXhT_Z+bn+hEj!Lfdbhlf9Ou}XJp)f*dy%sw{g>@*kD-23KQ
z1ocN5F0+apcy_wr4H+#3+7-w!3UXCyTB=QPw#S#_i^7?{(!rLX*=<604|peJk0?dI
zj9KYlwAc`|?^24{^F^h^KYf%UIJ#98Jfq9tbKAFZMMqol`VZHhzrg4qg;R%X0Jt4&
zs~9-}xH$p1x$rkGskV9?037oWfZp7qjTwJ7yqa;3xOOAL+9Tj<<LtNbCk%94gytd4
z^g=_9ipo2k_dSwOm#(71W#{_DD!Qb^FWG3O5;yZlg0%0b9(I)6=Mj29&F%JO;HCvr
zUX}*e!?5b)JDkBij05IA+!8rweO${u1bo<<MADgZz-Nt77ku|jl@xc+GotS8Qk`JB
zly2Gnj1g*)7B6Hj8eQ7WqUELmt+>NoA$<3`tzh1PjPfm3qOb33sl?IiLPAr=&n%R=
z-l&g`HciYN_S@Qb_#yv5M$FQM^JB3ok%1{-wSdQE-deZ4_6~&yi<W)9J~zgq6)TzF
z<Kbk+lbw|ON`1Z*;iEp>)8&5VnojvMn-x_B$*;Mu<heiXIP}Jq;*G?xte!RnB@fjq
z7m1+}ahV8upz*DKF@h$PF~8w4lQbYIESb2*+^rh?856i_5P%buep(IRAP%bpSjO;z
z1Q_CmN=$a)hFT6dzzOvm^##}@$y#DwAA8ioC%$l_r3Ug3-zI**n0f9O0TZ)4KLW-q
zg*QN;;u|OAe1PzO>R~l}V#WtI{3ap~Yy1yj8pHpP41U!#F%^MpI*R5e!T%s800$Ui
z;uj8xPr0}b;%NyTU|aU@*9Bu6{{V$+JAsbSHck^E&p7_<k86AnJ)v=;lJ$ER#KS1A
z3)(GsE(k_bz&eSs6nBdf&qlb`JHdF?@s^#$l!R-TxK-mCzQKrR7=O)4G;LhV#9aZ`
zatt$`Wl{@HqCe<q(`>*LI56-{3*_?>3+68@NlPfx2;g_3Bfersj(WQLc!|k~!^Cz=
z?v@mT?G}T<$kcC^2nw$-nbv=B`sFv^KNE;Zmo`cA9pF?sK#IcDAgo-oCPg)`ll{LV
z`$-pRaq@v!tJeSwn0j3cfWBH%u`sOj_5U8eUYdMp392>#Fs5HNtu^H}fY*uq{|u2Y
zL4JB*+wS)Yyq+2VHHimzK>Zud8E0<#d1|av|Nm#yzYO`CnrJZnC4lRiz(1!6tdb*y
z2lzeYLrr*=|Bf{FJipf8u@VH~X<n2}cZ5?S_GG{2Wc^mhKX|Ky@bD@{Ca$E)1wPk_
z#dn?S{2ABz<8BQhIUz<S3i$mNUtm~U|8;@k|Bb*v$bRi2GY{}4a(u~fEe+Pi-+zVp
zOL!ZZ+^vq!d*NSGUc-3(n)2Vcro7645z^T(GPfKtM}mLpVhxk^efN*<JFJ{Y$eI6M
zP9*su2dqKjk2$Oo{}`&CNKRb+&;uZVU^U9M6v0&z38NIi5j~fcZvmgCz!Bzt!Se&2
zeiJ@1%l{*MjJ!(2MV_Z0VkU<3C!l|EMH0+$1y{_J{Qg?2(yKlB&T#4=o>oP$m_MVp
zaiy85ZDhaV+A#+3N#Y2D`=*5Aq%_`}Hf<-{Cqab)JY!W$l5b0>T1wz~9^)n3Gj^Q#
z{Sp((nfSm`@ssToXO<<;E3VEdgZHYHJIMA*bdLFz@1LE#f9xB~w$BzM+c)7k{8z3?
z=d&o@YjX&Z=lX9XGx6~dcLRM8CeI<MMaGJrBy*YgM1<?u7fPOE{G7t-xFlT?58t>h
zgG9-5iMMyKnlwp|#Iqo-M;kFRJz_t=24oQSq6$fu#N!UG%V}})ToRWitgMgYc2&b>
Q0x^LCfh!YTl9<N$KlEt0t^fc4

diff --git a/datasets/common_voice/dummy/ar/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ar/6.1.0/dummy_data.zip
deleted file mode 100644
index f49335dba973d2469c6d9afa3e0631ffe8b02be7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4298
zcmeH~dpy&98^?!9&T|Y26S9Pj3@IYbnnTXV$7vacEvJp#qMYg`<rIaSLe3FUIlJZ1
zE2W%b2_;cYx5{zi*{WAa?sot8{PAqB{eHjwbG?7x@8|kl-|I5gX9BVTXrI@`-lkt4
zzF*h@q5u~H4tL7Y#o6Ck(ZZY+z?8Au#EJGJ1hNAdfMg~B060y(7;hOwYGaTSMEMgZ
zeZ2{O@^A&1yb@FiDi2eVhr#HKo^u_&;T8AL8Qt0U8-P1YbF0_qr>=TCpf3k)+A)=4
z0AT<C#2EnqDE$C`UuP^{!QYSgb)(`lG@a&?HGkfyu1oePCK#wa^A%(ch*1?!8{e<5
zbhTF}pr*IzG5VdJc=Q}w*kwtic<`iQ0!hs-IuOXj^eJ<hcY>vdl^Em7`|P2-!k_^2
zd%)mq$=#_Pf%kPnN!RRD5%4yJ(R6iy<X{zx>d_z-JjCIdwI>vVap`@4EzRRM>DM{+
zp>Di?(Uv($%p@EhrH8pwj2Z<+5>2ey7*m8U2xy+ML_m6tW*&Un?(hy8Y=h-W7~tW!
zXNnY#v;h0!E~C3^a9SlA3|`Oc#%}RHXVdU-3(Kpi>(no@cp-S|n(EY*n#S<Y@s38V
zM?$*NB*~Mm>Q#&JFIlyW16|WEU$~f7x%yVFxKdHQ6gyk>cz*oW-ltYWdGmEBji-hq
zix4t9iiL6dX$3Nq3A*IDz(mWaHA!R!Yv=r^l02bGqCmeSE*Ro(1u?P1w?IM5#N+XN
z=EQWhIjkgU&{#@hyE|n#xuF1PQf6#nUce3R<qqPdz=wreghe1T(+{Uz`aM}r3w3=A
z(QXSOYUf{R>R`Mp`o^k{BR{vGOt0N|{>*15^QH)Q&x!<&y~QkN?CZh#Zay*w9IQLS
zbh7ykV_^nT6;KrRk#tX<O0r+6gr)iA7Sj%N+nJqGPfg9P!dPuPbiA?p9R+dHvU1>x
zxTg`}T~`z$;N*6cZGO^?qcVmbrvA9%v2qXcocwrDLOW}$y4m|+wTV`(#k|nJ^=pVD
zLWlIG1ro(a?mPxGP%Os#LoOcHe?RlQI-+rg5aCSL)HyJ(h~Kq%zc9ZoTA=89b!hA4
z3ss(?KK!{}#-8fp-OFSXpoZ<$Cb5#`B^mdu%G2dT3$;dEp~KsdB}Ps{lilJ_FYmN6
z=kNsufWB1Ps2gce3g!$HKLlPSVvkv;Qgl}Rf@g;Ak?Y28S65EA7>c!J537Z)NsG)t
zo02LIK~7Vp<)ccwpMGhH&R$q*S;Dyd5SZq+UpNjEZ-`HVaYPMFK^pkXeOCNZ<6o)u
z98c9U)N<l9u$z>&ewfpgFCmrtLh3GOMznq9F;U9x7w!374{I?hR)nV4h+$Nd26}e7
zJgCue!Lskmx#874Wj3X!tQHI6kbC=FkKb`NDXrmk!0Pn(tNTWV#a>mOG|=4XC|mQs
z+ix%EiA}d=<XZzaZ8xrlKs-1wXVO|j$*a_ebzf^T8=sKBLKuTpRbe$;cMjqa-XxL3
zXv4x4Jx3z1paWza#(bCu51+SnJjrc2`$=u~&5WAn@mD=+v9*PvW{9&1;d+ET&n{SN
zi^%!wG2>y}{G7KV8uB0X!8^{td_2YCqd_8`=?K0PoEBN}IXl8R?clf7+;-h?myfnN
ztShXC??k@w@E;QepbC#2i0~S@quNgyuRldlj1s40uK0;?ziQC3mzb6c5fP_cDr`h9
z73VJ9oPl<}Da@pNSP4jY_ebc|W7C7vdz@E&n>8JC2U^Pde3ivhY?5p1_kARAt$sZh
z)?B<^YDM~`_Q&}7|4@00y83^W|7TQo1;+?xQw?qN?~c!t{_Ysx-^q-Nv%@FCK#0s?
zi)Fcj;KmXekxT8^+~-N#0u^!#4|_4rg5!)YOM+$PDgt^IS$P#Z6j$1mcV~uw5|*z?
zwBhl5=J2N{SCuaB&HluJVtM<RTS1k}e?&t5gvhhwfD*npuG|Dow3_uWx5s!$GMKxG
zw(b*%FPm&;F-yD!vG8V1zu<&YV9sR~5fH>+DjISgDIq&RF69Wjm-bK8$en__600Th
zjv+j4Vp#v9XyVpdV2lQ*WDA}xzK>Q335eQ%wo6x<uQYziw*sECYw`KKeWUP_J*EFc
z6mhibWaE(8YIg(2m+BAah;M6&?t31Q8&2a!)PwKwo61(yr{b!W1uKNInY~&gt+s1o
zQO&QBa71h#+BIkNpi9?%UERSq=6-Ay?uWSI0+7v?e-HBS??<&+n1|1Gn)dhGN*_D$
zGCdmj5iVj=U!+->E1d;LX|Sw<Cotet<r(FkLvtje(2_>faLudkkhz^QF89RAswcE!
z2QIC?pxA8-Ihc}Z^O&NQe21VFEi`mQ!#B28bAH!Ok=R%V@xY1Ji%B3iWyQ7HPV1!H
z(2XErYAOg*QvpuD;j#G5)qM-!cXjI<{RIVW8=duzPS1*V^a)Dsn7cqi8k$A|{*YyE
zhFK@I7wPD{HGEHkLc9hmE53NPB{#CBB`+0Xd8+ZYe@U~vWoquLzAGkk4=9RX8o6%A
zQ&ONO(j3`ztz0;a1RAj=h$<GHC*bOQqzD2JWOI3nxXxQ(l87oFs;h<;K$zS6j*xj$
za_xH`fgpA{2{N{Fu{A?{D7+q|zFf3^&#er!bct?W9jAlb*j<!PJi8G%BKuv`Vl6JK
zEjHFVIe#Kx$FOCy<VfD7<)n*KahWl}nOsMMI&1w=H~kL0J{Kle`#EENiVqbBEhYJQ
z*oYOTT^mI#)F26n3DTf<RJvi^Bj?p48qk&EQFYNzyN|!bF}zfqP+jZDX+o@T>s9C%
z;=2sklN<Y5niiV|00u@ermf}(q3?VUfUHe>+%Q!zf14`)ML;RQTEH*bM>huihY92N
zq|Nii8i}lZoler%X@d%*O&h;l*4mM{an|?-qZ$T~dlpE&t=p$9mqmrGn>?u0E!A10
zZfvL5Q&Xr;rIY%jx&A$KbDy`K=?kQr`BPn>l7G?h8SB$>ac#95+m7{Q`)zcS*P4*u
zFPG-Hji=yx=#lMoL)V>&^iek!-1Sr=syFErc2oKNC-yhXZ>}-l*wjp>rt$zg-Rz$$
W&Q@q<s&W7T8}*B)W)Fgc_V2$S_#;sO

diff --git a/datasets/common_voice/dummy/as/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/as/6.1.0/dummy_data.zip
deleted file mode 100644
index c47e2a63cb7ffde4f29da96be61cc0a946829e1f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4097
zcmeH~c|25mAIFbnYBEHV3RxONM#IcpDak}KLYBdVmdjYm*oLuG)?~>ti5nW*xHJ+f
zEpu;llS*lliJO>e%98!sANy13IjUDkZgc;8{<vpe=gj%*`#!((`F=mY&#^KS5*7hw
zkIoWb>rXddE-*kAa1HeKKJMsB3Lt3`ZC3z73?+io>=}3r1_%fb2myfbAb7D_vWW7+
zA}&EF7Yfxk&>y9(fk9!>STqWQMPV>}R?k20yx<X4`K(Uz{|pfS6}QymOM<J00*(33
z*1<y=0~rDUAO<cB%|9T3N+SDc1o#JidQmA1oYQ1F?%Nl|N#Xdc#z;J>>r&fS;VZy$
z(49G!$@UB}WGb&u0qpeiA$i$WI>a&k?N=luQ*G3VZNx5uR2@DJc66dOC*x!7i_t7}
zZb9c^Y@&1LR?CN(*(_0$9ggNdk7cpeUW+K-kr|%(Knvv(tP%=MRqF~H9^FlBfRu}e
z9_e7qY#gx9{QajUvtCzkJYKi+w+4cPUDkygCZ=iG6g&p`X8P51$H_~?99vr^-PN!`
zrG<59tlixfd%fwp%|7FzsGG$$-s`Gw5ucs$=4~e*Qr}f+Kh{}(BUvR}kDU3cKJBl^
z&Kn_uPB_D<=wTSiGAL%)Wvarc%&6?%`{L+U`8KOH3M-*px}(x-(u_$StoA^3UDU8t
zn_eG}dOu0Vrp9hC;zQvG!|>F-%C=hnD<7?|88&6O??&~EdXBVMKi@mTtaU)AlQE7f
znRhVt;;uPxRl$8?58PP^k1F2kXr9U*>hDiXZt)95gkoBCQE4%)^*4HM(p!9aC{3E>
zHdVyvnn&Ka|JrpKuT2WchxFI9%ctj;ym{Qx5g|0&!E-IZ>Q_-S0$adC5CAmzZvj`g
zpdU6svaUsyzC>(ar=*U+tNtf_4fgM-S0#C?w3#u;T~NXXWb&>Qn$m+mPA<EGH73a!
z$Hg*yW~RFpIH5lKql-BrmWMCsZ7`S2qejYduD9P36B`-7{VMG+oULq&lZCe*%vNrA
zl|haxGLi0-KZ~pjEO!58bobTd;$-_=k%yO7V^Tbjo4aYA=giWAt79zvNzyI`MGry>
z%C9MQ+d>O7YShd_vFS+<V==!RC@r7RLTi}EiDho<%Sk(k?S7P0c58Fr%<KCxwOZ0H
z0h<GBCZ9;FLeD^Rp=SLSc&PXb)T-uH`=AOhQ24M$l*APYi&f26b&boa2RF)`gWiN@
zA&xcZzUd3%^6F)3VbO)*Rl!D5dCVMR6{C)Q6SO!oyAq;&^n_x<_(W+)bm<g#bS=AU
zYmUO7XD%=Ei}hV6q?c3~@p0LlSbG@G)^7`0Qc<8-+Mrlk{EHPt@*;!A=H~iUve|k4
z&dj<c>m{XPL}UB<)h2{D`N^u{IBb*mVoID`y1@S5^_8L}AP_M*vt@`8C8T>RQm3YC
zeUSRIj&aWe6YbZ3?wu&JC*C;M?&2>M>64gQ`0nA2Hw5tugAB@0!RE%zfd|IpQ>^g?
z4nql-0s~N6m2MZ~E=I|&!BFD~8<v|Qd@nX3h#GtIf>$y;iY{TWDioOQFlODJ)gdqs
z&jg~Ws7IrEAuW#P$AWSt)NB3gdGu{j32si}du7dEdTal2{0;HX7}>S0@%2W*td}7T
zJnx;k!6958CvfM-o&ifrOKjQTfHy@U_}z}@+{6$57tdAobZ0d;@3g(slj|cU<yn0;
zsvKUb@1R%)C+A;79v6@_pi?(_4V)-NaNQwty98-xlgH`U(H^la4x6$s<<gz3?T~`8
z9WTjFBhhfS+Wx$gjc`ShN}es&!eXX7@UhC7LB?bV<?_u+p=pJ;<j~T?L+=k;FdXZ=
z*3C48W|#kGGG?56^-gX7Xd3QmJgh7PsQhL+=L@dMzu*CG{sCXLUc#UM)%yQT>r*WI
zm-^t=Y!06PU^Au!xVuq*BsMD1K;#5kRJg8B)of4t{x~wZB)moTHs>xa*rorOsNqC>
z+I_3H@}0d|u%?zN0+JwIy@J4jHxsjuH>!_MeM&uZdq!R&e4`6!Lytfk(&oP_$v%sX
zXo=?Jq*Pc`p(SIEj7o+HO7eco*<s$%0xvQ_!h33WEq@TFk4{BwyI3rN-eBKTGDR2`
zA7Im59dO)9+3E1~QPZtq4r*6hKdk7vU4Qy{s~3TBvtsas$2rbWN}U$H7vF1hw_q&C
zso50AZQfE+u_ND<ngq$mV?5mPh{vQn1c$nPBV3y9#|$88r6p`SKbC5Zf@;f{Qk=@z
zr|%|Xo>CFl*zOEOBovSLg-B~cJB<*+wg;UOQnRgz`p$HHohOlAeYeEwE;BWb+bPCL
zk?3+<m}qv`(e@I@)ki3=k9GDpy}XzBZo~W2hu&2wr&s1wnB}ZLa`Y*#T2ITdnwDsy
z<rUVZbF1Z~YHHA_6TGPK_~0=13ue~vuZ04xKxBCDG?C#acTBK3G6MFKCAvyj!RK~T
z!vK8K&JIJVns#%oY9bSJMUj^n){tDUl8S5d%xyJdf1ouYw#SYgK=g^*@0<&=vq=!_
zxv$3-{w4h)D4DV_4$i8&C^riT$_p(OzoovwC4d2g+1mwS9P&Ag|AT_cz+Ax(21W}D
z{!K9dF==rupCb(zoadADDWrq2*^vI-WzEgn!ifGE296%Uj1&g{=DnvSmj%M+1$L0S
zq&jodg{g2pbvgK{z$f)<M*J~zak!b!WC`=l{KjEG@^@m6mDy}B&aHOgE<2xGu#9i=
z+@Aa6<<3srLc5<2O@Z<aooD|1Q5RbGe5&pWzNw3h{O{PGM}D!#d}f2447T!07~kw~
X9cL*t1XK<HM8Ka9*gat&00906FP6sh

diff --git a/datasets/common_voice/dummy/br/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/br/6.1.0/dummy_data.zip
deleted file mode 100644
index 7aec9e6726100e345802184675d12e15fdddc384..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4057
zcmeH~dpy(oAICR$HrF$^iC=PwPBZfpi6+I)2r-)?*O^npIM~)jr4+fHq!i^&nsTYo
zRQx)XOG&7XYoUbsNlTeqnVj#^BX;Dp{`vj&+a90qzI#0Oct5|N_v`(Be_nIiDl8%e
z;=Kloe#3pf`Er4PR)9z}3T2-+$(QPjB;X}M!dY6*KD<TS3ju*e*uo%?$Xnpzv}_P<
z{vdvIgkMP5Z?tfPi2({>WN2iFKp7!WC;_AA<rDcHakYTaiDB~q*&Ot8j}HTOEdgrF
zBjRHHx&t6!5J*l41TqvHKn?RH2OChs>7NsooQ3w8I*a~$qMBM7DK&;rv6tl=qID(4
znb)oV9cAU5f-pV{tqXmeaSPKiqD_eRia@TE`)H-`@_4Gfe|8v2?G2=Aa+7FW4|BNC
z+|#XiYHTL@T&KMb&hK#N>aK!S9)%%Ifl0_%l;j}s@m+23Ju<#Gf&u&QO3$m!@y<|J
ztp4n7n9io;jPiELf>q~8xGU|eDTKpv$0Z!4&fS41TD*K2twE8}#o9LXozyFQq6tk-
zv5KL2j&QmYjiXH(tOD5>n!9O}SKPm-t+f7-3QpSS<7u4Q{VPc=qHyYmqkksv=~WvY
z-{EMFx;q)c7;N^=#w)>RDjhlf7UBUb_KLsX*_W(=mx?6LnD0LyUO8(sBgU(lg`_JP
zQ@Y*&5<CGCZy<QmN&fWjrhJ=~W33_7B7HkPqEE!GUt628F?2}$410amnP(RRAXsj^
z={r<etnjuw4AlsW^U37aL6v&w&+6Md)t=bEygK?yG;VDv9c@HjNVsQ{TC8>w>v9ht
ztxNGvz4^r4K@+QFoDn~A4rhqjA7@`{BlJt0LB2@Kw%oL1dImTY3F_O5SGW|;jx`|F
zd+k4=t7QFCHD|y%d#wu-H3g4vY8Gd-D-!JQorPU!4*yS`x{Y{m%x$-Q6`w-A<Q<V4
zE_HjGXAX~)=NA;sY|t>K{nC^4{Ph7Lh}g4tVslJ~X)>ppW<mJ4w{wDPI$^qyQ*0Iy
z7jl58?gr%Z6R?a0&newEhzxYRKj~{`d4>FqV)jqo8mxtaT6a9uhxd{cx8!|l^zbS5
z!ICcZU?UUh_ra`jEw%E53i;?&&PjEgV-q**^MVI&T#i6qHENw=k9Ep)p9!)YncJnB
ziz$xsRAiOnIraXSQFyA?Z8wKkw9;m$;QExp5=>CCL!%BUpn>tIwm827S0F-clqtEZ
zGD7T!J87DIa6dX#a;BTQR^~)jc1r#Z#_3CKWu{EH*6c<6DoNq^O8>4b`s(_Ex5}sO
zAE2W-VdbyKhi2Vbsx$AR{Z3Lg#*8^%D;2&yS`+VwtSN!Ya@nU8xrf+B7KV}Cp=f=a
z`-ID$iC^<oDAvzmJ07H~WP8;yQ`{k~d6Os7A=V(GuF~|=*i3m%#5o0V6Rbx?#JZ1m
zQKW_#rPCutjm0yhj?M2RPf_*IE?Drbnukb7ql0QD1Ik0Y#Lu((-0Q*p#{Jx~L)`IW
z7qf^pV~^g5j>#^_$eTxtWkd=tBh^2g`c=VY#?k*3{GTZJ0-n3x0cgVa!1{p`C4{=$
zKkR!FQj)d?ui!P|7_SL;VPQIUKWmvjkQRg4{=Ry+C5N$5>_Ab5rgB2Yi!}o=Hm=|U
zM&3ai9hGU@(O%sYiJf)bBNNmYGF4_hz3rA-*-e=g!gKWmC?nmpuM6|K8u}rpK~6IY
zuYWV5S#%98@y$9L?}Ol#Rwr_O2*FnC$s>wa#YlxtCH;oRvb{r?!@dK7WtSftvNFQj
zLQ@_|yXz9N_1e5{qCP%7dYa<KOxlAaGML#yZBXoZg-3n_H|*v%renaBN;1oQln%Yt
zz(^%uC-)CO-?8?9eM^!-qDC8i>af|w$v~aB^SA)Ct%T6C3s;@DG(!tji+|em9)F77
z<lOWy=k+5P(Pdm=;}JrT{86dnV1<MvLg7>?CdDNv<AS{I8ugMR_kE|4gvng)#gEmp
z+{l%c?E6z!EVfOeXP?fNp$|Tu{=?W3%-?wBcMy7HKr%_d2$%@oc;w)v6LgvWqa>U2
z#ES|10E<e;yE#3Lg%gha2{)^WLb7G9a_;PHhfKSgZ81>l!m28X*q*#I_dfMGsWCLG
zl|uFC*%=ZfA3nwA-qx68Q4g-2v%8g6)tSdMR~Sxz%arOV_V#4;qUAwrT!iDK-L&C&
zeC=h`1Qx#NcYWBa_3dbwJ>eQT;IhZ2&Ey#8nZdMN6U-9~0z^==BOOT*iVJg+zAm4d
zlUCqr7pHSKEwh+Yedu9kZF6<@7F&Ip+zb}tW@G<Aq(hbCi#C&IvggTW+?Jk;Nkyh}
z0i0iF8bp;nWqW&0Q^9iPxL3Jgux{LS*TiH6@qt8DjGpRd3EGOhbqc#<&9rI@O)7FH
zG6J`n8C_G-fy%)Nu^zT|(Z;NL;p0Bi1(rE;V$t#LD+d0sxKuk{$KjSme4t-vMNX&I
zp-abyDl5A<cDK`SboPx_4Gq)RO~_aGOihfx$>FFc1$VR}GA-FN#90VW*m;-Bo2ZLj
z?mEGR{UR$lgx}`zv|Zwd!9p6s%h_s`FMK%=+lqI~XR<}-P4*82)CMgCoVBvy5BLv;
z`}?G&c6))uw#pZf^x1p^FrNAT=C&4GAiwpVhXD-;x^P4U_*j&uWw!;u78!AXx~w`2
zRDSQdn7R+>d;zIno#^+OOTF1*W)tv#N<ik{<_tjohD&qW%B#hN-SXRs#pD!GfyoQz
z;``g>bsYZ*xESgxE--Y_c_<i_e}FHhN=ph%T{6!9iM`y6nP&r)43shp_|cCE{B`0i
UhZYs%g_Z)|!6G0K0mA$C7mv2Np8x;=

diff --git a/datasets/common_voice/dummy/ca/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ca/6.1.0/dummy_data.zip
deleted file mode 100644
index e823fa105b578acea1ef40baf4e94a9d9d5038d4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4163
zcmeH}X*`sD7soHlWM9UT<<_0`78+X&gVCU}Rn{89SjQGdW{k<2-N+I{R3cl}7K1jW
zvgDzxg=AkUDx^|(i+Zl9J|VfA``z>6nfc5$*Q@jWpYuEC{D0=AOw8;6?en<8-}39r
z_Xh_c2Dk<K`W|+4b0#<|?XzYBm~y2ooM^v55(mJ*Jj(<C%yZzweBB_@tAn@_VXn9U
z|3EwpuBZZ2RaRAosi?wKROpOeQ0TJih_ZA>cl~~Ro-?}Z9bW*<nic3RaIrMNIa+OF
z002Ek08pkMKnQU5@>3+>iC+tqmaFUZ?zryHg=$Yj`ZX$Z#(FFYXrZ9fom6!qOly&m
zVP`j)-Hv;!c6+84p$!^Z)uWd+awg|*>GZLW2?l4WVt76wn~nnNnJDSLy9-({>zyZc
z@8Z~H`DEqE?hFa`a&c_V;&WQbTk{C1540a^b@`$Or|Y>!2;IfJR!o<QAI{t<tc6CY
zK$Cikx(4D1s27w&qF457v=OIjxpbqvE4LlL1?deI%6zH<gLd**1m1ZIz5SO@=t1Va
zlbW;7-HW6eEaDYOxFP53HVXj`)kTt4aUKZs%xw;MtSi;;oh15metw&Z(q{AWqC`B#
z2IsgBQ&ySdf#4)e^hYb}vvD&o;CwZB265t9B0Y)Om+N$_a@dF^>2-g5DnE@6j=vIi
z7kx`kRXRpQdYGv5{*r;KmQ=>#?WxrFn`ynZEY5wd3@QQ?#|IQgk$!R9u*Ba=W2xuY
zD9)>MAAQC?m@LUjRB0(Un`WxS+R7TiHIGo{gElQPAm83fj2v)qXR((ohBu4DJvA&u
z%=P&`!jS=m^7`RtM`8BH_Hqz;M&1*0$CEDddN`otaF-fw(+Z#;x<y?*o{5P*Q})z;
z)viM@z(uQ{#y*JH@+RCm+M+E@qfc9dv2iSZ;2=;D(jTP8y{TL|wKNlgsWx01&k+%v
zkswWBkmv<o<|l#eEqJM(80Ek|&63Re%2Uw1SM`QhEg*=N=X)3Lu}Cw`Qkw<XTTfDU
z+bUYb35`qEn7ed@c%>~~i0{iVtwWZ<<&DJ?A`~87-;q~8ihJplYFpw1xu}<BM^+Hj
z)S=W^v(?T$zq4_35o8LDc_;8kP4!e$^r6v6DVP`uWyfvU{GW(ooh{7)h$XD$_GN*L
za$PKgY;0m<K!MCafz;?1i0JI&1+E#a+Ygy&)ANf7@=Y^M5tHDI>%S*wQhmbV=(|bQ
z>JEkjce4L1Dlf1mn0>$?^c6SWPt4e3vtW1UW!Tsw&zp{RP?n*aO=BqSbdp&Ae0Yh3
z$(O5Nh8vQblHcl=06XRc4jS5!&4%GAxE=wjYDj~q!JeY>UOkxvMK=e$IzmLy@_e5t
zyK0kog(8F#LAkO2(XJVXLs}G=ASq?>TDqkQOI~tdDXBDbWXcn3y&=hLdMhwm*2VE+
zu<t4Uj~>DC@^rO6*~0_eQX>&M+*&sWk6@S1oJ;Nw|8jsGrjY!&sJox#QhlmzLkHhy
z_0Ui~zaUgcR#1K{Um<cn`Lquu;+m$jAnU{))pDwwrd@4{t^RI=H;Zv=X<=fmH%}z1
zOq=w%#MYvQ_Fg6%g$fBCMWfR$9l|D@l-?4Pl&CWM@~vt1=Ue3*6ebI^ng>fwx^6G^
zNA8n5nHAj`CKkibzftsIr))-eN-^w>PM<EcB$6LKhI3b{QDRk+iS>_t{n%;OhWJ4F
zD;Ig>NP9l4P@J|eS1PVVzv2Wep72BLb=Kwo#r|Itd$Xylv>!O$3jgkM#1TBP0l!t-
zLOXqv1ZUEKm46g$A)Z6?(Zd-93G&mjlbsbO4%c}Ej9I;CuCyII5sBhbBUE_hx*>7=
zr&w~g-^)ks=X)JGG;-OByK%7P!{w&B?7GUWb$XIFyq+ZcZ^RA@#VWP4v>fAau~zXw
zBZ5s%xMm)?u}f&zY>#(o!qv=NefHofBZj7Dj3gu7l$@bb>tcBS&@?9dIv^=!bpP@}
zKkmq~42&~rMi{{sWH3ne$r>V0?IPg?55#P^a91jOp1SuawrbRWfY^YZ<P_c%^F)SP
zGhQu}R&X>d!@`bL(}j2>#S(|*G$Kzp37>mV+0)B*g%5SwUAT%V2IJz9y2;dH$-62R
z8eDM|U1GfxBgwNHS^QvP`n<Wxm$W&Q$LI9Vam3G`YWI>_w__cf=JW^U-^mvC_RYI^
zAYNpv$%Z7jClIOgoii05J{^82qSQv-zdiUxz^B5~=$hks0iR}C-sgX$oV{V+ijkYw
zjHsQ~SYF$-A->!_RR0vTKu<tz;q(u&m*3jOz0NGRcN){I52Q;yBr!+4%gq=gh#=oK
zP62qxt|jPa$lxk@)Z1aA!Aqt*V6)l2tguo>jiC!KJ@th4Yx*_fhZh;B;jno@!&nQU
zZmwfj)twKB$oiruhb9cU(}O)E?YPdmIc}lIs3vv1jWvaO$2uk7f1q$Og~IzNBV^Po
z;V;RZGg@yZl4PZ9`9o%v6Q$ZZVka17W}_36A=S+Sy~h@-1kB)!7hYelA30aaH0eZ1
zEyIuNjOy6$HABLY(F`mfYyCoeOW#9Cm#X}mA;ROMs;^ER+8$nSb}es5@d!^?Xw;Iy
z8;)fwOe^b3KdnxGS-%(63%A-((ki>gc{4En$+Vt<7yi!Y1!nbVudD1l<eQ!U7XhV#
z<$z0iJ68w%2W$U((pqD`Oq$gzq?7d3=7TVr&HrUu%RX|o$^QleTOTNjWCs6M)M?$c
zK-db)4^r1vXPLU%y{@F@gZq?D>JKmbd*)h0wvwsOOgHnVDF?~Fux93_v|e1!cC|5B
zNmkoHH+k76{61aU#I3%dRzf9M>4vVjuIQt#UY{$e@7d_4u34u4#Qrw&Yj?~yHrUBv
eE1PrB&Hj1gtcQkx$^ig7`1J$3=OzdMfd2xnX2FsG

diff --git a/datasets/common_voice/dummy/cnh/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cnh/6.1.0/dummy_data.zip
deleted file mode 100644
index 9c0004c00ee9597beaf91b600870f6e7ad2fd337..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3886
zcmeH~c{r47AIG1;*kza9si+Vc=dv6e%VggbLLAGO6Iq8b$#Oz=g)y2g<&{vjh!~ur
zn4~1x8)Zqh91`!?lJGv0u9zdIasGa1u4nmUuHWarzrW@Fj-?4b11muJj1~JI|MK$n
z#s-K2?m<32=bYVL(XK~rtXTm1tOHgqlq)Eh4S+CA(*pp*2k>UOV-QK|AP9^c!Y{x-
zC{RvW9ww)#pr{}RQ<Q_jXpEkd>qfQ2!!$-m_<C#*II}f(T0bASZDyb+$L)BaL~a5j
z1OT|`006cN0y@Ce(^noHi1{KFHcQiG39tEgsajK7tQr;AW7}@Ao-7;bb<k29GGr3c
z!y`D$aj$~1WmvOC)f!s-$6n}tTKK5+$wRjPlVgr**=dvnRK*P?KG!$6EI+(9_=i;K
z5nErZA6-JztzHcAQk+*wZkS@~t!)1e;fB2UROZi0(J_@AxGFZTkg3~bQpZW4!L$iS
z#->?BcR-;h`8|S_?590g_~`bB&I<p>EQuv6Kb=lKQRhO%(`G>Tu~n!}XvxaR09@ll
z%iL&G?9lX0as68N{ufoo!jOvh=SD@6^*z%p-Ehv=C7{(w?eK)GmXa%xP7?8Mj=bx8
z+gAZi($6YkmkH>EyCsgX=_R8~cb_k$BaX~ut{?o&NGX_2!L|9OZhr*je-Da&bQglV
zBQZ3{?E}iK+h)pN<l<_dCe@R=@*9V|o<02Y1+A%Do!ZZg4!K4-cIpK0PsoT95|(W<
zaOS-o%-1_Mlp*+na9*BiCKd0s=7e$)3^uQY@Fegw#rV6UIB=OqrBAhrVgv7vqTglY
zG?+HW;^}`s1{=%n5b+~C;(B`aVAZR@k)XaZ0_P$40WO~->V2w7i`;<*dExUD&{ade
zSEu%tm~+Q;#VC83^$TVrui|HFF4yG$IX`>`9zWpYz|nuD<f3Dbi1``fRJ}<;$l)UE
zg+x+K^Le><8i%bzZ7d$>uyH(PW?l1BU+?-bH?peRtm$TBmj^HEdqd~EDX?Ld+i?rk
z03jJr+)nUP+J(RvS8q?S`H}A5WTsT~cdFVa#l-iK0y~s6DIFFS#j`N7@3kxgpV4D|
zH+7GZ#ZNX-zO2^81`*xYuv2b*tCa=!{~M0EjwmyGE>_(|`lH$+e`RS=WZC<J_qEq?
z2yFRZrT1rhntINOjhHCBO{dEr7lA>_<AmA6W||y-s^EESEUYGpxE!4aL&e)Qz<KwP
z6NN&3Vq}7J-}FQvkj?ftqz(3DqO5^bl$f)=g$}!Q1uooaiRr_BOq!48GMt%t_TFH7
z-CPw}!>2`KAcb3g1zme@sQ!{l)c{-aqNbg2QW46tDbm4@8K%(K8_u~;g{96gvGvhY
zL;mt6ft)=^ksN}pjB31$wyY7q$?VGvb&mPt%^{_oCnUYn6m?mU^O3}LI;G2GzTi;p
zwid}S$E>$RocNq`oWy2gD5GX;Vib0f7&I~v{VgXuOq&1a<lp7QgI8VLA8eF^yBz<1
zXb)t-50ip5^A&-y>yw^I2S@yry;g`M{Ti)flr9x~8}{Cab>93}?Ub%-)_d+w<TxY%
z%aJqX-p9sK!ZrgRCnn2TGxWMwrsvBg$d*Ctu!j%A1cLDPW2H6eJ%v?uvj@E<!<@#F
zT16GFbT?kbU;4;b%s>=8QZ<~(*&T+<G-dlG9<|SP|7mzt`PK9S{eec^B0W2Wk)NNq
zb!gsCY=&Wro4f*E+dh{TY7{Wk;O};?lv!E0r$rLeGe<7sl8p><7y1@{?>QUG)P@-8
zs17GX$0zD}CxO_NA+NY|<B)LjHTcb*5Lw+i)i}cAb_X&yR$8U7v(~y#^qzNt`!m~j
zqdfY~4B_#GsfP6~Qm94c0ACLcY&2mFp4~J`R2<b!5<DZtFmH3m?upeQW2u}%DZT4!
zEri|+tD~P7msUr@H4<tb5@X&@AntS4pkpCg%MpqgLEiBkvDtS^cjS5TaVJ$7k21Bt
z6w>M=y}tsirbIlXZnuuEtg&J6pm_<VS7jIN=IOhA67JA-ttslhlmo|WoKLNR*zJs*
z1oL~M64{97IIm1f;mK>}ASNDTXYvcqxcS>^rfYt2{i|qeftsZ?f>Qu|P;+X=7*{W;
z)B<Vj&n)vcO{%=(=|h$~gW^UQ4<>uW8cJWZJ)Au)=^|W0*b`Lmt!k(&F2=@dg&quL
z$ubJvo2hfaJnSV)U8`DLa;E;JW|xbQCjx_|1!64NiBu->sA=ZN>Y#xKsMGi?ufJf8
z0>iC{0ghT(`2aq>$-GlN3E~miH_R%C>o?D8&9?>MLX1l0Zxj?<zz6hOAJQW%ET0-<
zqe!~p7%3C%t6BW=P~L^4Uj_B_qjpRbRkxW-2%R|nP99b8>jJodX&uTdm1Bi&IM!VR
zlms>duIlJf2mA;3`eV{|@488v*2$%jwBcky8pX+ecVC;firU9+(7?h6@*<(&V~bEb
z?u$a(;%Y(cj``WdQoFgWSPVE(X~ll?c0Y!0cQ{+2B=G+Ut<b-9If(v_*ReFA)Z^xU
zsn77OXn!c}=*<)T#~Y^f9rY=-mCC?GJ9W#MMH`lSf^NmWVWu6sZJz!Ue5YBn0SD_D
fENDgWN4re&x5={;8_GzD%?f@o3;>YxWf}hi5-4I7

diff --git a/datasets/common_voice/dummy/cs/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cs/6.1.0/dummy_data.zip
deleted file mode 100644
index 555ddd9291d931584bc740caddd92b54eac0ca31..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3928
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3O)7eg%I
zUtovJ`657Hb1+<8loa%Arfm}=BLl;CAT}W0f|8=d%sjo4;xceRO`K<!@O_#cdO)2$
zZ<}}6fQR)#q-`r-0Lzv&_WpOPIhz-)aLLS(+<0lxUeDt)3d=c))@ylOf5&REhkXKr
zy;aJ**E1GJY<PF%7T5VBkGAx@U;0Ta>F11k<-Ew8%Dx&Ak1J+J7YfMjh|OIdHeX}n
z*K_5|zBg|YDM>fk>iEG=efE@Ra_lT=$?SK9T_z_f@vdy{xTjViHRr11&4yb%6H1;7
zCFeLwO3MgE7`<tFzuQr<+|T-39fR;BQ;iiq!8`ZQ?sSiGXRUg{cqT2X>f@=RQ$^<s
z_0AXd=Wi1G*}DDxYt=aY?VsH=7#F?!wDDQn+;;|!tAA|Q`W;}aup>><Z(@G>PHo?a
z;;COhKh?j?4@xPBpl(_pwuA#1%zuGckNBWYNi8ELpqUi~SRc%q(<B`6N3USw7VU&I
z*(B!Gu{o#rIr}}(a$Pr3?c3g_sasxNK0Eo|<Y(Q#TbA~6o_KScJF@zK&_tPMA0wpK
zvAkOOXI@=*N;hkPqTR83D!--v%KgroU0uV~=GpFS@Sxd!@!E^?v{$lPXe&y!OE7V~
zRuW&JtMp>UO@+DNudPtzIh-;@xGafnZt%t8>?J7@F(G0XxR%RqFz}685UNw~ct_}+
zq6u7ECa7GS^ZM~!k;0zW8Z$PnDmSZ~_T=Y3&gZ|`bGK`MRlV3W{nOWZh0ZLAOXluU
zC{3;F@Q69N|LG3-bCq#_K9=p>dog^8=(VaH2@6-g>3caTD)wB7`}I9rzNmkYMufM=
zyriJRY4f*)0>k<X5StJm-erk7nZPn2H3c5(KY&Ragwe`@)Csml(uO>4*}p`pnfU)R
zOqj}DzO#jm#nJO|?7CySC-OhmV!t35q{+_xc*lnx>xMrL``BlmzxDm_&RN{=)i?LL
z?OyS-(lX|Hta<SD?f)V_1)dG8z8Zd5NjdXbXvH<d(ipej34u1IA5xrsbV4~4oEetQ
z(n)DwdP~8fG)Db{+#;{Mb&1b(=D9EBuruZD%=&s!`}l<JY*DR1rUbe7&s#bdGN?+%
zyQVO#@XXb9yOtXH|KU7W{!Oxbw(C8Wke}{tV86xxPXFh~xOe-P|NZ*!^{w~Yr+4n3
z!?(BgTI=sEFGIF|w|{>*bME`??vHy-cW2%Ak#0=PO)=*83zjYj3_dHuT<j?A&GF*M
zq#&pJZCt&oB1Z%o75X{cC98ZDZ*@%45@UJJdU26%3$tWS@^Y)BkXfRcDU4E9IeKZ1
z{CsP>CH8AucU^1#A5a&kZrS<n*O$ipf5!h=5m5*#osgqYpZF*&Ni8mc#~zNtX*Bj|
z9(!m0CZClAW<CKXlIoWHl8n?MLeY2j{O&?v-NN=@*32g19~zfx-cF2TpESWYTB%ei
zH{Ql8W~a|Y!Jw1Pt>>1WVR*@KhhN6d-pyxgf8&{e?@P9@O?EGu^}BdogvW{LU!o83
zo#YEX(Aw}_*Ynt$#3S3b&W`$>B<Z2f>zv+F>ehH~m1FdiQ>~7gw-*ZR5KY(;)Mu%<
zW24Z{V{2X2VwQ(qIFs}sBWCwO6R!(B{IMAi%feV#^jRl68l5XyUiyK}dU@|wzP836
zHoiZ9=6$*G`%-b5j(Jedvd9V7ZRTb;&N5h2Zl%BT%JJT~gSxM6kM2_JDOnOHrDFEx
zON(IqiT$A`<*)v#JM&oDc1q2n^r{0BuJKnbeOZ|Oxq>CCuu1A~;oncyv+i;%T{nNW
zTWWpcen!*`l(o|}Ob8eStiUKRBR&IV=3$nCIP%fiiKlac5z%)3jmSKO3x}t@QQg(#
zI5|Y=hoW-W-NdR>8`~F2f0>lFewFWT=gMF9O)NheO<W_3<R!z-sNdi^;l$*hy??om
zbU{^Iow6Kj-=+<77&}j?_;eOb+EhA++pa{1H`*=lNVMxQi+3T<n+v43h~JfRxzAgg
zyEIkke%JAYj0d+^U%e67!nbUzjK}L+*YEM%3l3;mG3U~nIg05_imTnS-NTmXDJpbx
z<#C8<oGt!kXFlo6malGCeIM_&JhQs>`1$FEzJ+Tpb$ouS+_(LE)=|~B7E812p8x$7
z_0(f|2?H;i*G0Df*G}G@Td;p$;MV(k7Ga;i2BpqS>FE}kxbepW|HXU0^+j)tp1$R2
z1q181^B=iEF#u^~f{HNYMy4t8F#rw!%zV^V7AO+Xs-XaHMkYCCT&-FOsQm&AZyiBQ
zlr}Caq>W30fhr6L1OGa@q8f;&y-V08%+@Z#rnioZiL?pa_61r7YWrgK7NR+Z+VX{1
z2Fxi8OBz{#6jC_h^cK)Eq;@dSR-Cbeuoblth-_;<FpCmtE4)ca*iOub5VD>3f!!G*
z?L;dOfHq@k5<$`;!ndgPH?qx(Sc$e7QJWL?E-1&LRy)X+Ca@E2DN-dwoUN!OEwZhT
wIf%9uquj-3H?A@UVmB})1Cw$pP!VYn3@>qTS<T7@GMWnr^H>-du7F(u0NlM=^#A|>

diff --git a/datasets/common_voice/dummy/cv/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cv/6.1.0/dummy_data.zip
deleted file mode 100644
index 037451f05570f46163441670e1517e57a90ae04e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4456
zcmeH~X*iT^8^`Y%V>cN~S&~8W$c$x>r4W+ryF^(BWgTW1R1`8%>Oe}iQ3%<ZP$<bZ
zXcfltYAh9=5(XtbB=4w>P<fiZy&v9tj{Bbb)A_rv>wljA>pU%tnIIg%@-<QvVD;tY
z+k+Dj0o;TA{KA~wU4vZZZEe^ACgN60m*qP+loJ3!CYS&KGD&|}tQ$mfZ4fkG7LCCM
z1mk2?<WRDT3W^G{C`DNmioxjFIc{q{@h=9Wqw!w>+*h>L`+R=-s@Z_9Y&WY=MD#^}
z5CB{R1AqeKfFP`^m%m&P4*zAN5{X(aA1`VBxKU3Mbj%tRV0(K;`PD&Kw_SKMYxP%w
z*C{F+lgs?h!jNwYGe9aJes<}NcBBU}f&!ZYm};gUf7*?1i=Jl7e^=jnGc_}dq+}Gp
z(fago^j@VFrO~vrZ_JOJZY8J%hdx9C>i?+%?b06;Hj~y=tZkJYE5lzw48_zH1`szZ
z-y#tOZYhf;m+Fxb*E5*Cih1%ns9`r^STj3;zqCiLSyA8YoU=Lz*%n+{)qb+J`JsWe
z%nc(@p0xMKmVLI}F?uXd+&gk<=K)rd;IKoi(Zf%nihRZQp$pGE1BLEXaloe)H3*KO
z?3hJ@x4eA~JI02Z*c%gGv%yin@58&Rg9ObOcmG#omGcx#r?T+#MCvv>W20kJH5Wu3
zyt7ryrjC-NI^~2GCQt~$!$B_fpJmm;fw?gAV;bS|5hs&w$M-BsafY07Gu22jjPtqk
z9C;i@>JO<tk(WGfJSO$NDAaIvj^BxAHdxmvb0Oj#Ym2~gTCV8Y9J@8aj;^Z{T~|5A
zb#?c^|ADGn+O}kT1|>f(W4g-5JLwFe40>H=6Ujw~yN=_M?KLFq+<^q9Ub`VlwT--+
zH`IB1etz8ZSmS^kskl5w9O3Ak6f8}GNcI%@T6a}R<=xqS!b)?WVoS%+#&*Cp35j@S
z%vBo3^N{nBk0z65c|jEzYN8PP%3u%DG+SA<dic7>rZ742eW`hA?0)|sQ6_l_EmD!n
zW+gCnVT4D#NkLPeUVb&lhV$SiWKxDbd7C7MLPw(hGm|({w6-mOSLh^IA=<QgFE9Ip
z9Iu<aO`3NG2CJh7EklGql!nksb0+<I7n%p}Je{RfmZn?W=j=zr2HLz{-!6rJ$dcFD
zxoDJ$$v<@=mJ&OX#=n&vym`hMWf0gr)umxMr{M%=3#v_xcZO5jRU7eoB+eIVzAXab
zN*!qyb7M571STjhJi%c}yT2t?T>9|f8w~H@ihKq{H>;W94Cy$!e2R4Wlo^)~@9OJC
z_b(6k?;<Xj<u6LpBf<1cI*N<XL$n?S8pW0l=Chai=0|>NH|+E?5a{eLHcGkIpv`_N
z*K5-R=o3nyu=Z-Ym3$MzSpIcb80tpRM357OaGOTSQ4S{ChWDC;y!8vW@mLI;wd*J@
zw0FEX@5EE?TS!jxvLx>~Xml)})4{r~i?H)9Th}I^8uJ9>2dKZ<pj_bmX6W%qk!tU$
z+5_js(eKls`?0O+2Qm+wam~(lG`S*l#z*o|_0`8%l&pg(Iae5mPRV#Vi>(%I%fw~Q
z_<M6nP4(?Ys_vh~G}bPSw?v3;SnPI|TBMpU&ddjY{4{^hS-<PX+=UlnyBBVzzlSe=
z9IZCggtoohWvVxRI=VGbk~60q4o|kcGCXb8FZ86<U;x;s>Wzp~la1|C^+U00pN$d|
zxfVs%1?jf$<lM?q5Lv%jF#}u1T!-VRuN=K_-dzN30BslZnMi0gc43Z9u#zycS!%0i
ziGR>sDH5SMn-W1ZfC%u|y-<sHr9#H17S6`L=$xY70Ly?fnWyHtsqhuqm)&@^?DCAu
z9^`=w`mXpoX8eD}|0flHKk3Q$kY4C{e|G+2f{u7#e<!(LqPf2V3fA27bi10)@Y07}
z(6SbE6hwn!QDzzov$rWJ%TAvx!Z)(S=@PNbyABn+Gw8DQeUL^=pYHuEB!$k9CA?}H
zbq#FvHyq+h2q$u<On@==-Zamp1_fP}2KxqcBw(>Ey#gj>a97_t$&Oc8&&ScX?7dZ3
zP7X|b_y|*FUn*Ps6@6LXsqQeZaEgT9)u@nGN2{H0xQ2AdBGJ%>N4{Z6PU<O0Jz}Ub
z^*Ps}%_$OdsgS4;9s5MdX#7sUg}3KHr%56l`N`PqYU0z4h}~o8u<du}OG;TsE;hZ9
zfR7bM2qkB^*CZX6+2=reTR6@V96r#!;6%)N^ysAGX>9i62<-s^$3R3+i*bEn(1^C<
zqsZZDUScKfmU|Gz(^~cKi5-0L<>W2x!hG#s<~JOvFv|mFh^%XDuYFC1gomtLw_1;i
zhTDkagmzp-m0ChE6^5V+Zl`~mr<2RAg)*+N7)J7R3lb0U+uuFuQo#p#3L1?)*E5jj
zTMU->P*x9ARxvsUpUJuKF=p_&{865J!)P1LXPkMZ<45lXMbBgc$4h*oy1P!7(Mq&%
z+10eAA?_U-PHeQdohL3OPdsg-<sNzO@)^9=CB#I|C7RN6<}f{HR2X*&FaMv9hjq;2
zNrI_=g91#yXL36hJbb@*RJm1j7c34&$s?KS6!hcCTl%2cy1II?h~9v!W=DD)LsSQQ
zvqt7z90>MGGMcH+J#~o+r-UFg8gSV$DGeTH`NQ{fwy-L0P=brbG1sNLJ49!)_?|<=
zbBhqfrLXmqh6tP&j9V_>im|>Va~;Pj$E|3PS8UlA;F00pjdDG=!RDCwJtV-zW~Ww!
zRI0I{;I>wL;1HRmj8F=|1&?oqQ1}JGf_^b>M81Ne!pZvwbp`9)nkrzr)+z;~QB&jd
zsoggpb;wLwZ4Q~vAfvr8&qEtR>n^KQ&Q&d`o665!?yE4v+-&s1VLYd!((&5TFLLcA
zLgM@j2ln)vJcA-z@9(ImNtMFcPQ*EKm+yNiw})zHa!(B6M8;{jr%8Bgd~n!*FEdov
zy5H~E+{qF?E$qX9g^g{M6N6f{pk?vC4js!Dzk2A{!MJ$er1Qe8?Yfus`)ND@fkm0t
zn{oud@nOJ(&hqP;nTPr7%<~5UC4rTIi#mJP2K<-#=l7(ar=Jzlgia2Fq%X4&9kx6R
z{c>3=9rM~r=qrp~WI%2tg#Nc`pVnO#9kyyNqEpvZXN9`9PhU-?(yN<6>i4ew_spN0
z%hk+u2*b=Dyo65vMVo11yqt?Gt6kgQtR^S1Fic+Qc7DIy<%(N-#IA;(Vq+M(>cC};
zy4IvuQ>*B;$)K=5S@yrN*YlXKY<ecsQ#qK6VfK%XvmTn6t{ebx(0~5)>=~v50N}rr
Cg>EeX

diff --git a/datasets/common_voice/dummy/cy/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/cy/6.1.0/dummy_data.zip
deleted file mode 100644
index b9cf883295201cb5075150393a9ac7d2e16854df..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4182
zcmeH~dpy(oAIFC+%@)dCG97aqzYKHhxJDuxayv-wn{pd+Sz$sonrfQ4<X))U#gLXu
zJS1}PD})nLlu?r|6jJA_dW7V(^Uv?E-}cyd-@o3^eZJ4n`~7*%0>cI70WH0r76n*+
zzWI8Afh0h<5IjEA1?Lv*ruc^q6vUM!ZRxu72{{G>ae!G|AP{&8xLB-P1i88hmVm<g
z2L^-$q0|-7C}kyOB@|j2g+{YkJtx<F)lZaVvpP2P3xMyU?s`985Lop_P;ZXA6;yk%
zX*&l9q{9gUDX|X-4s`SKQwR<se11^WEM3?4$-3V^s1B;WU$Ye4u<xzli$mQMDgKz^
z@RQC`zf_x8aFMroHnZAobo5e&<}e>!J`G&kurCzeN=OkQsOv*dCq;btOF0vMea9lj
z?^ku}-I^D;Jkx!LZk6;n2Kl}=)UHHqP<gwrQLk=-#Z`B`3EpHbrIpCZQK@#^3hAm8
z-x(IY`=H^2(UHt46EAKW*s$(Gv6|7*elZWN0irE7-a)tcdW#wA_LiI;nrNcIRZ(BG
zUl)VJ^OPpRs#3vWc(k<t&L93{tW$n~NNCMAV*RAJ<Kr`Lh^2QPI}B!qm|m+rB^|qA
zTXvKT$@KT*F{-1&Vm7n>2RitkF9eci2i}A0n13c2=(JmQ+SQjf=StW6Ydt09Vu&Sl
z1LryO36%ALc7*@*rlp;<tb!b(?;;PV*a@gmU|$8!gYZ)k@#TJFO7J~CLw5XL_aW*s
zn83P-`66M*eHEMf6SjwAr(f>hn%62TQ6vWE9vclI&($jGvw}M(J%2q!nY{6+;jS2;
z(A}g+q?x4|I@VqYech}|_=;NfgG#jp?h-6^;?lM)?|x5U(Nz3!Tk8-A&ji>59nC&j
zm4L$mqE)QDlEldf(HPH73GbX--$H5XhXTYA`Q6Ood(2G9v%H<E_o}og=XEaghL5AX
zN@{!$hSuE-sHG2DE?B<1@~NLt;#bj;A|V4l4<}C|3tFCLq!cyn5yR!Cx#K6m+1>Ix
zx}qI;OAch~1*FS8;3E}U<L!GQ_nq}@AV2z8?>bhK*F=EIoR*6aND)G|hLiZMllg_i
zn&dTL60fr!7gs7mW}DFR79*s^+4Kd(vJ>MhcdH(!ohT>32`}J8m3=1!H(xJcj(Fg{
znOSO&AGC-E)r69#2ein&tEUAL-6fMFNDNTmF;EXQFUq%>rBzW#U?nwt(9sq{sFaN>
z(ZZTE3UXo2)ZXVBABN0z1@V3Jw5C??v7}>mQT>^d<=eGuj-JD$G?>yQT4OVmj{U|P
zXGrz-b_-x$)`+<)1fPEaxg8jy<I|GfFD}!{FD)V`XI!2Pa#EK%)q9B}4&obxmKlE?
z?hdEoja$o8tx%ezhZVjrgAR{zylfOk(Az&^L&Ih?(2YdqfisR;)T|n>oHH(YUe0f;
zvb&4j?w#u@5P9~BK~~S<dzxseGi{rRmd<0QOViRpdpIf+@lvs7MCNI^5E$l>v-UwO
z{rctGgJt7~BFZV8*DAeRLyb>}oq~|@?Gh)=1by<+7mRuZ-LMftv;mXow}{*~YIBYa
zVv)J^N9&GUy;(I)q1D^fwashR%b7o$v1niLDGSj}joI}^^L{n4agl3T<<g{EsZx=B
zmBAiC!QU*dGwA+r@qcE~e)n|C5uhPu|J`}zAMEK7_%n_8^JbKXN^nwNn)OC{>X2Sf
z_T}D-c1#JWi^g}+t{D9%J=3hBi>y+ISJeH;UH2Kr4;5iWTc_-1M0bmm!-f?a<`wvE
zFM38AWEghbx1Zu6$5@wS1|+@m8IB<a4R21mD>;zPj8N4L8%LR_iIN>tC2XN1B_>iC
zFbiIe^FGs1d`DG|-Vw2K+tvi@|Ckfo#2l-PHqzfXsw4}_rd5E2rMAXGFvfB7)iZqz
zSR1JmhBQ7N_bTY7#ta=se-Y-QoA%5gt9a{V7zNi|HhA{=e9yxHdqu_!Be)~Cuz?yM
z-ksl&U1Y?VjZHS}x^wMiHcz7E=)HpBVx)g9WY0B1g#t!=Fzbv=HvZNvQ<`ksh?$&#
z9;HzODd~t75)hDaJL?|wC)7wl{8)5U>z<RIDsxchy_cg%JE&%mZ71lQw3<(^<Drr7
zA4M`1_#zsz_EHxMUzLv6mog#;XUyJ<&J!~5eNXL44a3}GAEXz)JO8{?&r9O~wL1W{
z>g+eDm*3iHyH1}*SYS#4!H&-}o47afEbb~QWlU}6jLVbAtq?+*Z+i3*UFIb0M3v?(
zNNud2u67ZzI@<3)DHpX9H^Xp=jL*!DYbjsEyB$({WAwoy>NnF~?%jt6((IxUIA55j
z4rG5Xn4)<q2daA8md<;__D<=QCnrRhKD^B6%1s*HrvAs($=>pk+ReB?`Ob;BE8Ec;
z*#`1|iR4?37kY5}2HIGSadup6DlZ*H9o4UY5~|jgoPhsOJ)Ak;)-^$#`ftrUsAM;M
zuwh3@vG?dqzsQlA`k~rtkHd_Q4%MkGh0`_&nb2a0Z~iZL;)v;TF&jALF414#=)-NP
znIsCFvnW8@w4FG(S&LeOJcFw(OHz_HLhU@g&#%PWI!)Fj?|?EKN>6E44n5baCjSi4
zl0h(uRm6_A#E&8wrn|yiDD7bGI!V>BOS3_=wvl(ix|+rVU86+Gobt?Zp1F__)_xw$
zg8#<s<#fC>bXP`fHTIo;HR|QRI%6(rT@%JRI3>B(%k-jO`TQW3{?hHLgb(?W@c%_Y
zBxt$dqQ2qkg8z{Ae@<GP^_NL3{aiLlpOZfTTT1>vT-I{fT%G&Bz<}NZ<wb*muNBW}
z-DLr=75@SN>biE8sjDOIN@_gN^lVbU1>T=C*G9ONOldIN%<o1FK>k6Pv%oCv#pTtm
zP8=)AQxLYv%lYHy%U$ZY)d%ECXeBq>&=toed(_pldnGj<$~JY)B>yM&mzG~UW4^F~
goeXScER1dT_Y-G5G-Sh4X&&J12kf5v000E~8>eUM)&Kwi

diff --git a/datasets/common_voice/dummy/de/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/de/6.1.0/dummy_data.zip
deleted file mode 100644
index 1f7d059d1837552904bee2d91a141c639c47e535..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4117
zcmeH}dpy(oAICShTnnv4E;IN0HidY&{@mJ#gd(|Ylcrc^oH*p*C%3S5p^l7WX~f*(
z=MtSrF1h9INa#coPDQIk{JyImAvx{*@%!($J@(!AulMu&`944I_vdAg=H%iBu|5;|
zryV}se7OjKBtf_!Kfe$UoEOPU%gK=s#7UJq=*jv8oe=<mxn?;*Ag;fFi~XiW<kuI$
z25Vpm#M4268akQ?4I~^1*FYdO5C}G_XVWn29#M(S>NxMufNfbun>{`RST!%GFB{_^
z#gQ2c1%p7!93T*!eE^B*h4<Ga1qOe5Q533?=lfKnZ(me91>t`mDr`R8uA*o1hMPz5
zk_v1BG{xZV*P+yT@T?RKE9th}H_I-Y>S?#u&JP(XR#5dri4+bx=RUUQ<C+Yro=sA1
z_~?FCL6b}1vh@~EUPHb|(x+)7uEdIMuDK!o4@CIp`j6rZ#2F8VPa0c^Fii8x-bIfe
z(DQ-GSSKKev|39A{IxB3#(ASk#D}vTPfqp1Q*bRRj}6Yk<<AT2_)f^7<Q?6KUJ$Qe
ziYIF#I&C9wSoc7$OH>7>g<XC%KIQq2RJ>cNybBdmE1MWUC_oLm&*)>wxqE`PCgnOd
zkCFtW#9Ix_qasksdMmPFg$4h)6o@@@j!Cg7xXhfLxtxqOc4+tUR){G_*fHm6Z5qtj
z(rUud=`qm;k1-F{Mp`uizDBuV4wx_pOlY!i0_Pq4V-KdueQ?2Z15Y4TAey|NXf>DY
zAXG<ktR-N%tm|YCT_-~%P-5px!Gb}j;l#E$A>F&X7t{P&MhR4HRPDI3zoSx#Ze+c1
z)9hTO^9-!eZi291t|3megb(9?5#xUKco`!AKW9sV{>1|eR(0vD%^A-?9AI`>RgMIA
zEV$n))p`4%zWeqG4k44?gR)$H(J&N$a#V%2k;^PIi!&W8Qg?zLmEg>cEsMXYlw-)G
zo+4&!uhDG%K=eQE2Ga~Jc2?;#RIfM?ra;9tz(!-;QfwUt2Nn?6d~zmvK{YdlI1)0H
zZ>~u`tlv!>LAMy#kv-skXM`=4yI^KZ6*V8#UN#9Xh~teSlCF$CpqVcu(dlAklhJfl
zE1StqMjg+kaKAV08L;VhkJJbhnTK;Bc6(E-@F|2@+vVjF_kAnadoru8tf?4>k{y}F
zD8SVV;J1f;SHWJV@W6QS#(g!$>YDF#jyI*<?;cz*rt<~lS137m1)Q*zd{hf;;ZV{w
zxD|67Tx}COg>icL*3rE=)=G5Ar20IPK&SNWs4!(5YqrULJ3STt2hOQxcyd&<GrI$M
zYZxNRJP_o<tJO{KIFx4B$OXL^6}StX@K8IJgMx<d{Y9InIZ*Lg)P+O3zT1iWE_teP
zLKQE<7e<_l`F5cs^MZ9xTO7Y`Z`h>hqN-i4IxfdykS?TOmXL~fm7YwNS4zET%Y8R`
zpUq*qyOp?(hJjFZ@*~Ax36&Ay4+=sc;n%%n%%9$ohA7iQi*uDY$zdMR-L6E8!o-E~
z0M1AO@NQu%z7dH4NbUY%*;AJB$@w{=Xyq6rra9F$u|AyYr<4LYmRA?z%QL7;EWgp~
zVn)(=n~iz%N+wUf`Hy9&VbKT1V*2u+Qn+O5;?tv{*O;x<vUAZ5%akzARd1|mx7M2$
z``$?2fuyg>HW_yRSN6YA#>lIDNdnqT_n#e51kwp_;*XpKQCgEFWd$t;+77=0GellC
zAs1(Eof601Bv<F|w8+^r&;0Eur8MEl*vVr8qnt*n<pD)k4ZR9xu0%<Q8f3m3UCfG}
z_~;vfJ`$(Viwxq4>uHfTsE>ZuTWY&t>a6%%_CSYPT2w%sVlI5Qi~jU}iC&Z{maZ1*
z><KeJ&!-ZP3roFTUXM%aF;O(3ILF)XiEKtps2?(sQhz^&s>js6g>biIw@)ARtRy^I
zcCCOt)_&|QY=);BSJ00eKbg5(G_S|=+BrZf>_+5#qn4hY)cgzGt?~Nw*S>v?(?bj3
zR>>=UNj`cb9d7X}QRPLBTZ)?A<A~G%Ti&3kp%ME+{X2QaS>CDf;YYZiXQ+3{9p`VE
zQV!zqc2jpS7j8j$NJgDI5&rM^e>o6uLTK8TJZtu1d>#AoHXSZX-NXI2cOkFhPg@8m
zOp2MDZ&5^QHb5`;8l}!GZ)v*oWa&gzPn#ekGiFGQ?CEX1_`8V!Q)p<wpl!_{Wi{jq
zs_=*wVAm9|tHXXn;{7*{(oOoZonqks7%psaGIWQVDb(T5A|zQS2`r`}V|fo9&ud}$
z{An!Ct8h33`wZ=N`+%<4eBHI5t8l$fFVHHx5s9^Yj_34>($biBHUA9F&s4A#-tOC=
z7*}PRP>S49#=zs}>c~&(UUa>u!gI%%Sy^V!AxaPL(olulGWc}fH0FS$#7odNuNv<O
zMoaiX={v}k)0QW%mf7_WJy*Y7l1Qtd)THzW5{G2X`Kh*6ssRw-*b9x&)z%adn;`MJ
zRuNju#f-Jb(Uy@*2j@zcLL$<kK118Y?e=+1UUkryjuCa0tZ2Nrd!9I%Uaa95k;wb{
zccjE(N`MBplaa3DIDZwk^<|O4_)l$)*nQLT13SlR)8l+xI&GiLQeOzg_%zd(-kAl`
zt}O{L50bK1>QlC@hFg}NwF!AI`+B~}!4uX;M3$iqSsKhC&AC}5-}Qwr44O4%-L4Dh
z+@AybKPV^<S}pj|)M9<XzX|I<CT)!BtE5>|8k?j~!5x6Hg8O%uwHg-J$M(-Kpp8J+
zBe{UTHScNDWdX1?As(P^YG;+YKBBIrCIkJ;CiQDz{V{W6)LP5b=VF`r&9nf>-w9pz
zXx3g_UG4g?u$DZ>%{F;8I{bLKtd3iM+^vP?Y+)O^=I~>Wx_%6=rAG6yP2Dh*|Bn5+
n<u}fl&um~P16vs;z&88aiL)7+n}=1JANcwMyN3b*K%oBus9m`c

diff --git a/datasets/common_voice/dummy/dv/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/dv/6.1.0/dummy_data.zip
deleted file mode 100644
index f9ed06c341c470e34fbc3d0ac74b088b8f90f0b7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4109
zcmeH}dpwhUAIC2l<&Y*)Xo?iW*yJq294cYMx~W^v4|AH>c5f9=lqYivp&WBqDTgGs
z?haTaDGEJdib^7Pjk|8NNZr?}S0lMwe?G6*bM19qyZ-vVuix+U`F+11jJ+^K9N=Bg
z%fp>Ny?l8{0BZq1QfMgE)6bjet%GwF1B44UV7+)Zi6Q|AK{&zy02u=xm_>`I&M)Fi
zM)(pU!by<`18o#S7paRxpmY%^lz`QXOMK>iqMCr!{m7pID+<jQ`+Qk&)ysef#Xinb
z4K^#Rg#aLE2>>7k2M{B?@nPD;Nb;wRN-Hq;;%1qDyHV|_4vtMoXk5>*teH^6`{OrC
zX?d^d)<NlIbpJc2)Dg0z>sg2lbt|*ZytwTNz10EU@7i+>FOD2`r;&0qE!(0FC~IG4
zDPXil*k$<F&PRsJKBPq__#}{6aeo4NvNfXbC8<HX_fa=pj}cuBA1(3fl&!|z#jj2<
zn!3Crgw<<jl0Kk#Vo!-oe24mYg)3bB9B%B_KUkDZ2)QyHn`aMm213zAI6!^`zPr||
zBvk>Hlp)u(o-HKgfYWdK&7P%Ip?jKPzjNqr`DMA<Qk}l8m~JBdm+2=n`C%`N<ePJ(
zWctG2##}*FD<&Nn+wz*d?;zUj<(Q!(A*3F&wJ2D-fI5_>cxl2|)g&6f=4J1b)l981
zqvkUhtAt{A?Y{JEIk)8y8#mg=1gBP+`pH{^tUXgn5jVP5&>efHBoFt@9=J;2nT@X&
zdmM3A4^#^as--QsT7LfIpVR6USFQA1-s|5%t;DUaB5d@Q<1`&DHIG2ma%zz04#5t{
zr483*2Rx5LR$Ul9gOg3{=#Mv*>je^Knu%;OMa}1GP<;y$jcVF7Sh);12p>iqF^&k@
zRdel4U+~gyh|cEZ7dyo)%O7dvQ;$=$T)qDaTrV2$A1<YGWr>-Ww~A*oeFsbeZan6a
z^=4&8h5>u6vT*Y4Os_2so;&>W5b5o@)){+qv@#V$YDxVm4SR$fpGD+Q%dcH(Ce?Ms
ziki4MRn?Xn_H~-pb&L$@u6?0bWH)dDZD&w0YMtBpDALcXx}*jaA*|U<^IvHflyVb)
zm;K0K9&cc%P28vy9g7;unhl%?sd)D=3CesWr8T`yWwqXhIDNv+wk-1(F{5iKtxNyi
z#9hMIW?O-eb9p+$ya?20Kd6nK;M$PAL-1hB@%Q_x5bq>?=QM?49)`)Fpi$%op9Ub=
zjM?)cCotp~n*PWdO$mi}xTZnZQ=X^Ml?xB1HN>%3Wlm$Nc9}mj4$vDJdY5z}x3%J|
ze(2uvU^pe)z0SjlS-#w}{Nnyg?^ob&2i?BYNRWN5wNu~IKd0vG83}lm9%b8r$yx1{
z3K!UR9u_WRp?+<0lHJ9m!Jz#x6h>clW!D>;w&ggqM0gt%9iJntXq=fIFM||S6<4{?
zn6$}#x&GwIlGZ#2S1QA;;y<K-Y&Cz=&g&va74{;!&-O*lJTPav%xJxxnZk9P&wqG=
zJla<e>r<oF!xC0-j!igmEU&!f=ni_AYLr;C!uOE;p}(HLD+<%hS|iK_TuT@mMcmKb
zXcQ5@6Dx~-+twUGTvH2|@|eJxwx}2!F~%cZTUS{-*#2_wx~a>yT$a8KI>*fYWtq+s
z*o4TWgk#+#z#Ss?zON46+I&CRjXMyXm~8h>$LHfC2BC`rz2DRn<~2YY?caQVL-1DH
z2nDzPUiPu(?H5gsNh_Jgs7bGDAre~fN6*hf!+Sp5&Uk9R#UV=R=~liSc#U<g9drb@
z1JOT{_|*xE^ws}6;eW~rM{C{K7T~~j0Pmj|u>@kEf5eZhfJn3yKZKNo+<U2Jzb$Kb
zJRV;j{dn!Qc2?4U-&b9d=qXxeBW6(P`Dm_0_hT+r11o=14BNhP0GCI7pgGC?lx6ti
z#caBskPWsl3fRI71otdHY+(~y%(|%#VTvfpxSreagRzIEi)>Ajm9ZJYtY;{N7B-WX
zMz`5Lv7~<$ZBK8Y6TK#5^OUY)%=jp_V<wO>G)b=43OdeQ_uu4Ydu7TWwjVSMd1ssX
zG%QN~e(ZzTp17@L)~eeX3hI?^Ou4#Sn(a67@r5xSsFd~I8x^&7*#=;Jn=Ry|BllI4
zQ!aYHG=m^fHj0NwgP!b0C$~u*x-|lot{fg@^-30;8x~hWIBOL8?mV5iO5fEnzqn}p
z=<BQYgH(1qi3P0TkQ%DXqC#89k3TThahzrbQfiol(y8@UZSDu~OQnxy-x%;SVwQGB
zrQWKuu8+UG`Y#t#IL0<~oc^bC;lu_2NB%?Ox04Y>u|z49b#b1`<yX3yWy0ta;)mgW
zL+138M>57!jpR>l;0_fB$SltsM2i&p0>&1VfsvIV3}I!OIgTdl1QSb9h&A-E%IC~*
z>QY9goLo*n-fS$zDfQy&ClEMQ6DGASQyMUx(=Cnne6H2@uxsBM$+x84+elUBj!j2f
zO&esNx;LJGlM8Xo_nuwK54F7J1v>TXr7yS<zSsaym>&&!URaQ!g_bA@FBZkue1Ss&
zjwSDPUN9H=9L#^Bpen#GIBRJ$zu-TF^dFNJMsz-jV_6~~=~G|_VZ6Zp-DUAl==`|;
z83rC$fF1(@|K{w|qRRqdbHY1FT~r-Db$;xeOT7qo3IVBKL+Fp03**#WrXxgP<~Mx_
zB!4GZVeENc<ga%A<ep3R5EYorpW{DXF7L$69};t+0m}r2&KVhkQRg@Oxm0N}fvF4H
r|39%mANhqn<}({~GH7K534z()cAUk~BA{{rAP)Y*K=;r<008_AG62Js

diff --git a/datasets/common_voice/dummy/el/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/el/6.1.0/dummy_data.zip
deleted file mode 100644
index 965ff2c7bdd9ec7ef078685b3c5767994f28ced2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4428
zcmeH~c|4SB8^<5pV6@meF_N++vacyJA~Fb#Eow5B!!$DnGb2*gibD2?u}g?3TZ=4H
zIx3MpWKXZXl4Iw6RG&~e&H4BJyfdF?`Ro2Z_jUcQ-+f(12nI$du=0DCLooUB^6kb3
z>;UXYIGhK{&W30sZHi<C7!rkyPp@30vupqz;}`<~Fph&aqYW1kUcZPfS<=?ol|XWn
zRFaaFl#`K@k(8B_l$E7%_0%-<x=-9g<LdU<uYj$|>KlDNH@IsSpg9$7QlO&UXF&%5
z4)g#ZLpy-zYU6~LBD#^k1S&jH{q#(X`j3IC4J$XK%CJ}eImzwelkJLqcJv8VD|9(Z
z_vrDWy|;6L0wP)yZI8Wdq@7Wi;Y_$7B6TsEQnU2<;RJEI(cb2nff<jSaqZ@~lzdxU
zjaS6mmJkZZ@$L}sa*SQgU!frs&J0Cz#NC$c!dRH8*d6YEDfiBxD1^i<P7hs5O1qYx
zgeca}K`JAZE3dC!Bxt}%*g#)Myeg2-&@D#vpo6FZwI1WdH7M+6C?=8~S{kEyM(6%f
zD$+RHL@#$xp|3O6+9-7-JA0nQhLg8)2!<M3?z_|+z;)K&4XCT1JJ&|7yt6?6YKdb@
zS(&eUo95M2wgrYlI<J#@xu`0bx~z}c$o31(*WC$K%<r*;>O6OXkq=ICR9f6es9HV$
zV;ghpg`uR=*L?B1JSk6_C!LP(YRrDGgFU(=`RG5)T@r%rn#wFaOwtD`J?aG>=(w_=
zS^2%-?u)Is?iB&9W>7BfI+X>?;0$<4(Vjs&d-8AdS9idW0lo>+)1Dx>Cvb34r4~aT
zWtA8cg7P`~0^9n8bsgED8ex1dt%HJ+GcxCMyJHrfK2CGNs|Mg=>v~hSC>-%yEWV%M
z@Bm22tM2AeV&sE`liAU?<Li&fen#F=I?P`ebu1t>XTVN)BsNHMQb|{z=i)v6mPEF+
zQjH_3x=&49Pw=3ot<X<M@liN`^A?`Z&v_NZJue1ZZnMsp@~Dhd*zIu%QuCa<6dTRX
zQ%Gdbw23vGAcw<-`6>$)%xb3cqmA+=P4J%zI<FtTkTn~IGJhu}o%6}!;k8G%;efxw
zn7f;K8MOw|L@mEKB+z!)<9U)t4Qg9d_oQ3%j!CQNuBE+mkkxqWCZbKw&5Xx924fu#
zR*VAe@h00~oj_}`xBDLTm5TXA)!-r!_zoHNORX>W5&@x}c^8=fjx8tYVsHxhdj?@d
z<BFWbloy^UsZwDodCnRqvkvchqwSL%=e@IU**AaGrHC>ehwJ;B>tPC`xzFeehb2<k
zQ@)tilAID5sW3@G^hf!tKvPZ3nLtE()>Dn4=s;L$lPaqf=Z&azbG`jGx`lEs9Hok^
zaJT-mg@0_G4nuN4qH2*1`?*ojK{4%5Oc%+H+Pz&fseUr*w)v-21kf%}gp&GwKe5~%
zV*+Hv#-gPEZyz`7R~bDKXEN_x4na@-spdYV!=HJ14tcE+{~+i6kd<1?<n41I59dtk
zkPD{EEr-t!-y-koRZweaQmd&m@XE8D51!KS?D+7$s7km$c&GJUiEUSuPRp|(#!U*_
zp)9+iIYg~2=>rqerjw&5xihLVuD?p&$IGcDkp4<AGRsa5ntmj>=N|KJl_Rg<_BsAK
zwvqjeh%^JU?E&cAh+YP6JyoYgrl+4ZGSy#ap@!R<z3m3@50c1J%5ka<=LFq_$6(jg
zdSTs@kW``a`Rahc;CnIrj!f`)(ssSVNKAK!*kb>J;V4d7aWwwy`4rQ+yJJI`&10~W
z6^qQPT3K<|wOWy;T`NR;H{y4hY+$bcm&yN*OiB`L$0?wt=>5Ax*qP{P@A`Y47~rF1
zH(hES+N(~YLPGjf$}@RIW)p1EXSw*F$Cz9|W=R-U&|NVSf!u1(Rwm>j>k9KRO0P!R
z%o3HeuI|q*Zk)ZP<S~DQ5}0f%GKH5{JLx>66*4lN^B$KNe&aPJy+piKJQ`Dw=K*g}
zw7PQ8)iN?yy}2RSS;R>JvYF|)oP{$78uCHc(Ur(qMupVeXoC)*twY`vyEMJ_<MS6L
zW;HXY0#@&2MpWi*SB@0S#A-?kDU9U9tYG1$q%!Ng=~jyQjRrKPkc}glvQPO|=<I=s
zxo5r;Gb!FX=Q7XtSUny_&E8C!Q0zv`HP=heo_fNM+Pdh4QrvHrTo|FvMzT9-ESDzn
zMtD;?pXT4i$O*5*S8qo1lraa!QpogiCm2xi%A!yo0F!plQ2VMj?DaN29JET`%z_@n
z`xec1Uo+UQo*7#}PcO_%IaRSqlqrN+zw2Y&lCL|%^7!_J%%qRazN8*!FG&?;uMX*q
z9@eGasS599w=K*1GgXJ?EX#QkvAi>_lgI*l!6S~%gSZ3N4;6HlFIPtEtal4%_wEi$
zPzE?q21>NMg%kegvt<L*s0~9H(!;^CWo)k~wlnTR?dO+Hy2lQkj__dTAxVg{ahXX?
zLG}DJ*rN&l!xEMZ4<><|%BIUG)1hT~``+PVl!aWwSeSmkOOeZfYg0Mqg!1m<`+44a
zY6QC;%t*BspMPXJHdu-uzgZkx=bGh)i9#H-@5+9H*nM2mHReXJr@)?q5Ys$K^D0kq
z&d36NOh&0q#DQ?}IMrIqJ|^?~`j1KkZMzUpvvjla6-Za0(g82ZYFsI4yDhUbqGYFn
zxf+qCjps{b@+O|Ew`<nNqL4yx&ktMfd|>S$Fvp8^Qg{35SfES%>twbn6IkRw8+Rq?
z<OisQmRCZ^ViliC>E^}ssSGh!$DyO10i1^IHnMm&EhI@LP@<;Xo<qR0f@`pY6kgn`
z<fkauRt5bd(2-4G&q1Dt)FSxe_EvHIUZ%$%D1-n@#=5gk+XhREwA|kY1uWJM@U0~B
z>tL~B`)daaMcNbjEsajj>pSa}T>dnI(9!Q?*l3dB`Nn4l#^5Wj>*gA!uXD}6SWp;P
zU2qYuy?()em~nnj`gy`xC5^$;Xe51^cR<+6yz|R#t@g_6r=71bu;74nUq<k;CQch}
z3xutieL(7l*;%Eo@6gv$tHDaAk@~$~|2^~PE_E$aiIHaJ555ADf6=cRAy%}wy4&?V
z&ss7XLNj@_1N!}TS1NA(3Az><#6mN4&3Q^2b-iJ)rG|mENn>JvvhROlZ{#sw*`Sg^
eDO+x$nf;^VY=mY4GY0@r@Q(-8^A`vJfd2xdt!`le

diff --git a/datasets/common_voice/dummy/en/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/en/6.1.0/dummy_data.zip
deleted file mode 100644
index 8c7b5942542954ae8fa5c1a7f14054658e351eb5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4014
zcmeH}dpy(oAIHC#49lI6Tq05p!!$W2omL64q$4xhGQu{_%q2qT$R)QzC8=;yq>OP|
zj+~+rk^3drjxKIH3Q^?r`z}3V$!X{D`}b^*&v)NF9^c3N`Te|K@Av!jvb7X|hya|=
zi+rEGpKd;1L;)qh)sI3s;^Io66LjtO2?GL|>ez#v<rgRl@IlxD008+5yx1-qM1woX
zA%FNGZ<>#vFWg860oT{l*MlSU;Rpne(X+CNT(79bV{{UA9w2@Rz1-`|fV+kQty#pq
zg@wfsX+8ju;|Bme-T`zP!Go$p_x1mjsN_ua!FOrsuM<`G#GG2C3KMKVnhI*kcdFU-
zuPA82i%TewrE(xy$A{?DKG&As%g*mD`%bxJ6sH)u?%!Bq!QPo9h3RQ~mT%N4+aCNL
z?;(TSP3V-lA86R>q+UmJim#P)Ap*1uvTvYxi0A{K6T&R<@Y8JgCARAJSI+S1Wc77p
z^879ss5-;eyL=~2e$%?-V?T{HvUaV(uQVB689Q=xt$5PWGZ8{U`+*Rc(!T4pd!GdO
zrkK|p>s6%p>(+X<tWKphPqPaL(<f5`;C=Z%RF>R@xLzEbhTRdqWk>AHP1WOCgX+H-
z#f9o-SorR-D8=U%=mcFJZz!=Y8d`fKV_P}JuHg5JkDoa0a2(Ne4>TH^?6BGVH!JK@
z?JOi8d#nnM1e3oTOui29$#*6Be>e5X2x_Gs4CDS*W_y~BIpIJWrRS1wm+7VLHF8AD
zsCAp!?0lI6L6J1ocQH1{66u!x4m#mgQ6xE*NJ7GW1|DtqY}5LlKeMZ1?pGxTt{Qam
zi(O~zbZd{bSE^06SG;Sphj%b?n87}Bc=oPBu4&@a_G_gICcmCzp3Zxd&Ya3@EF62p
zwCX4<>6M6YJH>V!c~m1-aQcGVqr6mSHL-Zboc&hamZK&}`Csx*m-~3&n?ri~-`#t&
zHomQ>ahTMTf(q$>>mb?7$aw{m(Wl}>7{^h3Gxvh6wC}A+>X)m5k=z2UlM9V<B^2Iy
z;iT+8)><&LB$24tu-XEZZtc)YoFijzyV)AOLo32YWxXrZpXvT=A(4d>BUYb8uiH`?
z#p+k+R_ab2P{@;}S^w7WRv(u5T7c8o^WyBvB<{WVU~82wD9&AQ8S*ZUKY{E4ju4XT
z7l}BX@r~{vC0kQ*2-xxV8tn-3ZBY$uPBrC5gMhpg=BapySrl8-I5efSj8>_T1VO|6
z@Z_+JG|3z6pM|GA$Uliz+{~RGAWZ8eB+gwVyNkMao#@$Sci}w2lv!YOv%ZUIgV2n&
z#VHEf^NC;*3{#7*or6J|A|Oq;#$Ho3SsO)!afHcLRNRrN7SpueD0TnCof`;^)z78E
zQ6*2c<80UZY30V=d#OCxQ97_X0Bh12x;8nzX~h4R*W(jJv$nUaF5mO)(hS|i_A^O8
ziWHN@`zA4raDsJ5p&s#pV?JH3u6)%qtmmk)x^(>Y1Gl%?>txD$oo_}#nTVuLAKxK{
zIYH~xJ|(i=+ubETQ<o2SpRjlf%}%YKQ3=@-VX*x}Wta<Ez^c2V(0jt-OhHemX$aqf
zIGnMySR7s6#i5gY>0fkNW@`PX%m1Ovm1El)9l(YW_;<&NH~lb)_I+XQCagw4w?65;
zSvb{%b0tTz2b5=JB_E&^kOtCqR*EmiM##Y%p%AEsay6yt1<Izb`~9u-5RKc>=?~{V
z1e?+~Q$ln;Zo6}pHP}Qq*rdOujv{+!=<Mys>T>`v4!;_h=%^-&@Vh>`Aw@3f$J174
zA+4;I^5?LOtQ~yU5GHv|*}M50lr|is)Uo7)BgA?IqFqz>uh)DGc|?F{{lQn9qIVQg
zR6jA}93W*nLR*m~F=W^m`ToQE0srC?V$+T)Q%6EN2l8~guDIn6OnUr*YR(O{ie!#_
zco{UhW7Py!zC0ico|2Kb)4%Z$9fF8v$|nUyN2{_@o%LOz1Hj6w8ZkQW_Fq%gJ}9#F
z6hr9M){@!=M+6pYZF@GUGj)@*=w%VhNW;e;6H`}|&pwzbGSKyR+|*@#zUN}i39A3a
zgq|B?!z4RAd5ul^-;}b6sgX%*ljrf%LfoA-UXXFp2vo2cTt>WimIrm|;9I7_@QIex
z)d*Nr`?&T?AY2Wb66NUDX(um(c^dh+a+l`HpxGi!S&VA5`0G^_E^V0UiPDp=pHE1l
ze{aAW9z}bbCEA(YL*1DQ)9yod`>D(*J5*^^20?EQ=gq!+fdL?*J~alO{FLtV`t~u!
z2Q12E&LV;i+nYzbV@nAs!GAS*dX}rZ71u*sldk7qsYc=y-HbI8vZZ|sZrqT&PxBfU
zUO)JQYBhuMP^B9)d1bbszMXxpGS{H1Xap)eM|_<fd~4s8&LJkQiTKpVLFw1~=t6!C
z3k@fnpBZXBQVrv>B}dvJN7=oPiZFWVT;0*|2!@mku#mc2(bkeLroyKoMPaZ?4>gI^
zk6$=5s5>@-)<ow{JbRKj^g|oMG>uInj+wQre%QsFOV8XG=Pa;Q*tl(`uho0Ob#mt4
zV(W_JU8T<&d3bYsH;zzCOfDb4vcPg)S^hI$0$`hSZn>PZ;JkDG7XdYZg@AMB818`o
z;G(}zTI!(}NNn?L9!Z}ZH3;K4>Thmq!P0U2>3JB~NkC3G1pHZ)r)9SV!WOw}kh-io
z3si0+xtJOb4q_guUu@;~nM?iAVx}^LXXaN!10;XLDcM?bYH?w=+=u^Sa+x5{<OM76
z{q1r(j{9U=47C&D8M^44<BiHajTcj8g?Xkfna%&io^Sc3GiII*Rx()1W#B_R7S-1i
WXF0Us3QlMd@Jj`&XM-r`-~RwcBbD(0

diff --git a/datasets/common_voice/dummy/eo/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/eo/6.1.0/dummy_data.zip
deleted file mode 100644
index 9a4a43aa63625445df893f3673136a7ec2db9fe0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3952
zcmeH}c{r4N8^@m+BYT7>vXpg>ZDx|P)EQ)dZBw>vF*9WyV=O5qNeK}VCn@`qB^6Q0
zeuR@XvNRQQq7ufTqEb2UGwRAra?JVT{qLQ*o@bsv=DI)6@BZG)*UF3y#tqOvFG`4*
zrH5}f9$*WA3-a|n>WXs<bW^k0&jqk$$XFkwUqK-}00c&30|3}Gc(YnHh%9puY_JN}
zk3<X#P|;9Ds-O@kgbEU+f<&?yJtx<l=@k`NjE?vF3gFM$v)bzmgQw;M`g1_Q^nC3v
z5CAZP0sw+_Kp@G@lb{+H5WJMA^BH>%en{Q(W1>0}+s$D}fxX@G+(tLv9QH{NqCyp9
zyK<|ZxpXX4T<X?Ci9;RGE+HaNYTLDv{VM%RnL-0WrV*Fj$Xm@s?RO{9bet)%JY2ME
ziy6edq-HUqJe6#j^i&5o<=Wc&LgMx&SyM!#LnAIxPciy{Jv<lZBGc!NZXr@*i*y|Y
zfUD97?=*-+gTXX$|FJFkfA*#+J`=d$)?spPl#0a-lO*EL4jUDH^pMuV=UNZ1d+rGL
zy`HXPF(lu33-2_fuW9z?hRvmjrq{iSkA%X=Jv_Gd!2uCZ*Y6cDOnp~ZuwMM;lt<BD
zNV^#>PK`INTypm|?17TY92aWl2hf=_Z!cv>8Tvc)@l_DE_efn=>v>6zOd?IZ-d8ed
zKGJPP_EPsbt!IIDbzAJy7@?QcvL;Hgr$c!_O+-LVR9V*qhYwy;k)BT9-x0j*)a}}*
z*C|53m`O^%CmihA_M*5lzadyKOFJlT+J>E0J2?CS5Dby(!`zX?ZoNaD<qqX{68e1(
zHB4#Lwz@Ysvf*%4n$qmLi<ps9`EpKoTsyrJETa_S24u36&+9$kB2Pk+4Ce<XjUOJ$
zft!xTrD$G^xSS*4)8QZ_)~*TH7BEPqDC{(rs?8a%<)v_3j~7=q5Pc|@R@gXm!WM4i
z<&NPjy5=4FNF~OIkjovpA2ZQ+3`s15D!7#SU5kC-et<$d2IUhx7wYF)O(c7mhBw+G
zH^K-re%6Z|l~Q=o<{~T24-PdI_Ex37>9dm+`t6wJmYrq3qH1?i5^OCGE~3KOmQ!w=
z;f`^yO_sI>Q@#LR>a3?c*v-cioELc9_r%lN;3uts?=dHg(i2~KjQdJfmh8k|+bZ}J
zV>?1xs!k4hs%;NQmn`L`I3EcEq#6=xRLe7e%TE`-Qs0S;oKk-2f3GHf-}=7RPm8;I
zZ8~h`RgdU*1oV|gdZs+CCTy=mE8a!pT6HmM4y4{x_4mVGjhrX)MMYGiP3sJ=9zQfy
zHh<v~)UExpO!Z3%>s%^n)&yd+*)-Q8=2X}NYI@`pwz)V$udPS#u!c^RuTHnKy(&so
z)sFx1-9eg2M{E84#q3T=g=epC+@kp2EPs+R74-Di#RR+2`0cMg<tZnZl<)R8j5-Rr
zTv%Q;u@CP2%dIVNi2VITZ}b(|yN-+!J1EQ;mEWZ;rCI#8ID2DZh2%y(g0r@}U;&gM
zh_O&lAt%+gQ1S#((r54J?WoQye?A%3_1n~-`;nW@!!2%u_{=`jR`*ZX(C1yPCc#qS
z-F{ggw~?XC^3kUlqkL+t%NK|b2>h<xDnso5%KcB3d#Erz@fqAkU%<xu7yHLA&;w6e
zOS;bUx`bwFUW37D_%8I0lMSb`T(Tz#GT3yu7pHYohf{stMAfJ1&vUfvf8!36m-=?`
zidR|gs!mLSO+Wd3)qFj5!71FRhU+N~{}>XpAg`Q4Jbye$$+?bGE}l|>RrAamb8Zlp
z$ewG<D6Kq>icZxWf*dt1<zs(Uv#Cn!u{Tk;z`HzoTvE_n_?`dQR_n0{b^fTh&GF)4
zypa3yh59+NG0prU5nk}Q(sYwUMHREU3AyR2_==fx#gA%j<c!+SXiXz`)|`Ow(H>8}
zI>#x!aZ%ySZOY%ou(4!-zmoQ$!S4QCNq=o!QBwcM`soWb#V5^5y_;(PT;O=Mles@K
zBTs3{fEkttH;M-9{ozSiIp9`llg`9O>t+P6L3aVSXv5U5Dy%k{*xi<tv_sav<&WtB
zG(Ry9rZq(4Q6uI777O8q;d$@<O>3se+io@;7k1oHBYn1ylKHkPJuA%NxLlmFp-c2x
zpt%}e5Qp$Lj}NHiH<lDD0QSnO-;A-~_aaL04}@TaHB#HsY&p&h@Qv$p#!@UCG#<6s
zZU|Np<I|CDIo4!9sorAzpVZwR!FeZdn?D*rd5q^^eIzg?P%Sk?cR*czdSYl7c~rkp
zdr;zKfAkYa=w=0b7j%Xt0)N-ywbNk}EM*Lq)XSrzM(RFIT}UgDO}+hI%(2%VEt!q}
zgX1Bm%DkDJMP<sz;S*HlMWs5skq?&ecA9$Lq>@j<#>&qK*IzN3Dad@^>$n3K>)YxW
zeB+YwVsM{3SeN9RAfA5A?u)L<3Y!XnO0uoyb~k;)3jj1-`XiI~W&i4Z|3yGqU^(Cy
zT?6KT|KNkyCarYA%Osj^E{mikPYlB7p7^K7T6SN|j`%AKY#ksk0tWst<Z0DofiMPt
z3{qEBXPL_E`537e!JcK2`rQGp&0J}E7@0gUmYF|H2$1{}`(tHBuf^rlGVg0f@&k63
z$;;b&?c>rrj(OEELQiq93}sk6tWlXqFC%q=lV$3PLHtkbuPwiF#(ZUil?;|L8T=EA
ZMfKyvSq;s;jvksD{1U+GnOb^_{{m()ev1GA

diff --git a/datasets/common_voice/dummy/es/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/es/6.1.0/dummy_data.zip
deleted file mode 100644
index 8a396c6648cad9c611653cfed50631647b84f5c7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4007
zcmeH}c|6qlAIHZGatv}Gi!~@mjxnOeBv*|XL#V8K%qWJLa)yK=7G)!Hj;v`$MB~a*
z$&sVoT<gd>(h9LjC|#6(-${>{WX=Bb`)g+&^PTTJ9`krVzn}N(eY|WeA?!RL#%to*
zaobNfpD(;1X%Ie$NDOtuV*|082kp5*kX(fW?u<`R2rr0@eFg#ou|EecHX8;}WDbH0
zR>vLlKOPjIuB)M?j({WJ>RJeOEiD$K=TkhG9#M(K=$-*90D(NijUHbFI5iijFW<vf
zXfDQIiwy*F0fRtr)&YV3SZ|U>U_kJvL?!1My1%_-_;sS%lkXFo;83$p3(+EX*!{)V
z7fkix$_XQ)a!#%Tn5X;M^oZOfwHNaD^je3Hyk-00=M3TYp$qa(?zhTrNM<Gfc?UY{
zNl9ydq4m+cTIg1{bRojX_oAA344OEtud^g{K$5Q(B*`s!Ugv}P_3wj>Ca8yHc;Fp8
z^dM-t0!2mwYfwP`yJT1GR`3~9;?<W%=D9f_@p_AE`)2Pd#|U(LTm8<Xf;8jVr|OxO
zWNQ)Yne}@p$wbOaMxRbj-{(f79;Mggie^d!kw<d1(XG*t&ieEeVOcjTBaVni@ss8L
zjYHlFWQDPLNrFi={5SR?QX#1jmf7)ECvBVQSe1~^jmX3Xq}J`U!N(tHP5u$c=I!ZZ
z`S$Q@a5}C~xHnBoZW#9HjagcHiP-Wp-w92*J8X=~T9sx>-d&3mkVXKIMuT-}@See&
z3u8=zz@U5k7et)%1hpkN>6D`E5iGu5(H?#dagmn(R3)q|MRkdE>*S-wOYg^R5SmLl
z6bJ7;i6vWH!HLuQk(K0Kf<wemIkH1eNPM`Uh{>^pZm{Tcr-H$;iw_3BqjGu3LC%qS
zHNYl_DVrU*^BA`Cr~+T`o<TN+a~Qq3t1{NIm3C5Hsy2{YekK-1f&~pxQgQeNuZyxY
zm4U4g8lF-?P7{#LK?ypR%tVm*kuP-xra0M4<8Bh>Thu}?CQk&xGOs>K*?-4v?2Wig
zlXB{x#~#X`=n4C^=zS(c-D;ts?qGIApKAiLYFQ!_>Gq=PsH1G}g9@ZrQ77G=hZsp%
z{?x}S**4Ghu<bTQE@6ReO9R>7$$GYfvA*8G#PGy_$vLADzR?WCKdh1-%0aix>FnN%
zfrHy7`EF(_AO~9TwzV<JzhzhbO7H0?FD^Y#{b)TGkUSHn*8VKyRE8bzfJ2*4WO$M#
zrl+NIW;&rF<HxG)amGhZdFMvYtK8I~J4RGI0QcF$24x#AwMd#g$$*s6uNzI-g!%MQ
zG`W=r9ANf5iaMr#plvPu(yC7A-FDljOHYvFGT3AWQuY1BS`CvaJRH?k&gGm5?Q;(O
zeyuJJvi^oSu_{^P#D@*xC`XMPeD?8(QN5Yy*2?cGbyc*9*}uD5;wQ7`TQa9{de`^%
zO{+Uk^fy-XQTRL`*pe3#E|G;%BUmB_Z*~5_!BL-X=QB`qmwapE9LuR%0hmX=!@VVQ
z!`_r)ADiY^G@lZP>MiiAUhs`!?rq0O;3nq^@hu%6;(oN`jH+vU<HB#>Up`+x_HtPw
z)6ngO^kgsI<BH#?Kus+|6K6Nl|5wKRWtYyG_YmghSu3X|>v9Hq1_XXla)Yt<pOXJq
zB@aElPgMgN)9BxAsAGX%p8lIj*?z$a18k_X{i7m=^<Jqv>s(;`^^&SeewI|-!Yrsl
z(%qEi9JYAZb5M_Vsgq+$@Al80#Ad4WVUv9CE;d>Sr-IAw>)w81ake?!s!j>hwA~^(
zI2KlET{RxXtJ7bp`wsrvSY^nYxcG-4Oi!rMV&RIx4ijiM!QQ!YbkN+VxC@z7%iogZ
z8b(1iL;O?EB#xxm$(|pNb?QyI=Qq=Lbx9*cNCZ7e3C=p<_^RcXX=&b2mzTSrOMB%Z
zxF(A)V95Pe<#g+x#<`JcW2ZDLx2kdNpg}|Ga<!hbTgxhlhU4>zwYtSMztB&h(SDdr
zop)~gPcEBU@V2S@#O{hQDYd)T<9Anh=iyk(Gu%G0Pw1_v)51+PzT&E{T+JG&kNLMP
z9cl?1?l|}HVAC$<{>soTR^tLxQ3O=cWxc<=N$UsZ25r{fX-R65gL3|*vztRn;_P9$
z6H_wsI5cNGD)mC)U{AXJZliP;Ny?*3d)nd&Zg%bl3rYqkQ_*;rr9}c6BQ}WoaH?^i
zFGXs~J?e)PwOui29AO9Llr~l@U}2n!U~eRtU3->ao0yOvm>bw-ayNf(SNA}(Bn%vi
zqbRa#?~hV$ydAYqP@H<SMtRmu<TXY(4<<dtT_TzX%gTFfPRP2_H=^xJty~(8sqC)d
zDWDCsEglGN9yvWLc_*r8paG)Uuhe>IUO!1B$_ib!j7C{UJCY6&Q3Yf|(PY}F``E|J
zLS)@NW<@!Op9^2W2{i;SQ-f6Me6+4a0tN|*<kO0TR=kgX_uZ=9WcanUo>hnSynoKR
z_c3PE#z_4-cgqHrg=}P!B|h`Ff@X{vw@j9qW5qK6i-3xt)qo$2_c90k2m8D^X}x(~
zCCwO9SR{S2(g2KMrN4QsRky}$q*q`-e}Jx>W(U63<Y~iW0kAc;8lY~d&MKAJBd(=h
z1{#w^>KDhjIdi>JTFc~PXPNoc7y!uM@Ju$Aj9OehE%Q=eOCIK6nY_B<H$N_;<Cyo)
zS}1J`%g{C3h&3wnd|gXLa<fcbH*Nokz0&gQXUqy4sAQm&D}g_;SX5t6oQ=>NoQ%*s
Oz?%eAj{+~_*Z%-(SclI5

diff --git a/datasets/common_voice/dummy/et/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/et/6.1.0/dummy_data.zip
deleted file mode 100644
index 1e24b21b907d10b87b9cb3928e994045e5a33cd4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4524
zcmeH~c{tQ-8^_0*t&C-w5(jaNEsP?O@RDVw#AJ)gGRDqe2Ibgy$2L>6$i57TL$*`O
znjzT^NA_iGk&>iR4BnaQ3YF8Gf8Xn!x#l;&nLqB&?|HuWeLwft0M5Y32B5yC@3<pA
z-F&&Q1NZ^9Uaqc}Ep4r_*0RP%tN?~I5ko8L$LkV1fR1sV0RUipMY$O47({e?5Ty4Z
zB-+#63v)<O26{+dPG0U1RQ?bYN@Mh_Y@2P52%#}L3i}zrnQ>yL$LFQ2ng#GQ%LZ{z
z*IfAw9ROfQ4*<x~4#0X^JG#kWG2Wjx3ZHhu>ch1Y-)>YpUd3%#4jAFQ!h1ZpKC&^l
zpe~4AL?W&yrprBN`uBuz-6GjUslq$hDl>5r*A?uBm0i%sntGDN(=wlz%<mdtQsTmQ
z=e=uYVBz=4_;u|Y(d~)+MI68(QTF}921Ch6sr15G2!lv4lW>wK9CFypQ=U&d@$aef
zCVjS{Zn-N0`ZkS*=a6}O!Z8qG-OMJvx22&M&)OGeUyROtp_i4aZ)XsC<Oz#)tTr}O
zLnKGhJj6>M+6H|#c|HM4@HJ^*Q+@pE*yxep#wwEz8cmpHggd>9|3J)Rga;wW9ON3c
z-lUN12F%3j%oT9In_n={&(H36RueV8+i1lJjgI`%jlB;x8rJE;H~a#}9XF<(Go+N(
z+2DWF)y}{r<Cc6I8^?s2WM7WK_1e`31Sbaxw|F~SP5d(Pd6OIDJz%!uzPrk1yaM^F
z?cNX7a;(u&^KMf(MAGh?78KzkD8k9mE}SjO`=_$uG~$PafG7K=c=@f4Gz39i%Hm3s
zUs|BVtDHmX<@+(2H4SY{es}*Waw$u2TB@D9ZnW`H>&8$xy+!7T**tTZPG#}+>?4KJ
zi3w!W2G9TqQx_M!tlEND#FbcHxb9XfEQ!<BQ)$zW_vCf!fx#ED(?gRyd*vd;%VU>p
znIUbqB2f-)vn#QNaMX-Y3N9msbyWH2xI=cdQz@x2UknLiDQ?KY@|X9@H(JeDD-t4a
z%C;hYS-8^7UfF7uRAN5RhcbQ%o-$bmu@}!k#4gICv6}09$U0~bVDWAtgO)^G@yyyo
z)ARWU(eB~)WO8BR1GU<YVpFl3R`c25ygl}pgRJ`d1L5_m5@fCa1f(n%t!vlR&-Y8c
zM_3m>6rH#rYX*H?2^r3>j0_mFCxL_(#&d6}o}HG;(Te+EA-T(j5I=h#?(d_?c7uL#
z0q2`DZy`1)zQ!{kxGAA-nhk=*hfn$`MZ$85ghyzX(A(O@k<z44wqJ#$*4_{5&lPu?
zDr9(H+FAFY-YCHh5`!&|guMgE_+%us(t=qGoGfK)QoTe(q*DIZ&5&Fi4PB9gjLs<-
z`b4f&OLf1F7_L}-=U;qYc`(o=5Pr|YXEN@?EWj}F!YG$~%!^9cYL>$DSv;OuHG<?X
z*ULkkO%*(5RXn1OL<2_}_p}$r5%CHM7dvxwA}(r828rCL4~tR{a|l)Bul6_<x}4nZ
z;9F^ev2vX}F4Cy9Te*$!$p6Cfj1SM7q!&;5R1QF^|M2PaS5fOc<76RXOxmpoJ6El#
zU*T?Od9TN)C)vrOqgiqiM=uU4s9M6mPnB_UJbCE+HPo1*S!eTIQ5)mV^a@3yUfA^h
z_p8B)ie4vbOly9vfYp`uG4%+Lbc<8xv5*+&<*e`~UIG^?7U(O;x;Bh_ZG8|et;8YR
z4^=+OvBzE~fQ5&<vBmsom2iev|E_@uk7Nfiq#ciz7+_UN2XgGJ=&V0y6X#Vxv@gFj
z9Z$q_wZ{*8dzS-oQ2Uxk!X;^Ce_pdNXvM?Y4a_{;=87spKV&i57+Vgp?e7`-=rP=V
zApZ$ahF)Q&{O;@E+<eugm4|puGPojlo={M7OmHStewXkH-n_+;i!?DvT;wdX+)N>A
z``(&Dvb3iV7KOon&7K{)_y4o!e?EKMhfkjiq%@%Re|Pw!u?{HDpQTSbPBnf=4jAtI
zj28nyYUnBcdJu&`t-8i>9lK&r6x0;00KWnG(O-}@D%CA`OLuGe@zI3PP9*}a*o7#t
zbUMdLd>{98z=#b&)P>BF&TY?qm<fJI9qEL)=kS+dYO;TYWzWgm@iK&kT`0#A^SRql
z0hq##77i^3y)3cZ&J~wfr&&)o^P(JAZRaWVcF|j{J$#N{hn>o=&dD^zJMqO4Fe(yy
zJWss4OZz-u#-`-VgKWmk5HMDr>V)Phms!}y!6zE95e3=x(deTO@vTHu29SmOT-fYB
zZr@Y5maY&wfjr-{VAXmAci}2Bur1|&Rnp=1@}xw0JN=vBQ~PU1`bqqM$Kc*gC9s7^
zNe2dTB@#FEavg`(!AX4$Bu*eN$v<$fX(eo&F<OdcVv^3!qXdTmb+sk&<`vWtZ!9`b
zzxk7NF{N4g%zBv+bAJ?n?5c=a|9i-zWkH?FgK7c+6~-8wd*fB(>Ewm6H;6mS_Es#!
zgz|>pB&61Ey*;zNP3Q?qCs<O1j-m*yNPC-bbo>6S*`Zss<1{=+gn(Mqp+U%77^o{<
z_PFB#KdBE8x^AYM&qzKmZu*$j@2*hufw8(mf^}6)zzMEb@3eyj3>FHp!4GtL<A+2{
zrKP9FR-0MaOPQ8~APftuV(<aYhF9j5GPRj$j6sIT9?62PWbvY_4bQtycR52zr1CO{
zkhfh449egzP{(DI-R%l*#I9`H9yj@`-c2E?)*M2FeZ=G1m|Q8eeP$%7Fp3+Hi1!S0
z@UT6Bwy3Fr|AIC{Ka%c|>wI=ug?F{fDZtgooV_#msmTZNGyp7I;A87hI?S(~&fC0p
z3B;pj&hCHPLekY}biLbCZ#F?ym@BgU*{Xt(l3)+g{E}UWUN5J~dt(o?#rR3^v+kI}
zXot|eJK$HBWEdgp#TJgP?5{WcqTrc~ohlmx9Vwh7SuHSsA?PDRH!5h5eyn7#*?6k#
z8=mXefSoadZ}eH=wQ5|~wbIY6F;M0FY+R^af2(m(pk2-{VmhL?PwrGjzq2@W^uI9d
z<dpV(;R6Bl8r0iuzKQ9xZ~7MjMFE=u*ECLU5BLur>gS~I9n>aiUL%`E(kDMffl>X`
z50|w$ly7%apJ9};1LOoVQogqA(~ircz_xfR3Ux<yHmTdKz*cHCrQT_zel-R^XMR7s
zZe<2B(#-s(yHLnKm|_j!)Lh(L?e>vpEBO&K&E(Ai=;zC&R^0ZJax0XMm1gLcbCWje
z_O`v1I!CEZ8ioC?iT@LOrylc}P03_RD&Oaznf-0Y*$K@=Q4RoLqrBZH*(1K2`s=@m
C%$?}~

diff --git a/datasets/common_voice/dummy/eu/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/eu/6.1.0/dummy_data.zip
deleted file mode 100644
index f04c5f3ced9218f62754ce6b851b3a5808ec6c20..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4174
zcmeH}c|25mAIA@4%bx7JB5{o|wwf%FAzR3jZrS&BraOirlwHZbKDw5yVM=xtw`^Hc
z$`Y<6GGfS<EiZY_RIgCE&Hd;3>pAl}XU<>W_xXK4zwhVwG0=ff&;rD3qUfUG=a;V!
zI)E3j^K^IjwzRXxTEk3?_W_XfLuagrx94R#fQ(`m0ss{6!H2<~MTB=3vGs!5qA?de
zJ)rWEaHzDDv=kIB4TZx=te%x^v+EN@Nvv+~`31m~sl3<cvx2Lp0eZ4*4A&)5RSaYR
zz)KDQQlta07;9&gB-X?0bE9I@m95s2l)rD(vlvxWgCHxi?*l7u=8Y%U0^E-(g)XoA
z@^bS?@EkSNvt;oc^t&YU@}?~MVyp2$!lA)T^ZqUdHeoN+t4r?K`{(8+OzU;s@>rZ5
zuf2|UE1;oU?yd3KI9y)kBgNQ~%AB}5a7sjLYCl=XBKv_rez7BQdWiYf`0`<dtZ+|w
z?QmpxYfzVyJ#8FTww`icy5A}d-~pZ>2sOfo;^7Aw2BJ{(5Gfqte1?b2OXQ^RkW)w*
zz7%!C&!Angcs`M?JifVX!F5O`KZJbNyh9<!w}@9;e;Il06J^E{8Hc=7-9W7x-oNo<
zumiSqOG_f82R5eP(!L&;!;?=P?M=<tta0Uvf%oK6JbY{4DulRew%eY^7;Pa9w%q}2
zyCmstx3l*mrRj}=^y;6RUZ|;gvRY05#j_T82K{OBLsaLT^(>{Q$sW$MjVzYgofwAL
zJfILo8Q&Wl=0=9zscJVBuck}>YaE@!W}}X62~6P;TWKuMcE5Yd)iR;Vr#El)fY1G^
zc#-O^)VlM81c#xLFfMOtM(!y67OtUNda-9K$G!h3Z0Jxza8jLaxVc}ZJX9bt<koRH
zt~rg_NH{L6>B5N>$ZAk3oJOxXw$lSL9_TZLzbz2fRl|1_s?5wG&JyzW?5d$mYf!x&
z%WCBN^22drkF*!An~5c#;!ztOSSnE@pT#uE$<CktaPm5a;XyEd;us%qeoAgGG**Re
z%j9Xdl6{TQz5J*Z7vDrW3}f`!25rQlbnWRRVWpS7;SSR>a`J6*NLha-oo5wq6gFF%
zn6{PCOt&$7*sgik3RH#>R7RF`WxTB2oI&TXxBI3KaUgy&6z(x~Q9MHQq0eWGIu77~
zpt^kRE9VPLHfUnWWcWn5vRZY2o0#hBudKq9Ufq)aWCn}CZ8$sQpKj=T^dWo2@zrVm
zk>>C7B1Z@#F^=#u+-doSio#cB8WRyteZD;(nvRzmBek;zy5i<q50osi{T`>Go~eR=
z9T>qN)YPU&V;maDu3noMpm@9;NIS26+kuZ&?D)G%t-5m`mcl>a9EAo;)k>aGh37>P
zJQe%Q3;%q3>;Wz^y`Hd4g*WG)v>g)gO-J4O`_gdqRLM*jE;V{FAz4Acbg9uLe#t;f
z-nqGG;VG9NCWV7eEZ*jJ!_1?^90kk^r5rkHnp_oim{8CAKVeC6^@5ESwoe1i?yL>N
z^EIJr1^pA<9%W8ER`XiXrY@LVd8N?%u0ryTLQYlgd{d8OBmuvZ{sM$<Bw^#Q(YUwn
z$mO)77W}WGmjtR!py=^ZUCS{3HxZ@%SKS1*)g;>6PBmeqtBJMuz<v{Q4|n^&kpD{|
z9r1+=Zs1t*|GRSvjdile{8GtStteq3&@#Kls#Wqi1M5VOEQ=XZ-O^-*6tsj(7U+$1
z8p3$wIk_TMEzK3?CTrt;#?}pBVwa0?_a--=A6w6L@}1p^I#!nIEvn4gE1u@U$7Y&j
zx^#FVEQH(4^(LoL(e)=)FQhNF@Ehi}8n||HxKJu_3a(JLoW%B%QB|slsJT#?OLirj
zom{Y|tuvkzr(7v8c0^=(qQ*us!JLuker}t*!s*wQ3!hNMzLNx$2S9Cw9(+|aB2!%L
z<V)YZIhyOLd}gV_tNX%LtnbL|$AIPlmZ!OoVZ-RQmZYZSkMo>j8&*Lj#)c(UA(^U~
z!ZZfAD+~fIJnOX|!_%LR)P_=YiZVePX&7n~=M4`AEK6p?p}{c-)*J9X-58boF28vd
zh8lW{vLDp#*$jjEOr`s;DO1lyyj!byH}S66gSUpjHtE`D%W=^n80R`YqH_Y>M%%Y8
zGVt&Y-=rE2g?^x}ApyC1ZHs*OcrGh2u{eXON`R`$lRlo#s2}ZiuPJ?@6Zf|uJ=Lh3
z2d$-`ks<~Ah8u=9nK>)Wc@W!=br8>IO4k;@g}a;(yykCFG~-wByV9M<1s=S1j%S`X
zFP3>COi%(F8PFl*kll045gYzQ7kAyFoA(CgqhMvRTo8nsSV`EJU?PowFQ?8yAlL~m
zNNXLy!V;fu-Fajnnr=np^;-$DY~+bYX1vr<K8lEVsyI7_R3uC58_ve0yXxJc_~3($
zHwG&!2#*r&V<&B1_{@`8R)raj3=Vf%Obs7KU^XkeRBKIA7tckhtz7SvEZ3i#)!Rh0
zN!N<s#Z9A^-+2n(Hy<eaPp`zPM!Q}}R8WFqkIrION+zFGTb%|YBV<(Ym}!a}JBLyn
z$Ag#5ZDvS}^r)<h7PFKiRvDS$DNdyUb%^d5RqB|*$F9dPGMm{d3O~1H_f9KoeOSFk
z$nLTHD6p#2cLJ8l>*`nR&fQVj`zGaWjnU9X@&0;CY{5jvI^7x%QP?lhn&`VbqqQ9A
z!hV(28NEAO61Dvygp-l;L-xw@oL~9Oz^p3qbyu>d{F3bdML}U;yWpm(`tE}Nkn(>`
z`Z42glV(-3NhEzv{2+{&_<y>r?cljP?|*@T0|?{<P=J3s`n2b=K-iA-4^sD3XPde^
z?(U>s0VkhC>bLOwYvzwZZYPtCf@J1*J_C|}66p+dh<kB+wYx*dPVzV<$>i+_^6TXi
zCvNuvxf6<`CK<ZpxFn6b+rW2H1NV_k{lU)viM^M{d|`t-8QjVj=tyRN?>Kv*DXEC1
PX~7=~+&yg|0090AFxU5D

diff --git a/datasets/common_voice/dummy/fa/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/fa/6.1.0/dummy_data.zip
deleted file mode 100644
index e77110ad47766ca8d889731ebd7c3b369c4012da..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4186
zcmeH~c|4SBAI8Tn6p6|boudUA%wR}Pk=>YSOtQ-|rWz;9%rHWcEgD;dH)0TBBuADa
zk|iODvM*W2TAew1(aT5VeZ(gtInDXw{qLRm+|N9J{l52oU-$L9uR+1t*m*!JpRs%H
z=&vtp4_=Ts$jQ^i#oNxw!NWn}jOjKI8%4^*er0%G;svp=)7d~E_P>A!YRe)AHWtB<
zz!*1zyC)G0lZSwzicm!`1PX>gn5=#$!*RnD4>MW)+}iZ{(~fU7eIa1g+@Ov-j%d4+
zR~?O6Kp+QJ5J-`EfCs?==PK_(Bz@he7|L<``Gn)&Z&dRQ1hQ69K(A|Ds8Z9!@$ds;
zQp1T6L8_sw=%lDlubfXb=p@-_XV~_Mf+p;(R@a`Q2bB-Tixw%Cai~YSxJGn|xl;&R
zJ7+_4VX2Fi#m>IG4@6Vqeqs*TW*3&vHC7kX&RU@IhkJ#6zYeIM#XPA@*~li(mfcv>
zal`;#R~pzJAaL?f;6WL<_=MT5V0djXyqFUlSNO&#hON|?CNhSX*FAS8mzZLmf`vr~
zpQeUeK8I4An=upU{Oh>f+{8+2QR!q^TJl|J>~x_Ofu_GKvM^=ay)f*nU;D&Al~r#!
z?2%X7P<_GO^z1KEv%}T=4nbbWd@l3Y&ez;+E{jm`EPai@Pw}VE)#7J;)o;J1j@xPL
z`<HVqi^Kd5MCfGNW@HNLs^=VXc_uS$moi2h{gcCCKw9=zL5x#cPEKk8Y6sNyRIvZd
z($_<`uG&r$1C-91YVyph=7c3NsaXi1*-@71rzvfw`AYIJNtuC~a|TP6vl1>+Ivq8v
zYS-VMy#~QHQR0^dDq_a3_*rT9MZYJ%wDH%MKxuvm(0HYH%9fu$^Zht|sta+|IPLxr
zMQsXVn3dvny6V1R(Db2J@{wIa7RfXAiJV;dcaUeI#OU#SFBPS9-#L9mMNvI$jw;6|
zXx$!41jEg)*<TQ_;Vr-*s!?ESt1-c-I<Q1f>@;U{eXRP<xKOf;K!D(&3gTZa_kW8B
z6vJob3F*CsDQVGQU>fB92-V=jtaxZsR+t-J?9`-NTVYsvF_}wwU<CDSkI>|JtAJe_
zt+3j_Pw{C)D6Qhuk(zA>m&D7v8nPSZ4BSTt16$1n*mfTut+DGK5BpWDenz74**;`v
znGCGuqF;`UaAA`%G^S-fKx4P=KrKh+^d7R?N`qL{%7EgCrc{V=n*v(>2@GZCwIVrO
z!~v}c>-0^r6^H%6W!M(Ds~CijbEt&Cle&LDwd<oYr>;ppOCLtr)i<yh&<tS~(3kHU
zl{gE<rrzces7g)KyyDZ{_pb2moO?pkJ@<)fr1NmI_h(XNx9#Lid0Q%)w3|eF{~z_R
z+*8+T^9YLmCC7X!SmdfS?odHqxgiHtTwtY27e<nD@ENci;dkAZ>h-zWzYwp<2UB&z
zEVE3=ES{!PI&QNG&T!{g^8Ud1yt2cej%r_*Mp=sscpR6QNF988&n5ICo*aj1d{j^5
zYswy*Irqfw#QYVDL4}rqQorR=Z|~`$v2)&+y{#(q=K@$cPiWU{cM|1rFYe8{w(llF
z*jw-g3ND)jF+cel_C{Jo__+hVG%;5v0iE~&N4cEGs_xeiM?{9Gp)<FkL#@iUCB!Id
zsSaK>B5SBUpO?Izyo-4KsbO@J=aufoo8T_H-1FIagBq91Jb4#O7Qb^Fqk;;|E9ikG
zdVCXaix&BR@&4E1Rnu7ZS_9Vm1dM-h!MS-{z!H9@p37@S>!s;LN@PQ%$sU{^FGV=T
zCRZVMS86BF4{?jmDk^=#=Hhrf+9LN0E^zn>PWc`+?yF3aLU@g*3lr=i(e?B;wZ<Uo
z$jtCm0?Cm?$RW6em8B?uj=?E_OT?_awJ-E|im++1^2=CEr{#AiYu6_V9nlIkL)Tg8
zgg+!1_ryEGso;Fmn{`O5E?@hJIw+U9NIC|oTc>9&%WVKwxieE1t8Trw_2w{s?{m+7
z3ie)cNl)F4-}-mxD>vc3BtLDQYQhyIPoy?|_yqOe`+JqK*V0nEYWj-;pNQD9#t5gN
z$4N#nE4PQ=s`4?C$?!0?z(}_{a78ungf>79<V?bmIUR@j(lV=Fpb9%$YtE`18_a(x
zEZyDTlb|=$DsPT}sTA;%#h0TkC8v332u~K(-I69gOLjzmz!oKUj#X4VTqqhIsB~|k
z|8CZQqL-nEuB4O_ZqbJbKtO9CrNEeXAkKAj7u~{$YguLW-XqY|NXXMYBZky=n6OZH
z#uu;;*%f^wO-T3`SE#E(ZE47{p%A+FjK=hwy;{}3i&eQDX<4eYC|+?k@@K=T^b}eO
zH+^J&?)`%Fg4&KtN5nzL>Iyhib^JOc@!ogu^{3%p{Hh9N$AGGwM~vRNL`CYKR(UQX
z9n@~`vQspu%seum_g#?bkSzBZO-w}*N~jS1MrKsPD%{i740h>^Grc1C@$`$pXEvCe
z-_)dthtu{&_RhX#vHt`Aq<?oI>T!hxA}&onpC%Vhe1AdGT;xfe@V?3TPnnd?`iau8
zCb^P_vxxE{1IhC-XJZ#%?V`|<=Qs|2j(9vuf2FO|`#RuuctrMJG_-*U_u0r@%P^5E
z$V2InaeRGZ>mIsNuU*x=+I^+o<W9bq27H|dD_x96Z+e%*^~8{bR5|0kFSq;|67q_8
zj4@k<dGXey!?|tDnJfBj3gj%T5^P)L`n_v>0T5kl<#j{K=U7eo|DfOj(0aioE!~X;
z|0ePOoU}RbuaoFn8BCJCrhfpolKy|Vto0PRG5fE=0F!|-1K5EdhJD&{SpbaTp8!DJ
zQk`|`#_Y;SeGIspN$R&G`*Y^z493Vj$IdkKyIum2e~`vd@RhT;zS@o1fsriD$uxOA
zN&I}dD~{WE=P^RZxR{1A+<?qcH#TWTYVJ0sshe8%-?3LMzq!Y(vVoHf9Od)8OtZi5
XI9s7P0Odd+9^mT=oSseq00R9N(1Z0L

diff --git a/datasets/common_voice/dummy/fi/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/fi/6.1.0/dummy_data.zip
deleted file mode 100644
index 3498a204ca687ee868bff6cfecb13f4cb840cbe8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3864
zcmeH}cT^MU8pa2Rkt$t4T#7-ZlPG0%34#kqQ5F!91rurzE>%jDrAW9mJwPZCRw>d#
z5hCmYL0QA4XsDqqD9hpkYmkc+l_qV;xfAdh5LvSS-^`gg^GDA6o9}tw@A)1}b1n!k
z!2G=~^S9o3`S{=i4g+q%K0YCiZq7l@8kcN%0InREl@s$0zQG4@KxViA0P+@mSngUx
zVS5o*teUG|fPZkHnzlMZ?F9S;Tn%wT4S`^@dVYb+wog3HW_5SZO@Lso-fo{S2CkYL
zc%JWK{dzLNS&9Py6gdF^&ORV0z}eGRJtz>nAyi6^p3`!=-seKmk~@5w;ry}PlVbPe
zde*K@Lb4lk-IT?>%?(tGCnyDOX{xqG6E#a{6Gj+LcHaJ{#L}*5PI~r?LM6tn1yi?(
z`zCYk+?v3j3DXHaRF!{cm8Ox9&9^m!hfV4w4bPk5ER?>Fu{HH7fC|e?IvDZ{8pd(z
zt|b4VipV!=4^_L3FEz=q`Lzjx%*di-RP`Qdw<Mg)Mksi_RJ|gUdW;~37@9^w{nFy8
zl`m^aXSzvJ#n#7(F8&rBec0Y<Qc_EHZFl2-_27!3*oyOrMrtS}$t-e6H1<wAL+#e8
zd`KAbt-$prc_<^OaAjtq1HCfw^u=;O@*n^$U;i<sz4Ar<VISaKVlz5#w!Ycy(9&rS
zKFavQz_;}r%n4$(U^%m3RSax_AlL$R_FLeF!S2}rt0A9fa{TK>7vYWKxc9A`DA+4!
z5eRM~wR-aAxi)@O<TJAe*tpeZJWRHug{-8nH$K|)5HJ25txt~Z3Xf3PZ!^{&%DAw|
z>4BRby;WU95hc0=M|Ihg(e)m*AIG`&8x`iiP$8+#HWcQJd6?j{tYAnf{^GQ;d%P~u
z@Tj?7I4oxVtOa2{MTiHs5MmZ&2-i8?lN2dDX-otx1cC|Q&<p31%GxZWya&kv71FAm
zdYuGQ;)I;+P*;yvtHISLafi(t90X^YgmZ-d)u1v)L@h420*_IGhP{z>E0GM6>LRQx
zoUHOKvaUtCkXxQ^=h8Tpc$L3QrA2D(Y8h3lA4}}7U;U~p_4I`i>$dy-ek|=xa$Kz8
zUvq>Bg4#nt?KRog9_#Gw3Hk)a?NbMs<Nu{CfN%79Dko&rUC0}Nw}n$H#ZnHHbHobp
zKs%_?uXC))v6`dQCgW3u(VZBEEG>gEf6B`9BCmJjhbFRb$qkpa6DY5O#F$ld#8GrS
zssv&$^~1=S?;pyXT<piZ{m_V$rFle&8;(`i$d?!t_~XhY#G(?A#fkCMvcPBqATo6n
z${*=OSUAh;6LpyziCiryHxesIqaLt*R+av{2PD7>A}as2Btb`Cl60waAmrgvU0a`6
zAH<AXDMpvp(U5b7Y+PM*s+a0z@kHHRIg}oV5)M?Bo{OEzA;*4~6%amDah>LvqCqH~
zqGMeeAXB2EYqd`%<Cd0nePhV!yzg4W@6gNMV=5lY&dzGI>afITde2tzHP{zF2oo6e
zN%!5X=l{CzuDhwhQZxh{`UUX*2cGX2<bes;TX@>Fr*ix;;J_>Y*60)w)~?jjKPoKe
zE#j@qalC5%5p&?9uBXZz`AUJ3GCUEe^vb?vq752+j41Hvw8ENp#Q5!)<-ckeWVrlY
z|HWm4hR0-6Yix2Q>VmoSeIObbG}SyW{Wgk&Zl|B;+(W^T%o<4t%BxZZ?Kv-ZDr3Ym
zDB`v%vILn&1f=VsQ11w1EhC<hP)K)=a$u=_-dK95I%V;9hz8B+6ii*iynZ<GK#R~h
z^=d>uy%r-nM(q0UPuB5wo1}qk#Ebb{hoskcgVY?NO&BWAoMgl**3fUb6pHPRppG0n
z|3+86^a@e)!6Mp|a({&8idz!#6Tq)?Fg5}L%ieA4@DIKNm7WG~ZT6?!(|4!I>@w^=
zDW<*-2>#fCcBNq+ezSZiI&Vm(Ay={8Y7PoZmmbrrzolVIocWnAiZ1vbhMB3u2Ud)?
z)1SX|&;O<5v19jtlJaye;=9)!TP3t4{wRx(Ja{+F9T|N&`$30xpM=fxSBKZE&HN8#
z3E*_Ou2Fk(F$TAJvx>S!$}atyTa}|*ZAcunyD}R&IB+aAl{A6#_Ej&HpdEE87drV{
zssBNfs6xf8vm9qN1H6Z3DZUcO+6Zh?K$Cv)%p^?^O<iw)()od;@y>B-HB7aSOv2BG
z>B=5=O*~1jxtX9BQtPMPR~O>BPZ@HVXj$$-A=;!JG}lU3EH!q7X%AG6-$V6{7Y^oY
z|6mhcBz`MvjMvDhRm9y*(Jk|8tfV4OL3c8|jIaI<w@dE|fs$Dq|KX;SST_D<L&O|*
zA-st;^2ToX#-BPeOG}waX*g3ihV@@iR=#7J)Yg!w#eVxfHqX;<drD$<a7Ucv;FRIo
zEtN`s<O=~a2F%xO8Fk-gM*Rl`6#!PjbpxaA1^*_m?oHa6R#~JOg90{58`%|vF|+Fz
zm&Mvw+mq`i4BTQsA&%*7TXfoWSs-jnjs>Z^c85jX9@e%}<G={PCiPQv+nc#Fq-<q!
zLD*(~<|ZKd3vtELoH>iE)owqbw~{0Gu}x;3*LyFQx#PAUPg|kOP`0644lDMk+iiL)
z)t#Gd>JHQXJND+5-{~=%Y;cmnQ9cU(iOr$<+;Mh8?*l~y0ABF&1*eCw@fH6Atq)s&

diff --git a/datasets/common_voice/dummy/fr/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/fr/6.1.0/dummy_data.zip
deleted file mode 100644
index c8f82000515c785b28536e3aa24b65d126d53687..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4085
zcmeH}dpuNmAIHy(k&#50h~(NMX_#Tn(i-KG#<j_12$g#cW@L;B<(6xU7^7B^%B8|u
zWP38A&2=_5q;e0fYeWXo<&oz|ub8Yg```1&bLMr<oWH*B^ZR~&pYQL3H|2qC1QxHs
z{8N^nU%os9fbD>rKZz80+|8NltY%})5Ab9t9Cli~{R0F52yC1O0ALf~1HWbwMb09w
zG^DE!<&?i45~GSns-x6VNVGZ<jpnj?W|j-bCn|AS-Gj0O5W23r*5`|XtA+#5GhHkr
zja7yfAON5Q1ppNH04l|qNLHo#(LOhdo}ugX?vn1eje3l4Ft0%gW7Bp!KJ*+sUe3O%
z@J!geu3z|3u>vcaR;o(MMwi2boGhk$&h}Q9TIM|+?b!S_L0THG??SrP<Chch{y+2(
zE4x@^iVk8!Yz3n>UO4v<(jn$hV{*%`0tDe4lUJ=ktkEGiK3i^=YzuAsI8Gz*hwDNv
zNmom^v|o>Bkg0X{%^Im4-OaC#ybGz<_DWBdJNaf9h~QIxJ`(lxibvR!UYNXR6UxUS
zr2M|T`3<v{jwhQ;9Pb@1v5hs=exuV;Y1K2NKR)v6(!3-gvTTTYs(sFJH1JK-G~-}j
zuq`P{HQhxqBUY_Qv9GzvKa6=X;ukaYqoByDPX+uRSdrp=UGML1iAgD|7YHSn(=_%9
zZqf10dqF4I2gizpW<G!{?xkfh@}+5WSWpaOPz+V>#c*?{tt&;c>4k1t;pnzMlt+Oq
zdSxmr1fzW*Ka9fnD2&y2VF-sE>%OjbY4k9Lsb}~65VmFk;oD%%+No6B|4w<YKpwq@
zd7Mnbi|5Ygg&LG6ms*-r;qi2p+E#gT&C#~1soH!)4HNhDwkJt~WaBtNJqHTy$e!Je
zE~#0XCTU6k+D3>NC`gY`xR1k5J8es;61$t|DL9bzq!c$C|4ejmix*DJ-^|#e{>sUl
z0~P#rN5&1kbABE6^UH1sa?JjoJlk4vF?j16*dW_x1hJP?P!(M?h~6>B!wQ^yYb2do
zC5nq7@XgSA?Di9xG#qTc<aIy${we2$*WLD0EaQ3eU1)yYF}Y8+aElOwcg4kL?9f75
zDwRml?RJHUKYYAZ33uA3z7T$=q*kMM@;s#DjNUOFyv$@b)~fXBzJD)#HvE!E#u+Y_
zhR|u^8Bin?DAIS_i$rtwCW52I-R-MX??K1>g=<K3b8<gwQ&iiiawOS9d}QIo3xyc~
za=`LDq|1cF)WBLc(&KEDJ<xG4O%!Fq=ddqz*qu$H=cQ)y+YeT&+|eQrH?;<eKMN@g
zy>`mk_vOU^(J-wP7cyKUF~32*Q#>aVW99~xpji;J82W-vc?Cuet(`yF<@qBV>u<<A
z2=(y`NIpi}3}f9;ZVH4bt?U8Opj#BQwRJ0`1S%>*>_>?6b*Ak%Y8%jkdpERX(3A%J
zUNr7Km`1&GYtW;1Ose5!@YMi@tjt^!Mkew8>+unh&D!_gs?SnB`CpywxtTE7x;2%c
zTd`3gMZ)HLb|#hTQR=_V^+!i<BO6g`cl<#~qqs&*`*V^A8=I2)>}JN`@8JqevjI~`
z4j=aN2=<i(eh^mY=2JvC^vkG=ti;(mS9BCTQ%d`i-2x9WLO}b|{>^p4qkD=A=HHi0
z^@r)=gB&(ivxj4Z6@v?kgu-Fvx)Tlk4=h|Ok4~H&Gbm{|-F`a2V(#g0GrgQ{Tq(2~
z_d-+M{it6xUSl%;U*rD~jjamRZwi8DhW)eS&xd-_owA<Dj}uwGJm^gany83953^ow
zh$dC?zAH00CnItKb6#qD9^1zMHri^(@$PV6sIG2RmYj3v2eH~e@5R}fhM$+6&Dl33
zm>)acHIw#1qh_0x;ns}XTD2&K4E|=LtMWCdSG1*`Rb&L=XU{x)iRy%Lz*@8PS9~sp
zW~7O1&z!!?*J-hZ=!La*xa^eqT$6Pn#p8sSV@H<$^(zs+H)uzn+C^(#VnW77>MQU9
zS6IFw<!Ny2@B>QTbMv;0x|}VA(+^C{DD1$8w`TZo>a&d7ks>>_;zCWFcF!41LYYqX
z>^*TO%-c)0%d)=9@ozX#F_FZIQ2!Ih%QhJLm=N<ZK~=foqwSNq(~tb@7xstpa9S(b
zZGV6S*mfDP?HKN@MI^7DqH8$pakCz|T2`31Srg04cUvAMQ7zY^V%P7CqkH2LUU;8L
zm?-mZqNmL$UNw9{>qTgse9$M@c)1=kw5zcQ8&@ZYP%OU2YAZV|T70bDhhIrVX*Zn!
ziPcda`Vl31%R=Cuko0Cr9U3F2a!fB!iqD-9eoXnXZM3W8&n4(GIeWNj*z~-Z!IZH>
z)`kRCdsEe&ll1t_7}>WLQd7^ExvUW@ya%*aC`tcK`l!D6$76>8=-8cSU8gl#Qd<uT
z9fjA;#SD7aSRoTde(UYYB~dm!x{zFWO-0&JDZX9R?Dsz$$%=FW+Xp)_p?&UbM@O<3
zJ>iJ3t!Ki(TZ0{5#LKfbjfy`xQo0B(h3SE>KOjy=p2_PL8v4Wxmst+4i|KD=0@mbS
zhc7A+eK;fFq7<vLG6cGVXRXLB@r5r8jO#7Fas+VRr2zg93MvB21?Tk+a2EWVFup!%
zbrfGFjq7D`N%|bjLD*t2|I1}9Cqd3wz61mP0%V87z`qrJT60+-Y(+>1scWjUOy$g<
zE2(tQs9aLNrqT78tK-y4CI-ee^P8~%k{83(Iz8q*-&c}lHgHW|KJC|E?xN#351f@y
z9yr&~6~_;ERL*(3k{ZO%HFeeW{X6!O<yX&`B{sN|!L1Ax;F|sI#90f?3n~Wy8^IqL
L+&#V^0090AniH|}

diff --git a/datasets/common_voice/dummy/fy-NL/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/fy-NL/6.1.0/dummy_data.zip
deleted file mode 100644
index e74ef5b16c1a9078fef7ea36f7a1bb97d7ed0f62..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3982
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7Llf>RqFcr
zK<waOV28^GB0!IGFkD@f6jU0XR>;K2z`(}Dz+g<09VJDHnR$98#bw~&nmErc;pa3v
z^x!(%@9oE;DBy6tf{Sl~u$?cn`-~$yN^dqU$~b9qAyeY)owzqUK5$%H75D0Qf_KEP
zNlZ6m|10D&h&dG9pML(<#)$#@PvyS~Q_-H*z+T9(cyf;Downqxv&{3?%(C5aLVf+k
zC~=iyx!eFZuj9ND1;zz@Zjq~91bo~U);zH)NbqA5Hmxvw_+-oV8WY9Gfm|<_K3vf|
z&Fzd+%gtY$TX;6}J#GpOlzjI^L!oh2@!v{wM!k<8KYiQ9VHSOQ$*w1{ld_lpy!T;V
zX4aPFSD2Q4^QqR=mQxSV*|M|mI45Vhj<^zsvhumsC-%|jmOuU7oHl)1!KyoadrG4W
zE{0^-Y?wB|PxYVsOKDJoK?L_>2g`?|z~Gh#2Dc$e!JU#?Mli4^8s;4~;Big=DzeLg
z*?XE!<@<<|myA4JON5lC$=oRFjaz>G#f<JyKCAL~4iD-Z<R3gfp7|uEZQ>E$Q?6^S
z7PyJs&iyfidwtV0+y7n`n=Ojf;#d@}`5K<(5WarJGW55IC;Q#Z%ZVG7tTa@+#`z%X
zac#a-7@LL3iVHXSm|WgF<xKf{%3}Q<x4hcLP1kN5_qeiMYeABCE<Xnk+oGis0l!x@
zzYg4fFEYDodCw*mzXx+J)y%P0;{W*h=iAZ?y@jWCy_=G^wqn2D<89teN{Lf{{+nB2
z$*C(Pcs%0RsTq87A3M7KpLr~2qp!n!`QR3{9iKv0PyD?tVB*Z~HK}>l;d*;ca6}57
zW~%gg`69sYOhx4BzuNylHp(Ny-*aA4P+k0D(_~=y^8>@*oTTtCOU%gxmJX>Y@K^vP
zcPvRhcY<%9h#^nw^_jsgn-h<o=oCI4xsjE#uyxvGi@T<=_G{NMnWQJ5JnGROp}6PM
z<~Pf~?>sko-SNz<XKKffzHxpLb+u;y+`63hcgu3^ukT+w=jxo&+$!^+*EdW)hoxRC
zERAccVLGw1IoYcv*W^IN3eT0VIGW$ft(;V%f7Z}FRcnehr|Wx1rwfyc3mF83HY^J0
z&@*K!l+qAeV`o}0b%OKBeHQ%@j5|}0yb=16_G+8XZRK|%U$5M^tNUH`X+}oHqtivs
zpCjMAPTPH=bo=G3^HueCUp?LM`+YpW&h7NMWxGpnFaMkMTdjQeo?4@O4U%8;GUgo!
z4|;Q9vPGF{x&Ixd>gDS+&3SLNb#O+typpKN>pC&hjj!pHmTiG2m*z4%1q)-I)dH?j
zYc__wn6N@z{ohCC>opa3%1ieBxOAyrQ#Rd!<NcKL@qD{aJ(#sTsAQ|SWK{n_<MUbD
zkJeo<+#Oe1Tlug3Ek7boK}8sHoEni7rzNSyCGhCPQG$&|C!M2{b)DLBc3>9$3&f<=
zNcklhsYQh1^Xx=hP$Ah0EF_(#`SGUw{msGH^=>ZDtR+IM+j5R%HOH*Y)(=yCaYc9v
z_s@>+40Q^AGreVvIz?Z3=J*ykipjtJuruTyi|+ML)`un@oyh&sl_9CPE7Nhtj-`FY
zll@;Tcy74J@KnQF-lGR{s*AqAJ6yZk+2h!ei5hO^Sr^n=?`3sxHec0~O6lj?)v>0t
zW9#K>Ro4|aFfX@h-PY5nWT=tNTTnc?@y5X!EP9{!+&O>tg!8Vwd-n20Rj$vy^1IUX
zQNYu(xDO$z%@cN>_!zd``1IOmo^|zC_r7QmS}Pa%(Z8|j)45&j7vAkXkv>iH^HMXO
zJbkTCRg-s~n)-0lNf!RYoyH&QzO%n)N6kXtWtf#2fZ_cch%HIVLYaA(1tX4}bhiI&
zE-2*BzY&RA(70CKTW(>-MNweD%b0IVIa1V|6R-MWq4JdJ$G<xW{mGcAIyEuv#)dhG
z>)nJnWp00%+x_Oi&uMkhb6C_ob;2DMS#f%+#VpD5&EBg2WW#jB35LCnx25^wj(mF~
zck8G7s=fWkS|(bum}gy4`*kmiXX2i#dJ>cRq;i-$)6y>bMc>{zEkI`1vgG)~4=gt%
zB(HT`c{#^mi-p^QJ?nmceCK0y<oE06#}~|Aw_5#9=3SmWqNZs(R~YHLKJ|6=34Nqf
zW2rTL(vpb2&hLvqy;09_a9;ad^~dpwbM_zaJ>N3jYrzMN$hrHUt@diVuJWpA_jKVc
z3I87aL25dJN-gB}rv*tt4h`_keALz#DAX|;tBg!?%(&XT5>O`yFuZjHF;QB;tdJHk
z2?nY#APoHL=!$9}p4Kp7n=sqL2%FwIE+*0@Sj!k_AE;%F)n|x?8fv>3VjnPbFf3^V
zl}$)Nh0|vc`;c16K#Ot45yE2B79z66WxzHI@fO3|i-fJkY#1S1`wiIZA>LZF(g0{V
zmL^hwHz-{q{ES+IBU^rol@!Yn^*LcrgYqD1RfBBrOm<T2MXGp+vlz9!MYfoilN5_F
uN?&}|L&^cnvIk;4Fl_@9cLPun$&n2&iE!D@$_6r>3kcg-7#Kc*9RdK8$%*Cw

diff --git a/datasets/common_voice/dummy/ga-IE/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ga-IE/6.1.0/dummy_data.zip
deleted file mode 100644
index 6c207721ef8b523d1f70931a55c4249cb2b4c4a4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4096
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7Llf>C+d2-
zLhRsQV28^GB0!IGFkD@f6m%$brmzGf1A{UX1A{S1c9aw)X6EUY6qkX6YvMe+grC#w
z(1Yvj1j9TQMIP7eBCajIt6Z3Qc&1D=bM5ljdGM5ku>#}fTQkqjQ)fTHdg<NKLvAlG
zFZ;ZGUaEca$t&BO`QC6uCuQ?rnQ-H0MMb<>RAhwS+wCngZ}{k5=wwV%kBk)H`nq!G
zw+i)Ik-QmeHhF2wACs87>BZD}TQ&%ovL!DOF-wriyzr#O{fm@)daIybfQd@Tl&Fkm
zw}%-z3CE|%Zc)geyO}Ga;1<uZ<o65IIpvmVR`i@Qiph%2d!DxBFb}`}dG^HC&>)}w
zuPbik)z|-x@t!AKzqjs*aG^!vgW{caPu=zi_)iGu>rK{@ta%l1+wdgYw%A>X@8VaO
zKdxFgH|I?{`|snAihc;zXZ_LD-N&%3_x<Hb*?F>WKj!bu2PGp!BqTgy+wmJ1C2YV*
zFeE7wQc}wZMnS)?-%$gemg~<%_A+SH=RQ{DZn1EEs^TZT(s$l18w+9XFqwto%KQiC
zscN5(kju8)uXe7~i%~B`++x<lH<Ih7zuX#Pr!ZyvALY!n8J6t#8u#Q+-yr6%z5U7_
z)9Q3pw#+r}_PWW$yy<m)==ON6g0Tya#ik2>yb>oka-vhFv|QwyICbM*&Sl~nQ^mY0
zLZ+SMy3(cU{dj3fn=Y$#vz(^FBDpy-fo{zQc@BDhNWFb(ZQduFvnBsOnRtq97yFgC
zJEP*v+~`N9Yws5BE5CB}@1Kmr`q?vgSJcTZaBzsd6c^0gx99GMnv*mC1aa<*KUDwB
zWdEG=r|b4D|C#R)j|gSYc}YR5AMA+81}1fGU?`iD6v}0ZIhnwcAvFaa)}W-0CAsGY
z?CldV<XKnjFBt30>NUs0URQ{f)yYZy*3@KcW972+FzekPY{G-?1i$H?XdS2+6Y(#n
zew+R0!i!(4B96>?X*Ii+AvW^t+nOIcV@u69{f+wHX0mc-iRfO5B)PfIb{NW*Zdi7?
zfnPw?J$Io8-@;!n_7oekzH)V68NP;-k%?>ix>!N$U*{Cnh2jG_T$}himM*@`6i}R|
zexsO0ENHr`YgX3Ts}rQQHMum4SeLTPu6p>k<W}7I*a&{%z5oBbWT-pS9bh--+(DbV
zIq&tqb?T+;*SEgxD&~DTcYFMll)TM<f4#e1w)@Y8iCcH;y{_8Y9jhLtS5zPQ*6Z7q
zzVFXp|H(LDVf-eyyOKf2ZkvwX?jtMgQsM;+uCcu0R%QBhU}|jd4TsI@B?ok`>D;M3
z{UZ8+(uGZItDFT=@;n*>EY!*#RF_$<*tX)3P2BSvfxHs6nJX{XFJ)eGUHRS;`@8es
z*8R`Tyjv<=7yYeX<qsnwZb8Kua@-n`6t^X*#U=1)#ZiilM(Y4YYw=H=kSD-`@iZ{o
zl2$wAmt>?C5sKHdeU5%d40v3@MdRWfnRik+g}BX%TTd0v30mo^wd}yk9Spa3N+cfX
z5`M7%<Meye=dJ0UmK-JYre@E9rJ}lvK5X-U|Mdd*l3#&!@$*E}bUou6U8Oj)MdP-v
zeYW<hwd!e~#|?^06z*~IZcINqMaEUy#?r^bG9{!m!}rbAEI0Kz$84TtT%3GyV)G3P
z1GjAH2Q%iHMz+?db<NuRp4~n`#a!ylk;S1$&S+Xs*p>U$y<Bi^+P?a_Sr@hov|Zj7
zU7>Mz=IgJkt7F9DxD+HZ%9S7ff5mQm@5}k?=eG(!z4GzQ@t6Jo%u(}+_}Rp=PGDfV
z0n0#3lJZGr9%d<sBfFGNu<esJ<Y~=*t69>h^W}fAZj)=*mzTU<i)@(EOxoPfJ+s>9
z78S4aCF%AG0l9<jP45qHy*y)i+)>tNN_=`53Z1u~eyN^c(y^<)N1roYC|zpmIwpae
zn%86#Rp(jd$LyQ(D(RAG+nhz7a?S=RA$?N04Qo2GHX2ssDp$GtvX_Z0NaERgOJvoh
zJ-SOj%EdKq6!D6<P|dVn(d2fKN^<wAQ{u}HY+p2~e4WLKKhcf9x4k`IY<=k^&;7c8
z7B4fJH=W#@C$3#*`)<>^{{6)(z3=n0M0!3ddOh`M0&A@g=j2@W6W>|a6}K9yd|iCL
zRb`GUPi5Wx>&N%Xf41D6-+iZH_j)I8<!9{8FL<)6n-13U97t{YVdJ((Bim@H#Kxj~
z&!(<ZyRsxCSnImL+RGa!8u__;Ff5pLwpSs2+05V|-`zfTA&a>}7B{DMai|%@eGv)q
z?bY?)GNpLsHBplWwS=pDoBe;pJDq&t?^M5!KXO{x>(y(xlNa1BovwZI@uwZ}mS0Zs
zB<$ln8u_#K=r49q+(TNtpfVEL*acG-B*i^6<}>qA+k2qs$7miiGRZOHYGq46ogl#Q
z))B-+X>+qe+T0`<sKS6S@UNpQs)2ag--K<#Y<(kadh58DNSk17aiD#mwm4RwAsT_G
zEpdo_z?{mkq>&v+Aq5pqpF!+HYL^2o#u-Nli&0yj$QJhiO9|pFhPOotTZ`EoL$>x4
zu%AV|wP?i>&~hvdvjA^Ux<vRHwY`9B`8ifnEJw5+2zwfo2T?0DWP6vglVUGY^+ue<
zs0BZ=#ostcu^6Koz-K+KvIk;4Fl_@9_a~qtk|P^l65+C)l?`M%7ZA3w0L|wH@c@PE
BnX3Q*

diff --git a/datasets/common_voice/dummy/hi/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/hi/6.1.0/dummy_data.zip
deleted file mode 100644
index 2570d76101e6d2820ed0322703e80cb41d8daaa3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4303
zcmeH~X*`tc8^;G@O^k4oHA~H88)FOA#GtZ_EK%BQW0^4aIfyX!s5F(bM#$DsmSo8;
z#V}5?hoekH)+E_FBYi@0n)B}e;y?45=XvJM^?mO9cU{;0yYzMFfGmKO4>{Y}@Y}=p
z%QgT%0OjH6=w*hoaJPUOpI`>iC5jrDulzhbw*hE?V{`xjaGY|{-!h2U#voQ$DJv&e
zXAd_iMQNCntc<LT6iikM2BR|iy%ft0uL!0xx*d8Iz@7x(>h-xOt7ZbU-Lo{D)~Q$E
zrvU(<v;cq%^#FHQ3p7UB-3|L~qpl~y&A&v$f8MC3s6%=+s-PpIWG-czhUR6j<5mRQ
z`tY>QtCdmFsR1F;IBzN8AQ5dlrAGeX!i3mZOy4jcx-Yaeq$JtlUT3I<1V&CjRBU{f
z?0dL{5Vn+D8fl?!seUe%j|1dp_`#d?K*Py6#kbrluTUbIS|&IRW1d{OjI1o(mhxzh
z2f0r*WzM%J?jGUcZ)Z4k#QoXMq;2vX1&aQ`wCD~R;%Uva!U8sV^9=Q1*?ZFscvh?<
zl0KMqFut<fap8S0L*{G6J&>X88E;B@C93vKc@Db2n~G`OpDv_#_aHn+=_zTlFjka3
zzR$!+($?mbtWf;<+xv3wDkXU8(gtzW2$$|JzKb0}IPgt5MNHlnX^v3Cdc`z>M>Rjx
zzMl*2gsvpfnhMO#2MrV{Dj+E;NK>x@${PE74NCQDWVk|XKXUofq845ig&z&~(c#O~
z66YPh%rorI!?6Ql=_1Ae6i`EujTa&DV%8|zJiojBc@?DGtWI}L?&sZDoxCa#pU{iC
zk|z$OPlS_u73&G-H1fibVMDdv*N}M^o}SYzE6ji;dZweziGcu&K)N0sy=L*$RWCLh
zK>|=FO<r95NQ~TB`zR4wJ8K9q+iOxB@TXohu~`#jXkH;XrFQ?8rKI7DND$tOV`z`U
zWx33%;v_NAa>6Kf4nwwj^niseRwAtq{geUUYOfa>4|nS(xq9!yE*a}}@FYHC78H;<
zxZG|y{%8IpY(%5)_=Kx>{lKn)AUr05$nBlU$cQq%?kAxtNWZ(ar*;Nbz5Atf`cQni
zfw~^MsCD=8z%G43q7U*kX7AkV^omKpZzaB}j#i?jp}|b56po^fJVhOO>eazoIG`!b
z#2WQOqm>%@MV(kj9l+dzfr9OAxlL)Ntrkx=wRzh3Oh9{YKMyWYy+JE14gsZoRSnpU
zK(Oq<5j^+7{1BY4iOMgP+lfbev}V)WYDdD-uaxi{nH3x9a{09AjNeul+PD;0TbNP!
z!8B1^j907h5C~&k&|YdmLU`0d6ja%oLkGJo+C{4VkW`KVYT=wS+E2!5OaJ}cq~3oJ
zNkbDM&6Fphk5{)5Ep~1(Hexcqm@Iz%4bw$PRlrfX`4Bv`*+lm;TWRmWL<#4M3vvWW
zyX~fN;L;SMr@kM??vtFn7rJ|f;1x;}W9m*77Z<wD@LA4%xl}efM*4!k)Ydbm-#@>Y
zS2UWogr4hxn_ZbAn=RRBu`8H;D&Ch}!0RNJDF+7Vv(mv#!OVw1N{YuYXDfx3N%Sy=
zFc<=IzU)&;l&2%UG1@9i^MIN_%kT~)qJd5tC|H3`sp6S6xG<BpHyXz!TrZh+;oSDG
z>~(u=^FtXS9gS}eN2@a=nSBQIeg5L%)Xx~w*9R{dX4_be>-C0GueZCkoBI!`w`hs~
zm->H8>O3yxhXs_<PX2GlpOd?twd?P+cEpn~oG{Qc(zwwe4QSeL;viCfw)2H{m%}Rt
zZ+dCMD>o=pctoIx*y$i3PREn`FiU<M%ZczCLwtVy4~Ei_ad4;4jfu5vpY2`hM4XKN
z$i+5xyoH-F^X3*E-;<MfiY!Ie3E20@$l+N~95J)4a$1BJh?n&jA@*rJR8*SqfN)iT
z33$0WeCQoF*s#xV>g@oP8wud>a&wnhetI;g-OiN1?z<a+RmlfFAYO)7M&D%J6<K!_
z-5I0q5{>|&`9oa1F?Xk3d!dCiHBcF7XaWa8ux9|7UD(ziVItH$kke-N+S-M*-)GV8
z)2os6eDfM)_QdjIA*GIyqB{9dx7LjJ`0NAlOU>$-C-Vmv<|NX0J`f}|)p))8Y;xFH
z<5+U-rCzh~i|<r)S?fx6?hMk7Zb84(Fn2ZN(g6f!DM?6ailbCzuN=E`Ec+@dPuTwL
zp;n-4UcR`BSX32v7^?}GEDCwZV@%BQZ!lqSZ+GJdu!zTb6!`D0|4K8{j`O9RyVN&7
z$I3SG^>N#ooO0rwa=p^ykL&9s^*%P#hu7AUmxadUGOp#5XCnt={ZfXGEw5|*E9NL8
z`@vs2l<e@LWQQX4jR1|=-1@iZ2Tf5VZHh$-C5;-*FpDifU%-)_L`=8p4KBiL*7*~)
zDPV5yy&@11!BpIw<VT2Jw>A{+pPt}kW82lw*sAjQ<2^RZcSD?yNT4%eZ_G{ju^*p6
z5k+?U(}a$~t~!QYtfrep!p16(i#lE5dnWL_QiIPj3U*5y*G!OOEs@k2y^@X6&&)o{
z(x75moPlG82R7gW0WluMvdW!Fk&jqh38+|4Yfgazm~7KcgTE!raI8rQdMPt#e77O_
zWxXV4ivn`LPf^Gz-4VolqNz2$*XBO`!1jU0u*qBVa@CwUIhhr^%S0D#hCAx9^Tjx$
z>T92ZI{Gs!#C%eh!@P>S%SC*wl)z<XW_-LsPke8BbA{_$mYZuSL^v>@Ee#|5>~$R@
zU6!q?&pfU)?W}c`SE^*J&J`Zc4Y8Qlm@B)$K5%ljQxtsL5B!y8&6cl7x8B<;P%qtg
z;m}SS&FzYMo2CF7T0y$4h6$eUd=OwvZRK&pXhFX^TKtEAVt}=POKO@M1OCgH@q5zd
zA!Cg+rj|k_>D$Obfvt=jzg*Uu)!jI7timWI21pG6Qf}+^Y0G6%VC%*Y3Uy0$)~Fi|
z_Im1NN^Mg~{b9F%&)jSq*E1nNs+m7^5(@bj22x*VB^TFLyV1a`C-X2+O<uDzzhCZ3
z#ce!F*F));sD`dPSgE6KY|-nf!j#&iQrJxm``_5BCBM1Htg<PYOiAU}+o)#$+;O%-
U(^HfK09Ytr3?+MXSXci21CypCJOBUy

diff --git a/datasets/common_voice/dummy/hsb/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/hsb/6.1.0/dummy_data.zip
deleted file mode 100644
index 9794d0b658a5615407f5305c5be867db85b55a3a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4363
zcmeH~c|4SB8^<5}9@&@38bgDTok+I4vQ0zT&7kbV%uu9ISyF^!CnQTrLnT5~nqy7W
z@Y<KkR+3I8CCPc_=#!D0`1k$1GoN|p`Rn@L_x-!B>;7GqrZlw70Qob1!yo?7%l8Kh
zumf-l^z{vPc0=Nj3f2fFfF?)6%7y$3Ji!7`(az8S0PQ>QVYy)t$+baTgP^WxjDH{&
zswxkIDk&-{LSaf!7;KZ#^RJ@TEKz2Y(LJ!J6#{#n&PMC!2Di-!bmybsG|fwOc2od>
zp#}ih76>>D($h~KhYkAYs1kE@TozJwem<)9guUhjMUF^sEVDB()=Mx6v6J*G7aVvM
zH|J_xsZVeUkL9uqpx0y;RQ=Q-r)OfJGN+6-LhJ0+AT5lt+KPUX<cV;SuuXgH+MFK4
zl(p>nB4#XR*hMlgFdzxykzVd2JSkpQYLgn_DO-qZ6TtXcF|pLpH~RK4w`JOU7#@>i
zw({mKN`&6YG=bJW6&z5ptF&vud%0;urSO!XO7UJcEZ)^V_Ku|o0~DF%+yS8gZ#_&!
z<u1)9*n5OPLH1K{hNlKY=S%7{`1C}m+w)Go*NF2glGrV8At#*R_eeq|U?wqyA0Bn@
zf!N-UuqQ6_>?8;}!Vgic990-|8_UNqsN2T<mH4r4VB{-@AEWfe-txw`%;gKz(aAf;
zmh%%Ic$h1mu3MaA`GPEbCCJseFnwzI9E-3qHaL}Dw|c*9)7*`kE+|1d(@6c%LdF?p
z90|{EZx!~(XL@oHu8QcV@J34k6j2=%QDF-ry157aUP3)kLRQ0`7u*_vV4(@IVsaEn
znN}xO5DPMoUUP{f2=u~^xip<&BRLhssCcWU=6<Vsi`EWizWy<_Jt3shEiarPk3G4Q
z3;Lu{<!hzCWKe<2>n8wAs}CZcEn^{Ed{J$GmXSKL;B3EYxQ#%mB~K*L=4k3=&itT!
z`Tty@O^(FgO^WOql}rktpZ2-ZZ*v!ZS(IogQK2N9*wuhQpj1ttcQSR}Hef5c<`!uS
zkCPW@^o(7s7P&r?hEHiwF7oR_X7UU;9U9d6ERh@f-ZXzM$!2tJ{$-VPZ5<KHPQym<
zb*Rt5E+v*<;{8l2&z9+oFxhqF&638#*91@Yjt?U9VOP_1pUrZzo*u&I%u2;2GEBW}
z?&+v{7|r7RivKuI(3HXu|KVPI*w_+<E+#oBxP3VHofc4+<Df1oThJv4>Ej9ZA$PYQ
z>X3`;7X{{<q!mA)$PwlJ4uU&F=lt@j#dcBmZ7!_Q`u29F+`A$1RsH-_m911~*r-3V
zz9tfCnsX$(G~ETC(kv&;g^9isdQ)u1)dvvc;|lXc&bL3FIF#ez2HUw92Jt!Xd;Jp5
zCHzRTPww`4L{X(9WL8_vCbnOFY>@LWgXah0qLRBCWPM$^1@DiCmowLW<dw=-eYPDb
z&OX-q8nxZTe5~4r$?JW@4$XtD10HOJ$&A*cV*XUIa;zSr2B`E}`Lb!%16f1-s&Gby
z0+mlgdA(X5WgUpPYR!r3A*a49AcrmHt`s}sQBJ4Tno_^;mV8<$j~Z`t2w^O<&~4p!
zB7JAt9*5l747Y5vCe%HMWepD?hrXpH@khB1k44^HLq2a$I(m+JdTWOG9fr1z0>Tx%
zu`=n1F{7I1vmZRDyly0MN9_*bY@p|=(1+?Pwd{(#OU%}OZ6{zWa|1O3k$^)}%7((;
zVx3vq(DS<6diVy%B}g+4Y;6l2WxbODQcB4rgr17<9MVwnaBX##u}sOVS#Da}AiP`6
z5ntqRWbb@u5ESa1W_kseVIDp4#aCnU8&wf81REbb=<NKGt-Ck$v_wj=<AR17B8+Nn
zJyYh5;ui7-=Z?kwNSO_K^8Zuje==oubz-Ju!B(TR#d{o$^Ki%fE@vq2@m!bjT=nzl
zeRp{1+?6YvvcindpDlz|Gabvjnx|LIw=KW0IIB2#j#PikH5%)qY0Mh~?YsT3PrNT}
zLdsw;hQ;%n@QbLis5_5S!teH(G`b)Op@CVqL-&=1x?6htMH<mqJ8~Og@D*jyaFkKX
zQx;NuFe0+chFYmq&T0?74wGYY*4)ON+O-}tdJbZVFk)#IvQB;?(40}j9oS;(e701W
z$g6%0&LLC;$qv-IU{;%96eGT<{NUgP$sUo}uA3(Z%qom)=rTRzhdXSw8&2H+Q>DNc
z=#Sbyb-RJ1R!maeN#PuqQD|>Z@3`?nY2}uMv1M^S-R2JG**$x89#NTzaY-Ey#r@@`
zw#Zq@cxoc6ippY;mb8t;beQY;2w`Mx9Z&F-FKUCL=z^lCZJ~}m{np<A8<cZ<0*q|2
z3_**P3uw$Yn#f8DPH8;!($47%jCmzvenihipGG>bzt(LwQm=x9n=90vTNc_u9oMFF
z0`>^QUCk(%NYI?geZJrJGfptfSUo|#bIOx9{75$H03%JxP5s>;VxQfj9zi@s-r(_U
zA7iEYZ=Uk;o5=3Bc`Cy8N<~IW^|x-K5WK!Uw<hs=1NnCxyL&MgWmWO;61<nS&{5w3
z-V4?mY{ORL+Lo?CtTtR&sN$dL_0#kPOR(BHo1%9I2OOGOVrDMKx3G&g<zNPNqD9k`
z6YghMbu3_toa#ybf`^8O&x9uHj6v#XSx+SdN9g#V)$%#*{Ou;ClA{Ue6IK{PnBj=Z
z7lx&rsC)7s{LF19+NMJ^^u@B_BF&e?S{X8h4T5V!T3LK|Doz-`G0B0`FNZEk&-1N1
zd9o9x90ydlAl7%OJke|IFj=p4lLHmC2+c+V2><s5Z~!xU<kvOh2;Is!vW0+>z-quH
zJ;SvD|7A4!J!$=5vPzoKyShoz%9sMu$YaVc_qFP8*A6KwG_b6If(SbBkK&*<+!vWf
zF|vTz4fR>YuC>*a*jBKhH;esYvVRX<?-?nfDB8_Kf9f9~`WNoe(v+N!tNUGRJ1Eha
z^qWVonvmabm|S;jZ!St|JLBf56!+NXuxooWB^Cu%=w`)T*Qx&vzEb(?d(8?Q%x5s6
d*;zLa|GD#Q#HIt)0{~|58w93M7)StszX4(YB|!iH

diff --git a/datasets/common_voice/dummy/hu/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/hu/6.1.0/dummy_data.zip
deleted file mode 100644
index 3f593b169567bfd546fb47a82800ff282ae82964..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4258
zcmeH~c|25m8^DkC+9N6=TPBGa28j=mt*h)~i3wR|YS4tnxJK^Pn^MNUO}7Qvu8S;V
z%|4PnlM0Cvp;4BuCEc;)JyU%`<u>=f_m6kxGv}Q7=lP!B`Tm~gJdc$r8#@oc_>ATG
zB0gSxKJWq(01A)A5>BAp{N0r8Y`Fopl-<^@j2}Lb7hqwZVgms7Y4BjRX%M-MK|BJW
z9zJJ%@i?dk3=UOQQB{G$RiSV=lhM;N+&A2!Jd@Ee_)mZ>w{$nVeGzcgTtH8{JE8>X
zdR&7A0CZUaK!tgL|5-QBGcbQ#z{dv_m!j)RC+U9upgQ9^u@_YYFFml>1_O+nS0!+!
zJ1|9;rJ&7<Dz`~6#9c{GY5rUD+k6terz>mUt6N_8X^vvE$dS~kG<ee%&u<~n7L;G{
zyRVGy;2$Trcf||H$EdN;X63{(LWOX7yie;)tk1R;^<Zw099`2Nr9-h)MMa7j#XbqZ
z)FAq7Ys!+?v-c^Mm6#oA$l8V>E3k-nT!ULJ68)yln@TZ>UZt>yM~99EPMPkGvLeYK
z-NenE_+)gS?2_nRLE_Q~O=SfBWBP-4eZ$qBhoPU!B)8n;A!<{n2$8fn3<Z)me^k>=
z?zw7LY+9pm?7Skq!hhlN^h&TC&s8&rE_|3(7k{<(qtZoDYD;l6$CSq}*+n*!RF9{x
z$ChtqFA<!4(}g*!n&1ylDMV>qo6D}(Jz3@Y_nh1d#`aqCBV1;BR0{O73-kkH-VX{L
z@IyPX2QBV_cD!*sQb4}M068H(!DwWrFMKxd9<RGkgHU<qErWg{^`%Q71Q}FC2vW10
zJ9v{5i8v5guK(t#H9OQ~vIhUaJ3W;C4o%-Bvo)eLuwP7?WUi6VC7hI}xq~hZMRW~$
z+laH}lLRxd97nenZ0)b0rjduT?uK1&7k8et%BY}n{M2Wgw%-E4I41BMZmoKV++k>!
zZ#*HMv(u5~Ih(OZqF=#GJ6l+P`eVK)1edpO1x%bnrZZLB+j9?g-f*`J-f1vA^z(<l
zxl{KmcEtpn24BRQzmB854{f~k*i1NUWeNLoCALsNlZ2GbG8zd#vd3YsmZ1WwuCe69
z{SS(jBK4zcM!t=Z{Fb;vGSJYfP&ZpjE1a>+m0HtqB*h((npl4_64amvYEWlhLx9_9
zPq3b#QD0;*(&Rg-g0|K!c&5U4`<+Fau+EjmJTq9aG^Bmv?~0;Y<tsi3XCp*B5w+CT
z+Vo#dgfH@IQG6+4ugEk){k-ur-+E#`ONv^=($sAEWKDQ+aPn$&A9*|wR>O0@65ezN
zB~gjR+k`4x>4+Qm771_*vvXh~5n0>0M)`9^+*{ehYAlS?N_sSThm;l%jVBs-DY<*3
zg}w1mXgC!?@^2-!CRi36GX68CA9aCfY<J8d0ewkU`EjYvgDJHW1@9v?1`nwW<EzAG
z=)o_dhlBZM`Kx{26JqyfYYgur$hr_>aiIeZ6K_+C%lt-ju4)Q&{Px?6q7yTAm^7oj
z3FjKQPOd%VfJ9aCQ9fJ#@w*BG207PF_91Ce9Vl6cG3%Z_C&2X&Cma>-rIcuB(>lnJ
zA#vC~Yj2*RiDYh===tL4{w9*#o-kHoYW!+-8PDmWSB+EhT66w?)g{MRO!xiT;37aP
z90`_!NL4>0BnN6Qd=P7FeI9T#ChowVphxyE9aHq8;|50b?$9z6l4J0z@}YWb<{1TY
zy?JHk&HJNq{$HeTQXl^>{eM-uARE%#5=?jazk4hC_+!v#e`GzsOF#NJoD1kGqlv7t
z?;dcC+|q?JavhONSYFLoP8BPwxTfa+1Pil|vCX*wQ`9(r=%sbxspAWcm6bOp<tw7@
zlDww<v}+2hu?6{39jUg_ucFJ<tF?3RW>pi&F)xn^&6rupf}~g|EBPmZIvIUM!K@&2
zqFuP5`mH@$Kfmc>;V^8Z3SNmAMP`<4Av*mYUliLTalvTXCDEK4;tkDBx+0RxwG%x*
z=%7+nMV(v3PgnL2&ubb<|CN*Zgw8oVFtcQPK#Y0>U1|2q!-+Xssa^<HueW2of8flL
zZkxIusbg+#%I{kFyAR7wF3zIl5WW$@a@xRhRzUXUbNXCDwRqLuX-6HA3zC8JyrJDn
zy*7g6X9!g&8K<9<qk=%E3d*YX-Qs|cjpImN#2E6}OKx=tgr|8gTt8#G>PeHyPEm)~
zFHk(oA#EWI^(-5+b(lgzod(76f#Nin&sNVf-&X3)lJ<div^boPa|n5kctlERK7H@I
z%F8lw-GM^{QAtARB=o{f%Gh?TCh7M?iA=6oi;I6JEMrno*zEA0{S_ts=VTPat4>(@
z98YkbS-`I>-Ou>b+MaZy-Kf^4Q+&)Z$jd2+eu2uHR^%rXU&P7!$f}fP(nB9*MW{p0
zb9UPsxI%>}=MN`X@w^^Q78o>&w7a9~*4^u%jEH!p+s#&T1!LAFe?2jm7eYdsQtlDU
z(N9xo7IN8hD<~|Hg;EbYmR;9cR4*m<#LWxa+PF9s_)J$J<F=RVL;*?OIQi|HrmK@q
z+|e>D3EOxn&fC>P+pMfjb8B0A>+Xc0ou%FPg++D7*>F+cMaEF2ArbC4S`s2MH0A!d
zhlP-NceAv6X_5zI$HB|-yCI7&h9hTWC{ismBLO<F$C@>59|lp0bS3OT-QEtH1ww1b
z3~rL**xXTSs*ntwTb@|zif)Og+r7@F>pbt5TP+S@W4Qg)t}`m}`gVOk^KL(z_4L`;
zuQK$0)7Z1HO0sP>ABcYD3j$O6jLQwv0>`Im;a>!l1J(kr=o@bg_z#oAk4fLo4Qr$+
z{R}2aAEyWq#+V|$yR5Yyb>j^22?pjMkog~W@U-qZZMrNFwr-*Tshiqaqi*bs*HbgW
z<Y$ulrBD7b^V>FXJyVgLY35g@1|)x{A+$1O?8UX!Zfrx=lOdc;lh>M(A1{}YaT{NQ
z>!CZjn1-%<DKbahSi{#-A9FKJ{idS-6MM57^N9`aWN<4N@-of-y5ekx<^atB01x;*
M1MVJ4KE}WQ0(j*x`Tzg`

diff --git a/datasets/common_voice/dummy/ia/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ia/6.1.0/dummy_data.zip
deleted file mode 100644
index ceb6639e9085a1ac12253ba63dd304f2340e5c54..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3767
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3PlLY>dQ
zzz&!5MS#BMV7R&{DJbIUKGByz-YZ511_R<PC@D(J%+o6=E&~VD#CdiJKd0HD2h`c~
z-gzvFJS-Qs7IXO-Ji2ZYqZFPYnQY}@((}vwI=^u0?AkKDld4x2N9~yF?fK!zXYM}1
z9fF6y#=NWh#JF<*t@oYUhfnE#<nu7GTUd7J)Z(pq%5@o;IhTs&E%k}aoGP<av)X9M
zo~<V%&v>@anfo{)l;c?D%(b1BNsrES6_y?RnXsgLihE?SORB+Z2}2d_xgX;)ojk9x
zKE3Jd)xIKSiH55Bf_=GvZMO^Pvx<vOnZ4zD|J%$5leg==i;J8RJ0)tO+yY0Zch~Et
zJ-mHSOoQXUSW=xrxw_qvKJm_Lvk!dM$xSs(II{JB>Mu4>ia-SE_7~1-kAOk?6d0s>
z#0P0gY8k-*ofw#P#DK?T|JC3pEZw@dJBwT9DQRq)Bb8*Gv+4Hh@aS9T|9y8qbAacJ
z^B(cc>p?;?CA9}ux=C@{^gX?)Z>jk8{5R!H;Tua-cQPAh3w=AcrF+}9Ro~94_ZY5a
zoa}CN@rlJE3-Juz6q)YA*Hg|&c)G=GPCZw(__?Kz#9HyFz1xK*D=MkC+4Q9q3B4)T
ze~@w`g~#y09QD^}(}M*<J3h~F6M7JrU4K?r$9v(?b85Fb-i7hAzptH8tdYCm)_dmf
zW-&#~EUSOdx7x3fRQu3D@yB+Zf76$09#~i}If?CE*8jAhOo)*5oR<_-wlVFqH!u{x
z0I>=2Az7A~lL^elsVVUA1SL)^2{d)0Z{86Fp4RJML|*P&#qfFen{~EFPe>RyPie`R
z<hMRP{Yvb_)WnyGe|}!PQ)#QWcNK5{>|0A8OpU0`pMKn;BE0m^n)>iPn|6JQ`nhg`
z$CNC&7UR>aUn@QJ_Bi|VkY=W|l*sf5rRPhc54)MF8yHqTV^|tGIalGnQ=YEmr<)1u
zW+{hdz0ftTc-&mLl1cn!V8g~69BR#*Pb^csAy9LLQ@PQ8|GKVYn|g}x_|?3u*f_!0
z;Nm`g=@sh@?-m@&-OcT6U-4@9l2V(3`C9(-cijJF)c0}SnoC}O?q4UBc7(BcE-6j9
z)|<PbkIA`2t*LvfRJ&!CicPEKGmmK}T4&8zS?K0BDOhyk)iaUS0uE<3se4^fI4;G!
z{o0HLFM_r>UT|n-ns(~{#GY>xd@7V`WgcC-c>I@{uFYew?S|!vH4AS1`1Qo1&aZXH
z_BnpGGwT=;5eF(&kRwi?_=qb>EiQq_8ID3_G|p%gXG%iLWxfK7t$(D|1o<TysYQe$
z?d*Bmyu$`OtPdh>Tm1wA_D|ElxN^5+@D^o<O>=H|t&g{SaceL4M~)ASe>BwZozryP
z__$KsJ-$L&EO&m~*C)<Le(nA178AK<k82&5%B$&0;fK!b-8yYw-HjE_9h)y-%xYoq
z{xGYz(88L<nI}z7=|{txuX{8WEj^!*S<=jR%IMrREj~ekiv|m>%CQ~x+gt6&@30|5
ze2(07$C(#(E*APn_1jgYIBuO%wRd*M@|MX`&E|`*Zr>#R@oGw9rMu+i#^3kMtUf-O
z9l*5f!BxhU%BNX{IFJ6n62b8KTFth%7dM<ZGfBU_M`xq$>L-hn1D4HM?P4YUY=5Uc
zYL*bwPFVUJ7`#t_!D~i*mdMP*EX{D_jI;gLd7#=M`<ci&R;JEh5$m@3?V597f#xag
z1Xan%+dJ#FbZ>ciS-_Zqy`n!q@|E3l4(YeyJH9JTW$nwgyRBE=@pkt=w<y-Im4W*j
zX8cyaBX!yLSC;PHeK$H3j~q?h;-R%d#HP!zPt5()293=<fj=D{6?)FH{3@;EpLb-z
z+6zf*rwND(N(QD}Tz#?Q^7*-k12=lTXgaLfd&@`7Rozz6&#p@7L6}$dgS2Uum)EF^
z?%{g4HizwAw%*L!6SB{w_p!hI#3i-i@%*`V<%d6QX!QAzAG&P$sppD^r?$DwDVMZs
z$(nk4z8*MOA#FHNNdpW~I4~tXSfPQMnUC6t0fj6^Ymbphju}^@Q37hY0K;2H5EG>d
z$qH#gl3<_;1H!<+j;^Q%;%QD2wh6N_iLmLd<6<Ih0yiyzmVug<SiOa41)(-9A(jD?
zKEskm79fQbPB^^<v<#`43A7bw>>zAKZF?cx8qP$tt?-r@VLLHfDadv%1ojq)^eI{{
z2ilCKy%OLJN{a~JqE@rWHb=1%Z8M_MCG1^Ljzg^{kS*n7C)!e^T7x)SQHxJxTXQ*x
twiTl=#b-CJG6rHdFeL+%@)1s=?M5$ga9PdD1~Qrp2=iDN7>vQL000hrDTe?6

diff --git a/datasets/common_voice/dummy/id/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/id/6.1.0/dummy_data.zip
deleted file mode 100644
index 9b9006d32c6164677bcfe41721a4e0900d7289ea..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4029
zcmeH~dpy(oAIHC<5t9;<R&puHZCjeAqJ>4XVoA*<x9woY#4ITX5oIn@xr9XJ(oZ+q
zp@hN-<uW>RiPKSzWzpocayi|8-&K#0oOb^C{q@@(`(7T8@8kXae%`P5`~7*%4ksxk
z3y40v;y}Wui!TQ`00xjk{QVEPkv!-gMtd9<0FpUc_U@uzNT?hjAtjIm0I3o1U^izF
z?U_Nm7zSRnpumt|1C$}c00~FJ4G>5J1VYT{=P!89ctssCqx+IR0~B)2=6Zc4aMdzE
z$9Ydec(Ir819-P!9st0_2hf8&d;<*W!HiFdO35*EpG-6RHc>4sbil80sFmL{rHZYP
zp7fw0SZ(fVihg&gywS2|ftJF<f?XD)lF4@?5_acJ@^>6nPL>!`@kx>z><KrL7O>I=
z-`(zyeu4=;FAuNB*0L?Li(I!*nChls8Y?)a?%PJFgjUb%W>WatwdAv?9j`ZQO}bl@
z67>8m`}xbWNPRebPUS#n=!h5fHc$1&5+5~-&Fh}tC`c$6x6yrG+Je?WYM*Xil<t&X
z9B_i0ZMxW!h3#>6RX*ZzFVSP2R#u??xi0rj)hs+WJxbuv6?8qundlsWs-Bd)86~x_
z;RZTwEG`Ufarn_4Wd`l(TRG`it=7u~0cR3cFWi-?E-I6$Ja2a{DjWy%JRt!9hT`X$
zL}vUj%Wd`nb*rF|C*BO%opy`i68j4uUqF`K-?yZwhu#rjjuae_?mp#zKkWmD?dbdV
zR%u25hpQuAEbRG}T|)0Z-qoAty)-7X$V#mVuhHJ^;HTq4b}{Hxb%XA{-f4@<Jfo9@
z0!Y@1*06zQv|{Q>n9iu~vjn9YTQVeW+p^!++A3a(L1Qznn3w^N2qebSo5+<%c1Ecl
zt<m4;sIzyiGRCM~Z>YC3`DQ99qqS4}Wu;u9V2G<%W>m@lxvq?#QeH-`d2{h{Ciequ
zau^;Ew>|OTyS>m1^=4<Q#HCc}0m)mGk@TuQ<3HoGo=Fd_=u;Q8ow-Brn@X=ujwfMv
zs;<BTOf4KNV`K5t%kZH3g8fb=eND2cjK5RLWEMWD7Y;qvmAQLDN|Nxghhf6Z(a5!Y
za9~LjHk-{C;Fg{>bfRDgVG3>IJ>e;TRq9h2Vf$P1m<em1{o+<l+l7YG%<Ipk{Dimx
zO^eIxbnqQ<zHqz9Z+}R5G>bBxhY2<eZdv_?1%ERT?$1RMDUOyqBwm$GtXb*VkQ1*)
zz4!-0D^9c8`XqI|>4H{SuNVl*9&)LA^?{UN!(*<r)$yDJcj8gs-im<1gXwD?&`VH$
z5|!6(uXjsFaIT#o7a|^8bJ$H5o$vGCV$6LB$9dSUgI@Bg26&!)sa#cO5-iV7X-vh+
z)O45G_`;MWD(rn*7<;2924J`rRi|V8=!y#b{=V!nK60Zwqb!f3Vh?2Xdnn@&p|}-t
z9&9u~pq%~a{VK91_I1{C=H_)RY}?eU^tNBM6)Gsp&zaEIj@gVQPF#Co*Kwxy((U_y
z_glq97^_E@Ycy>0hD|JMFq)E6G?k{yX(Ya!baF8LtCDjJrT;7WKUH#<dbv;!Y)ql3
zImK6!M)x5H{Yc70mQB1D25Q+wEF4`RtGk_7{nF#K+6#pUIPc}q@If`%u#Pq#trIfG
zmzElqo$Ioyi{=~-ZLMRJjE1?~N&YPdlic@~^T0&Tq_?Ab9@Cb*PvL1t!EjlNP|_#2
z>!NET#0x^v+*!X;*qA|BXSw(+(?nl+)m|T^7U_*htJeJXz`-8Z?#uDfCoQbBUhd;k
zI3>yFUGmH0C7nzPxw;C^w=|acH*6ZwM4h?1ILrokeQOJ0;+Ik0B%#`^l|0I0U+bt3
zy3@?@3lE262WJX6*Q@esUkq4WR%A=8zmi{bMJ3Rp3?)oEE}L=KmZ(>KiW0r~vJIzO
zKeHefy*_UfnYQ6SYYfgn3qw$rw=l3G#PY0qQyxEGr#)kp<MEM?t0?cyPdtfh6d3*X
z&1S^SCOr^M%N5|XM2RnoZ@}z%IY*cIr{I!c8qnCTX5FEA>ygi@)Ob+z)+TrUHdmt#
zt>E`Jam3hTB_$D~aZTLeAT!hA#8&$iirjpM5dUQJVWPslVa_W~zV{wZMc`xQRLxY1
zlN^4J1BY0f97%Qh>yOMd=ZB;yT(MekUXjJs4$ag^R3r`Bnbm05cqoX-R9v9dq>NE=
zjMsLS$DXQWKEi7Ix2@*1q{bkudUqJ#LpQqcj;S~h2M>jp_u`Zb;&wh^O9(g6u<v_(
zLfenXKP+}{Glm(ruN|v|6jUj&+lGbykdeBd+h1KI^Kl;?bM5Y*r2W0M;R?pM0<*QB
z#f%&}t!qJsvh-DD7TL<QZj0e-&X2xwyw7E;Ot!p|KfZt>t9=(sbq$n=r0y`hM=6TJ
zH}W5$vhHruR9U3n8=+Bk@I&l7yb$~6-~k#f!Cuw7ru!OkVMT6ENX713Biq&8*he0R
z4kjmDZ^{@Px2AFLOqo#8y!>KfKb<y+FX$IJeQ7hToJiAIo>yYtO3Asbv+5WAB0zu<
zUCuDl^FJHue-Tg{m<sqFV>L72KbYztlV)4$DUtwlK}^yogAKw&2K&3qnzDT}?e%9E
z*m%IjC@JtXEl+bU3xrKG+aPsLb*88@&E|9}73_F1sb8(<kD0R_*mPzS_@7Bk<~Q>N
zB!9<?+2KUBIJMfDW??!xbH3Q*DeLg#<%&9P=Ds!^N|F{EI&GH|k2-U3Pp3}Gh)ta}
t#{Y@^x#efin9ppmlEG4b06z5Z1pYd4=0eYhh(gPP-vB89K+B2l{tI8pr2hZ_

diff --git a/datasets/common_voice/dummy/it/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/it/6.1.0/dummy_data.zip
deleted file mode 100644
index e50498ce0c4945cd4eb1d40ec96a968e7c0f9610..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3995
zcmeH}dpOg5AICT5G>4Hz4xLcqVKy=<iEiW&wmGJ9n3?UaZN=z}QVXS#Qzd8RR4R((
zke=Hul-am!c_h*tZz|nO&y(o+t-4~ByWM|0|2^Ba-+sHUU7y$Q`+0v3Z;Xp11PT(q
zhRXKgKHYq|$b+;&ga``dPyiu_9%SO>DF>3w*2M-2KM@DzL14(FBnSj~30yFX2GJ7@
z5*%q9Obgo=5pIk|po~qCrbuIysWA#AX7v05yvQT!iy56v{|tcTS}*qaaNyLkpzeG;
z4(hqLR}%~ZEt3F&km3XAVL@anf*v0EDN)JU)`9;?xBfO!?Hmrd38?^L(+)sk`?Sxe
zJZTM;8C9@N9@^B^<fhu+w{do2MYV5M=S$3S8wojqat9WtQQ-5Bw6cHg(T%t7<4$Ft
znP?l?_TG3*XI!WC(bc#bx6%z}?YL@$3nkzZTWWPAd#A$$gOy;B0?W>0d2AUV>~=ZV
z8xysv5G<h?tJsLV&DG$&v#~RlvmyEMR4f>_XU$s}w<^`7l^(zX*V^=IF;)GbaaW~^
zEKOXjS&2r$Mg`!LN?Hf}xeoQo=ME>P=1*gzpAt+8Gw!PQ;Ga49*cx@XA1g|&$krRP
z%5Y`1t;3k(u^+vke3T97+C-suq9(Cw9KQK-Nc%1Aim={)8|u$~=zM$JYum=wU6odE
zrlV}8VDA-$B{Z*rVkLJR4rs6n&;TL62858vpDMs~r8ccnIK>mdN$9>r*(6AM<%EGw
za3(z7CKc&XQe<K_p1-Zwr(?^j)|JCeSB&GElqNkc*FQgD^m67+aiY8J+p-VNzB{h%
zii3gUW^w$4q?%gp^=jI+t3eok{#))$>-H9;l3mnz=0M9~2i4QX#Htkz>P3NBiYY@J
z-P3V}vAnht_VV1ljr*%44RL!P=7vgy499SE1n>e9gS3mCP2615DDg*C_eibWAtwr_
znY`z0J+)RPgL%F|)p1o%&gOx@9hM`89nvW0@%Ou@A?ZnG>ZCgXRz*fU&y^UQ8}AHz
zYrKC+eDtGD4F_pXomnA_bpD}e{6tpGEvuB1v&B~$rcq0+d#45`QnRK^24VBLcgn`&
z)~^{Kbp~?(8u+XiKlhPAd&$6{2qAn`Kv@4j=z)-ASE}A>1qU7<9&w`O1Dv5tu&^NY
zG}Cpx%&#MU9ma*9sd>W2qOra`|I9fEd&Cu3Bd%YNNzB|pEI5D-w?^EbVgHsoH!|nd
z>DBJ_%07o-H(c0rkA2lFqAzqtTFtrC(b{`wePS9ORJLY^)Wa%_TM{P$FJ09=<ml(E
zlzj168#OV_B>|J*Hx$S-x?m6pk>O4m?Y1x1*!o}q*Q$e--Ixonys0@h^DN-5tDv*_
zcW|K1NdLRj7WWVDL{lC;%lSChpcUHqxQlV|=_u6l_)Y0X-VArG;e}&*Og|U<VjJJ!
z3fU3{=we#sDUJ9NwY<2Pi6umO+ci&O+_B5mDcbtI>lGizkfXo`@oTHfVh4x=`qfNj
zN|XCbCZ$Jn%S>3okR^;9>yV??%O2ub^kqJ!=h=^1kqo+CTNafHc!nLEhLSm%dEJC_
zYoTr?;_F5a38#ORY>~0`f64wgk}bbh8#@gwo)<u?{f$*aqmx3yex_MF)5Wa`scgp^
zh7*}dAKkq7?%02-xUY}p>pX~jq0o7mQM(VNa7k)guD2KN&b<|gQ`P9DOE%q3xhrVY
zs<50M^V14|bTcQZK}6J_4oB5uo@C-nWaZzL9h#haz=y9ZCDi*|^Eg*<!fx%JE1L~y
zC<P^H2bMNR37!fz8-^s@Mxjy5*exV!dLeQnCI1fT!q!zXa=*pFkzJ$bgL7pc!1x|<
z&6?f?jrYpmhEi>_oiphI76WN@qju>H41vKdwL&L_QrQDeZ^p2sJUbuqV6@KNIf44~
z>(nLXBRAY@)J|?{p8?Bf`}*JXoEVOb&A9K_FWM6Cf>c(Lf!yi>YXmKROOUDG53@x&
zgTiq+r@dN1dcZs!x;}HluWqG!i{Sy1MlW3#YhQB7sXV2YkIy4kJeFu;p5=E0(&{>n
z=<aVkn!tnB>7HS}m<!(?C8v3RI(pl|n+IRJ9fjKLO3R0xWc|?{ymLwshvS0AFMF+%
zgJZX+s%&lN^mw}JWf;d+nwL3pn{lc!Q*J8C62r?HlW!r^SQ^G21{-VPrt5xbwjbL3
z-mYwIUcedGKniD6`KX#@6=l~Xntvqo!1AnV)TQ1?-ziBxe?K~@sXicxb<5&In-R0Y
z(>?v|FaC8Kf=(Pwv@Y~oGF{&awa7cE@@EgLhihj@%Luq_@C;M?tH9LL1MjKnxYS=O
z8R2b4tjL`mcUTjNP(ISJVX2QAw5;1ubNikp>Bqn~)L{_`NfA(k-y!F&#}gGBkFRMf
zCziSeOH&MjkRKoFU)eJCs{c8m=bq77OFh9HWj2dYGhoSD&P}r_r)5V+^*`HkaZr9*
zT`25lpDLVn3w^4&_`-gXl}r;gYC>(lbF^TI6_Se?<7Hp?%AiSG;jM^Gmilay|3*MP
z(0ssITYJ%fe_@$_PWs+3&yyx?3&bRSvd;iaXrF(0ta;-mYMwvCfCdB=9)|#53-Yw+
zu>jZtD-BQ=RcD?mY8Mw$<ABZ=lls*>{+#)}O<Kr=K*VN#GiLzu4-6B=MOcgTrxjiF
z3(0j-Vw30B{?Csq>^RX?wGc{^5gWQ-afwG29pDS89kOCmzZ>U&#a?X2d}afc43u&)
c@Fy{g>f4F47+OkN7#a$^sX+Cp%L{+~7aMJq2><{9

diff --git a/datasets/common_voice/dummy/ja/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ja/6.1.0/dummy_data.zip
deleted file mode 100644
index 95256af4cd44dff9a27e2c6bd35c4e3dfb5294a4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4274
zcmeH~c|6p67sqE9j3|Sl3D*{3!pykVEE&s<iL4>ZSjR-nC}S*<TwP0fEZJoXkw=r8
z<#8=hp>$=-60#JDJ0)48DLpgQD^zZC|9oDrXXZ7(nfd3uf8X;t=lnj${4gtM2Y~UK
zD)Y1a_~Y}13m^jU2=Mg{cJ^>3xgw5Qa{^d%_gc6xJ^?{o03c|V6#xLeqF>Cn4I;HQ
zh&x%qop{PGz+XXC5v8DnR6;7CloU`XCZiV=x^1~d878B9f4+U5e4XuXA4)$p7|>tf
zW_kTu#k4;V065A503ewMkWRUJ5fn-O<d2C;%GGgMOVjx}Q5}in@eg&OcgycZqjJc_
zKQAhK9PmF6!Q8*|r-GFmG$wX6L3W?+h=+>Q{+PQ?%B#p;!C<bEmi92fT;tp`O^s<<
zsSWG!O?&;q_?OyxPWXHZz=`ag&?`zxIzpQbw;TZWU#2`|F}~u+S8XZn@Z>+#*h(Hr
zgomz&QB?`d)?`g;0un1O2-z=U`Et^3Rpg$MHE>tZRgJ|`L<&MS&3XdH&vGjw%0yPg
z>sM_AnC*&I%<ntE1Gh+RZi#y4rW&!b{)a#|;_7I)>BXv$n#=hEDJt^no{rto#xlBV
z>&G-k1Z;_v+WT*k7w)%YRsWRH`woxFoo*TvE8nHsKGKqv+L3F&*po$672dsQs*#;`
z=@@GCSY=ft_ZqkmJ03t#S2S(~s7Svx1A-HBfFjOw7g`J8z7_tphG#V4>4B|e8h&Lc
z3sItW@O*h$C04AaezIPFomSt0&4o)H9m}#E2(5Tt_YMiiY6Zd?XvlYLjN04~*X%<>
zg&<vAAYEKV=Ee2Ek$)&{LOX^P4KaK)b<e1&xP*2hD)%}yBQcZHv(I;!BPOBR2oeYf
z7YM5X!;4X<a95eS00gvE+mDO7I-fY)C?sj(*u>Gx-52s}q;bIDvNGGt_~G#xb`Yy~
z(#q}Nyi3Se4Z>E9Ldtsr9lNcexnAv=P|-|%0VDqtB58jP+?HFqor;mIVmp@di#sJQ
zk<Gg=YEPk^F&5OR@4IuqMQt25zmG4|Zy$$^9mPZ(+ZSi|Q_PwrFHZ&gxQb~_aNta-
zP%Xx<lw$2ulWAm^rxn)dRJ>Y|i%2V1lfx>EcUe7HOmw@O05z-8oTe#jvIDt83dlP;
z?(+SzYn@WCYLSg-6x==J^FAnTVv$Nnn^+$BYXxzcxRHJ1TsKRd)LNGediIg@>?<>$
zeX^^M7kvidJichaXsz$G7p~{AcM1vuc=BS&RcgAqS3WuOejU#2kFqhsdWeKQFI1o$
za~Te=Hs?LRyV2z3Q`Cc|nN*3yOey{aY4yDMQ<|AccFik|gG=Yq+a{idEF7<%`d5Dc
z<kUE%SiZPH>Wnzx#vF7};7<1Geo{{_s~0yv2j~Kpk9yYen$6Hfl!UYr%B$TYG{Oi3
zhbryTHXtdU1|M)BUR;Lq8@ii#9tNWgS%Hr6d{&()InND5DwSbFEI3NCKm1TcS*x20
zHZV#{<6NwbO|i?=&DP<V$2>%$BDN}|hg{u{cNZg5a>qk;YS%+XqDm-@BI!|LzUn8M
z-;haTQMQyuf`0h{!m?ADm_yjZsA*p{utL$yETknOE`oD#pGRv&kX%?_TMTRmwawy2
z%`Mb@ou~=FILZ<{zo#`?vM`BLy?)(1wgJCKSxDFzNP|Q!${I&f=k7rzq;+w=7M-W9
z<Uh;}yg&Qo;aFH;a7b7{RY~Zc*uu%8ej?=zb!7Zt7h!y9t*pNH$jfeLO5=y=$4Usb
zg4b^ErnHxNHmZOzU^iC*g83>S;rvNo3SgW0`u_s>Unl@RhtTH|^f9pWpFLrTq~o|#
zKdOL^uBIc+NC^AuAuaBOR<hec(`N^#FaFyYE;MtVAEe(35YRI*fwEh-#GDJt6h2!c
zo_gTt%2Aooz`DG=qk|-zz~Bh^kPnL8N2?+^ICz-+hs$LrRa(!HMAP3ngC1OaGr8|l
z^;oB$k0});3UiH%fP|fspD~%&%aF}B_zf-C7MZR{g*g$FU&~T@F?L*;ib`d`I8d}+
zEz8cqt9pHU^X6B@Lw5l|94~wbMd^_4SXmN?v*O8}U`2nIM#>BF8^Y9$?BP=l#}~49
zM=v`fF()vO%V&kJ*kZa;RQui>Ot($xl+(7TE2+!{PF{0}&+rC4(iFL5zv7I`9B#iZ
zH~P>I-ORP~MUJOk&ugP*j^(+&@lc{oijLf<MaZiD8rRldoT!h$VChnr(4|miz8Ad+
z-_GQ1T9R&gFwID4bopen(ZeF&1?9crc${x^SxCJ#Z!4z~qS%D@WSar(l^(BQSjJ#Q
z<8P%wf3bokvc)tsvgeX-8sH6OEb_Z9>_oms<1Hg+&){VcOMt7PYBYsl+sNykDPt^-
zm~qaK+GpCc;Q>}+k6@m061XCCL;)J?{(xK1Alg33PG33}*$3lAozItW?1uQqvj}ly
zOSXXdgrX$}xKHxK^-qG2iooUf7D1w7xI|HsoR;=3k)>!*63k6PA{LSuob`ORF19bn
zbZnmAOBd!~MuYVwpSHEfi&5Llp05n2PQIP$9^%255RRAeJ2<Cwiq<c*-Sbo}?<8dB
z*j7fDKeELi>R8G@u>79?LtR#$ZJyb3ki|{J`|SRXwjowEc_0WVOTKxbg@$oZb4iX2
zhwnSrlaS#_#404@I4SqmygfN_>;;~O+U0cNvC!~`+OV)u%Yr7kCLlx2PxhTLSvT8v
z73S6atfd!mtAAw(`i;>CvWT*7w=cjx^C5s)G~?%%?Sbvn_V5n^N&z+kzDFBw4fr>k
z#E(he?hzZLS#%+jq>oz#9md!yzI&_<+qre8_ynVO9iS)zME}~9r)`f#hi%$0=+tf1
z*`RJUyPK&`>Fv%W^^5iWG4orCx0&e-Vw(BYaG{gGWA@AsGiq_;v|DY(W-^YQY4V22
z`0;TW9k=ySwi)UPW*WNb!NweQ>(t&%o#$kl`ppdgJN9-n<`bJ<$@Efg;$oWp^~Bi@
U%|=%a0N6o)6X?~W$j$ioU#l<<mH+?%

diff --git a/datasets/common_voice/dummy/ka/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ka/6.1.0/dummy_data.zip
deleted file mode 100644
index cd88f904ac49981f210fbd9668ef7f541e106ad4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4434
zcmeH~c{r4N8^<4G$kriCWhrDQV~ZHFgi|OnLy;C5LK<eq7Lg&w9+Hur?2#o#worsP
zIQC;bMTm(|mZoq<b%n}l&Y$mfy))N5^Za#xe!u(v-kX6QHJAZd|BU6JV4q*UJ{SQW
zz|t9sbh~J2;bb9YY(x)GCkhyvum7B{F#=TJX=(rf&yXJm+ZqwvY6RgTiEzN6oE;?<
zATpA&(z4Q$GP05~G89_BmwIW-5k)ApZuj;0%*m?T9iN?iYdWC&-X++x$bvX#DgY1$
z0f02+04I!vtv$ra(dDzFZYQdmf55AL@2Hk=J^fmBCS310JIedYh3A&~Y)bnxwUD=i
zXx*`8>W2e@{J|0|BQQs{_}RY-t>xE520A>{kjncr%7Y7PCs`LkDlD}Nc~zVu!<szI
zwXSvYwXyz<XRuli=?5ZE%^1QAO&i0>a+np-u<(qwF}|PcVb!cgDKY^bN=V~~HE(?P
z=}?dSULJIc9;`hga1i?bYB-40;cuXQkh*Zrvg{#9x2P6J5D9sBg)84^Oq}}A;)<fz
zBZ0ibCfvv<21qU`!7pAF108pY>l|R6addB*oKeY)_T$As=_}WmtQ8@(ovizXa%-?Y
zlYHd~MjdUW0*&ST89$PX*KR+lvQF5OgCd@lt4+G4%X%|x>$=%+{7bZOB$n*>ePl&J
zD0jT2mCG;PZm5A7kY-XT%Aeg48=(bDyD*xf-q>FEP~}#vDwelEL?^BbU4JW1U)W#2
zx-N;GHkHS*l?GH0OhdrB1qWO7yB9jumX)S~_FzW_Fz(&V*Tg*5Jm>J`oI#url{{Z+
zzKpv|FUF6<i*7YZ37JSe5W9a5B>MOuV`j+jrb@x0gwyHGSHd_Na}9^NvM_M->6C&V
zGaxidA=D=WLS&F>yR*oeA{OElLtvGWb3QE=MmPMbKJ{tGSaQ5$vXwyc+w#@-gS%1g
zFEh0BiWSnKL_Dl+Rt|TP$yMgl$hG5W)>;MJsL9_A1(#`^7kyW5Su`vhIo7l!Q_14|
z)Ti$8zHWPywZ;&(crfqXV}FHpt?a24ApSWRJ7vAQPmb8ATtZ)}pF|y*z2w?8Tt!Vp
z(-T%_S$$l6kDyB~@AA2*1S>nj=GRz<8qUMv?fcDL%acn{Bb|w-suSMEu9?dXPe4qI
z5&jOsgEuapVUBE{Z+ZP38B2=&W3&ej9o>QZv|}UAI*FHHJF`&jvE(@8$q9Cd@;JL#
zT(u=vA}h;p(OyrmpCp+T%+8*hpJnx=egtC|2sdGyOGRtVk>9JUFiBO4mB=(Ep}G&<
zH(EG7nC9!8mZc3EU)br9By7K<zlF$Oc%=2rn-Zns;JTSb;z!#IGknHUMNuI4CGLKq
za*Mr)9LoW*5j@uwIfW%3z94lUX{eOCWvvEHcG?`SIb1O#-Yix$>vZyS?muBR*@0l%
z2AykCPj1T86X;mfkIFuk7$8AmRL8ZXAEX$())AXY?cEiMiip3A=4zN6tI$kakk3m8
zkiNz_Xy#lzGeM<mJ{2!TQ0bX}kaQ{<**jl`@3b4?A@wEfCJlvfPz#%y^)^C^`DOKK
z{bcMW4ERjB_nAl?VCFP(QE^M(7p6O(Y}`ph@H6)e=o_cADHV=RzvufThtIbWsyuSe
z4#F2Pj^$@S$XgtLZEzpk<h@v5u@<P?oB&4&_cwavju|v(o=sHtn{Z8dn^FX=6e2yA
zz)S^b-%%S<%nUnUgxIm-?gv>My{%eIw9AJsoMw(*X-7B|)+FIayykK;M6KU-qvV*8
zRrS%PQ>0az4IQi(+RZwUqFe_~R*p{JB(P1L{l5hMHzY8s`z*<vTnYyNTd#F+va!Pa
zN&_tsUiwd^*&EjKGG73=t1E?hnWp879Q0QOeB7}Y02($e>10El6P0j>OTG4jgEcGd
ziyy1<ujeOaW+1ZmJTpm+IX9SGw#Pt1-L+_>%Cv@Oay(_`76xnsCg98jqvm65`bCv!
zyzGqJ+@dTPjL_nx7BM>XV>%2tYoV5%Y#BzkW&V(fW(w{yh^5|p^Kl0ZC0+>8V8kQD
z9fP#9Irz=h0f-xeRig9>wg?Gaq%RClujvsl=!;6=!g0b*8p|Uq5H!!UcHgDbP}IS7
zhwxcnOc+x9IQFK$@0hh({1B<JrZ2rHsZ~|^IuE#k7+qjrI~A%~+TUNz-!;j1So6+3
z@CSzj?bUu*Wx7)-Ye~zC*bx+c9sKNXq+^hqyQW)b2UKV0%7}8|2}PN7@7U+VJVTx9
zT$s8Kl;8&mVUHI=5UbZwE=T%Gdc^JHs-lP7m~xU=c&lzc3UjQuTgSUKnFXr(Z3*N^
zj*}y)KzTCT+W%PAx5;Veh&D{8G)sNk6x>(YGA$}sEmKwZ_X`g+GtluVVxB-X){J4N
zpRBhxCuCONovjDsks957LpiA`%By}F=i=Wj9O$W1$<cez#$V8WF=uJWJ5PJc-gQDD
zPbb$l`gKA;z#H~~-$wDvJ>^zz7U_01AI`$N)GZsc0lsuXXdr)9_#07pDvU|Y$XS*)
zN}vPS1xmkDIajaS42i6)*A=q~$jvE!#gm`a6QmM??l+S}bYBA^i**HQT#y5yZ9{wy
z@<tfg!TBNQORh^|)`XTa2i0D!I%bCj5B@#28dEW-Hnv(SIu!mftZc1?B+q;!>Ioc;
zd6&9ejX8g(GDT~+EdzmO8hr+j+1We>6QvqKHTx=@DBKqqm?H`i_iNG{C>4GFyvZ!*
zP_j!DV{TmiUQ@eJU0H7RW21(4@CNg6aVtaNo8bX3p*{ipP~y>9v3@N0H+i>~qvEoK
z!ei3HB}b>zEL&%J&z(>*nh6O`TJr=sU!Ln&+Rs=xs`b8#g^L&m3T^PxZ5q;Ae#;{p
zt5)!0bfWSNn{wN1;zU+w@rRUymIIHJ%<p==Dju7g@77COqgofqm-S-3;%{Cr<S7@)
zSBcm<Y@M&yb@Ia^LIv7Sz1=>u_bZ<TnATW--Ll=#eA#aPgF!)H!{CaB_Ev-cvg!Pq
z^y8khL7LV`rI7S_>mkF|x1OI~Yh(1@y7PR2k&_un^9GauHu-7WYms4_HXt%}+vsdi
zw+{T9sh;F2LLv3r`2TC>kHhC?rWKfC=65xVO#aCjYM{5S#f{r;9d|a9k+c+(H-?{I
zUw1v@wl>(!P!t`-&`s@@GV0c5u$c-a=O%@L{h>Ad8+*GN^My@TGFi$OnJ8v|UvaiW
U(~u(v01V`>Jy|`iWB>sC19W+e$N&HU

diff --git a/datasets/common_voice/dummy/kab/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/kab/6.1.0/dummy_data.zip
deleted file mode 100644
index 3a8d0bf1c94d47868ef3df0458ed5aad5b3302d4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3902
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_29c&`CniB`
z;9p>e%l#riZ*wqQU6d5mE!8@gfsujX0}vaMU_(h!VrHIRNpTrCs3y&`OZYj>4n3&O
zPPFwq;=tpUUEEdB;QD#?nGl(VX?C9~<`~@a5mJo|d&D=VVb-KNOQz!|d}p|B`BL)l
z$$QT_Bi_#AYkdXN?#xTjZFEnvIQ#bVmXBP8R{i^xzwy}iPZW1pbg3m#NoGett=`AB
z&?&uY?%T?)O*nciX+q<QutkA$yWO<K7jIA$h|XPe*v~^r*5#o~W_6_4y2~f}dYrB`
z>I8?FE_Bzoc;Oag5VY`H%34!DF8?zz-b)xvp4!Bo_dW7f(0TQekE^~t(mO9DXYpJs
zOgimfRrilgp*!m99#nC2vE)2oG@&bPpO)johyS@w$}c?r_fwtL{2Th~*<<)OdWBkj
z+O*`ieeXU-P*Onz^`gpkeYb%De+d}W`XmH(N@^LwfbRG9JM19fa{X+qP~|`4?+N)|
zzZ{RKc+()eXilqep|)C+Zgtt$C0Z-xbT_X#Zebkrp`vkb^BR?cy*=``6|Fy%Mb|6)
zF-=~h^S;4lFXw5u9Uk$%Yaa(+dGjZMInpn#d1YU1;Ta(v*2h=f<|j`{aXLDw>Bf`E
z#X+%46lV%CNljKx-nzoA^{j)$Uil+zXH{L@r|uO_w@AL?(t5K)D{#NCsAxuZZuOnH
zp2x!1EZ%zZ$E43YV`XgWYft^vd(C-UQirFYV~zESE$u8{?Eb{Jep+0x`_=3%k4ye!
ze&9ufq367$pp4rxlkI`wcOQsNNeIKT#GFiE&Q48%ha@PuVo9>O6MggU81S@SKc!jI
z<RGLO7*&4Tn&-#!m4?aH6K>39{%W!}bjp&`MXpW#EcSML_4mn6KYFOSKmK8@$i9{B
zp9_EPmH#J~Q=j)gXkXQiUupluB1$z5*M{%=J!99p#WkHXlg>uoirM1kyLH7R*S>ex
zHY`gsUUtG)bC&nf8xkzZZ(<pWiZ0Kav{QWJwCLO})vrN<i*{%zez7x7UY)weyNz?!
zlXD@Bdyh&qzk9u_e%|gC@p(Ga=Wwo^{%x0?!k;IZpC8}2G417puUoc%uHdyiVvw}r
zu(wS2Gs%_HdQQ3u^hm79PGN0Nx~g4#zN0k3!_cv*XWsKwzSYj^FFKg!F0N8eT^%Ic
z%g3NHSu#j_$>$1JA;pC;I-v)%T5K-=XVeP%$@HV}Uf#d&=Tk1voxjI7huP!JpEiqm
zhnrJ>Y^kgI#QOy;#Usa-0SU2Hl3H8>k0=~v%V<PVF{0cgRDu<OSxS?Mv|1p)BqOzm
zP)wb@V3>E<fQR)$q-`tT1(s>4H!saGE-~}yDr)s8ORz4z^*ZGRXH}b$_xX^eFK+FX
zws#P!St3#y^ZUh7&THu#ryNk;XfglS^vtKOKQg83y(J=eY^vt53QYBzsMd5OHS2_`
zod!qO6y9~L!IqC@Ot%{COIgeO%gjO{!!guq1)Hw{%Y*FJZJ9cuVmTQhcNe;@&k6tO
zcU^YDvA$hhlB=VvbF=zA&IK_?WeU$Iy{yFglp*lKl+Pvh=0e}y59|wzy{(n}?dOuC
zjcE>?GlQE{7T4T8x}N`Ejl0~_x2GKE-LH^%P<EGFRcmA3QIT%z`@M_S2>%bi@sDGE
z+D@6p-!~ehPgv;jcyY`VHqDPax}yWv@1DdJ^z0L(ozQdZx@pf(vwph&DIZb9fofz;
zjfuQgz_@q>#O5Sqn#?@RLJmjXDV-SG7j4MVI^TBn3AQN74ZV@REr#b@cG_QjlD2By
zQtemkUiJJ6St-QdaL)0$@V1y*>vPxT-c{{sKQ2>a%zys(&tm&Og1`PH|Gxd|=;5Q=
z-^n|kU1p@S`{kp{rIF8nEc2{BzAVmzwcTrT1ouQ!6%qNija=!A&Yci#3sZmoigiPx
zvEbR(Qw<Soy<`mTWWN^mx^i&i29vTm-H)s~R9(+q6P(<<Y|)gax372Ye-wB2ym@YV
zg4XAlpZo4<+LZiVx%~eAn#e8e93sd5{jpdloLigN@{%L}&{@W!XL@nAhUuMiY`j?~
zwDH|do3+Siart_qIH{H?ZNgKsj20iYo4I^VfcM436IZ5BnlY(+>#K=UMM2*qZxnYm
zS-L+Ey7-ToF<`%fnLVHF*UQhg-_ESI`x4D>_`qJ{_eZ<^UygoeM>PIGQH9)sG$SFZ
zpb?dskJ^F)#TG_mlaWb|8CRQC0_p$(hPRF&CQ1vI71Dww!9W!Tgn@q@T~Q6h)0!o0
z6J}c$Vbfd3#YEZ!Y1sm81GQ|i`U}z6LT%SVYy)N}h9!-xKnf|KaQX{m8&WG5Xf4k8
zL0F60#zVHY09dpVYc0IRN7z!#MiH{5TY&u)Vl720J%CnYX&wc5gVH0yzo<1lveikf
z#9NK1?+JStl<!chD`Z=_*@?Fmsq!MuTGWym+1lwG#9NC|PUEv2lJhak8i?h<^bAbU
dPC!K@Ml!tQ!DTlq8^~-fAS`2HVAuq91^~IsW90w<

diff --git a/datasets/common_voice/dummy/ky/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ky/6.1.0/dummy_data.zip
deleted file mode 100644
index 8687be4bb452108fa67ace3def1f72ce0fd0bef6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4390
zcmeH~c|2768^@3A1{ZN7YnDV|#+s6hB74?Kqii!Yc4q8lE0km>%ZRcwwnDZdV#Zo%
zDzZ$4WQ&&D;>tZ!y+Y+S_rKpCzd5ft=lu1&&-eRzp3nDrjP+^hSOCgzqQt}W%geVL
z1P}m_-mb3x)<|2dt(=89GeCPq%*2NB_x6JTG<0*c06;en-i$X4BECL|y^oB&yO)PI
zMn+i{CL<4(hswa@WneHWqhHIhTla~QR7Q99{|fjcTYaO?=LT2J1dy)Tnd*959G0d5
z0D(;a0Hq#)^|D2~$zm}+Up6Y?in`6m6!jlBs^`AGQ7e>#fn2SsWn|oDG>bIa{1GN=
zl978Yw0N%&M7mOAIIQf{77Ii@nML^)M`Dx&qs>!-oPYagmjb1lGZuT52bPsG%$gsH
zHe_1(o`}Uv)@v;_knEB6E`p;_SD8%YK<~7aDXI3VAHB52(C*k#em43;b9ESoXdVAF
zxLavQXCs+>-||u#d_iw(>lt({Pu>oR9rH;K;&z@OY98o-WP}n0iKb55%qmWb>J^)a
z4zL%VmFHuTGVO4Aq&TuLJY=V2d8D;8?f}fmfp2e9<QXpIMRk{i1wQL%_K(lmpU&qM
zzP-N%hU=ZF-BaG0(jRn}?m6C?1HZkI;E;X2BVidKX@z?#qaJR?#qOzfSMVv*vfi6*
z0+AIE#8zu&nYVnj3d<QEX(=hSs>vv+;*Acd$x%=fS?V=GqI`a<NE}5GZY_cXk0nqn
z5Il`kiqD!$Dl|0h(;!)zFbbL*4AP5Qbd_rihV?}GMX*}}o0|g14!z#zU|N^IQ(8a2
zzClDmu?!YtWq5I>2BTp2#2=FoFis>YJv_nX%IcP&>-_faQ&ORJpmy~Dt4K26vC<Gv
zr`HB&6Y%3?4XNbN?p<v(s3SVPsyv#WZ8AD+QsL&6yAm~ZBqNR~K02QACzgFv1nQ-i
zZLQ!QT^d%yJmu!94!>zXhg7lr(i!EToAPD6a#7~Yy_gP$cS)OrUo2I;U8-R2s$kjK
z^#0b0&TXNVJ#e0pz-L(%F3+R(XMa+Q%{eL{v<$<yN-rXk+@H1!CuUu89Lov_ihM-~
zfokfi91`M@FA)=#KN%MciCk%^BLIObVH!|FVcl>;F1N}hUQU6Qt!K4eIJYVLaGqq%
z_{{Sc|EgZPSL{r0EV{9!fO_o*^-`o>FCSYMG-x&`<agC5HS~)*LYe9tS;080GRzt`
zi>&M~;Hw^)>p1WKM%#9iol5g|u1--$Vw6@c+%}hv$+4m}-0scwixMRzL$W0eL+D%A
zdfX~CYr;nueMXsFxhfAoX=|&>!9)y9ObVZqsFZMxQ<v8%8bQuT-APY(QdJIU+3Y`;
z$c#*)&Dn%ZyF|W#z>bG;A0OHQH)9Vw2k3BxZ+E57qZ?M*$9Prgo{Wy(!9tD$P;xFe
zlXh`xbxa?lMwtur9Im80=j|T)lzdNIKwZhfi#JNbJ=z&jvoC8c@Tbd{0?cB9Z?jEQ
z3UtN>s3b*t6=rprsbW9em>&Z#e*Db|{~dC33-tnB7*!vyT}?|u={2ZMvtWF|bdOY`
zK}Ah(+2aeb5zg03t<WM4{AcfVikvDCJYHlNL$~$pU&D-INes+VHbm3iQ`d%v(er#9
zy1V7#qvO@8FySVcn9cxBxNcM*Rx*Q!DCV1P*BP?XK(^peCJu&|mlmDz$NS!(k$YD&
zFc|y(gL9$vWQuU7Gi^C07&j;TB<|>(SoE2xSGPXXujT;7N7v?n9Q8SXMPabt<=?<Y
z|1balzWh!dronZf1)TqH$EZ8@6w2#&`n&e)x^0DX1NXf@a^nD!8jgXq06EUEbujbT
zXy`4hzyzx*4Y56xW9;g*_D2#jP3z`o^Wx8k1uqNYBP}lz+y8)g#WLf?;^Z<09|eU3
zCVYx~*L440Xt!-9IpzbeXTq+fsmmb+lRFLEu{AL$h^$m{p7|ceRw=VI*cH59{A=RD
z2bz7|T2YpPgw*Sr3Y`ZgA7WnRdRUsgG>F-zSB8q&WqCOC0xq+!`@A^qsfE799TL2a
zvbH_jxt2)9?paT?Qb*K#Q<HIfOY=)EiA*>SUXHCVm7B0)M_K#6L&-BdVfGna{5UY`
zjTl9|P+|Yv(tlf>B<T`lS0n&adwTb(Ko>$aSzpzNSt6SC;=J3GF`9JnlWnNp&Dik>
zv~&`r<(QK_LkG7?wzrM0-(jg_j;d$FOjphFla|KoAcNVg?~n7P;VRm_1RFW;gtAHV
z-aGaNsXmj`7){2vDDb87D4L-8K6$uCpIu4co!?CwKU8Nhz0=J<oqTevD=)~I*Kv3m
zBku3aEnKQt)WGQ&Ogc5v?pyyllYkZoC~Ziwp2nQHEqW5VPqABSkBPh*xv4)_KKh8*
zfc1KZco5xx!U9YYD=<Zrsdosp+t26321d~nXGrO{v>#1!s{+NdkBx8x2V?f3bCeB9
z4932BBO;Zyci3zOc(khS3lAw64Dow#&9bD-#^?2&{wsR?y+L_MQ0AX0tk{ulVq3D^
zL#){PXNn^@D@lec1teDy*C_-qmuqx}_^r+<q1sZPm(Rit&e>ha3zM+io>!IN7~d`9
zD9XY{QrnV#s9jSBPB_w@Hqo5+kkelANIX|1Tb-UcZzWDzs}*`Q9f4M8Qxs2;e4r;*
zL{#jNV9F%anKWzCTZM?^OLuh!--O2b5Tq{FMc182OoVJc-E4@7Z2-nRNd_L50<E60
zjClx>TdE;{zpYOD@FC##W5!zVz5FWm&6$B2HB1uQiuSfu$l?<F-z3hb{*CxBz9r~|
zr33qH2*Q^guJBHS-F;>npLT-5I*~x0MZU~b%i=G0n#kPkOpvrv@r7do)1MO-+aPK^
zD~hk#C?b9x87MZtc4SbZUc_$_qTSc`%oH7e8ZKxy3DIseqws#?a{zO2%Ims`h5qZr
z@*e_<1FHd-;ackh{>$|8d(zLd%PMIOo<k++%OnHBD3i=Dm$lk6ub*SS!oXq!azp9B
z$C`cGa9JR1%~S(YH&kbpy1r9iOHBo<oJ#8VzWw*ipS#VqOdUF^nLl_6Nd86NX{=Al
z#nsiW?{U_W4H&2<uXa4YUoNHM)}O3vq54czL)V<M)KS-)^;)VMSesM|`;&G58+#*<
j`N{?}8BApbR;t-QI?hIDdQdq4U;!^TFng|n008(0T+C1w

diff --git a/datasets/common_voice/dummy/lg/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/lg/6.1.0/dummy_data.zip
deleted file mode 100644
index 04f67d1a8c9b580315b980f495687098a4a02913..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4074
zcmeH~eK?c*AII-7VRNj_ikRn<RBV)1EwU&im6bZlvm8CJ)!G!LnhePgc}il~BqdK-
zagwLV!;uy>j#g6g81fWHk>9=Q${abZe|~@cwrlt9UDthmUibI?{(e69dpoSeC1e1>
zt3Bfg@$<>o0~AmO_S3w)e4O^XP+c&#yQBeerlyUv;6pnB1?EZgi35N{KX}+NXA!NL
zMclk~-N=+9v}3xf_4IWOum)INeFI&6eG#j(*se2fQCr07o*rKSa>>SX-M&1yYAJxn
zawR^xkm9&-9sn#C0|2b(04l}hAW4sU%=@!YkxXOf@mS-33RQP;-=PXDY*Pu(_5{SV
z_wPT>lhmv4M8*%g#c9b#@tRyd@TX)X_E;>UrbZxnURxHZ6!?`rgWjM!pV|7-i;Z%{
z_heTvoKqHeODk+l3V1H|dr#KYk)iUx>GPGN>+cUIGola-DslL7RWdemsB9VI4f|JJ
z^B_WHFhN?U*EvxM*3%qen>Xe-pvHKe*IUou20xyDuVcwwX8BFG;KdMgd_y`<DkY{N
zi?%aa2NR_ri+eP*&nI-i2a$n#mx-=^=2vpf9|mQ5xQ@Nh<fr%29?-be6HH{S`SBGe
zWLr#10sbCe()A?QGz#6+vGojbA3F$Vc(DqJAJ538T}9@$p?_QCyR+o3SrW54>*Y)5
zi3`K?1>0v@f2Q5?JOrq}8mPaX==$$>_x_>!5xFFBb-3B#Ayl>Xx(n^Ct;On_j9+m|
zDFoxvTk6^$-<q;G-B0Qc98bO6Ic9b>QzvKn9XW5+z!2@m?6)3XRF4#u_C2pRd$JK-
zm5)CT){OQXy>{{0P!5Km&XS8-Sy|6IXTm|4nE-{YVj6ZKc83b3R+js>z|bSGSinQ}
zv<6Y7tmnbm&7t}?q#~7V<X1POUQJEsNQjX&wrDeK92fKeG8)5%ogW-<K6YbXF;Nix
z=JAAVB)+KdQ$_uy!nWHT#%b%3WNIt(ME_M<SeD0%@CwWem*$8ni+i%@`xY)4ZKbMI
zWBnS5!&M52l5rN}O5%LnC|NO1Rg!#(YPJ0h8%KU0+0Wylxo5cJV$;f7GF^$O_iQn4
zpz<hC`BkDT@9pAw5F8Qi`@iWg*!kbt{a)dgBqe?Lrlt{{#Vz|6<*Zp%6Y~oBMrxyV
zFNb%|`MPG;sGeeto;?q9Z~N0W_r^Z&_y-OHqkIS5uF9UU+HUUYCmB(b#gpixL|pfh
z58V;X7WuuQ7wit(=@y%8vA#{|ZdeA5e&imV>-%1P45pxCxkS@IGSnK;8LTOvCKXMD
z46q`kIjH4(n>$_o9&0>5dq}J7k#tP_<kOR(8`-QdXGQe8w|6&i6ZdYfrOM=*yuUtS
zkEt&r4-b^`(TkMR{CICylA?ZjVj2{)!z`%JS9#T$*Evb=4&Y5`_zeSfkj}&+cM8U2
zg^Cu|<y!8p?y~%w#_g=z=i=RKj|MnyH21omhlwjlNknKa^nYr7&R6kd%DO<uym6NU
zjsb=?qee#`J9{2HUyy_1Y!0R`EuUKD#;_|fbUL%(ms&J;*rdupPGgOcQEXMXpHWs-
zRZ!h<)Mak)apgOGxg|yMY7@u_AwvbDOxRG2=!R0=k5RvQImaaXznA}GFaJ83sSSY5
zDfe@aB{KDZJLN}a)`eS<D%4<sO(XJa%a+1bUP-*DLut4vOESz?Nu%G7{_54_&=l)F
zHAJpaO|xjXXL;Zg<h-c_$oboT<LTPyhfRCN(duyMuJuNWYTnvbD0Q0>nLlcAZJWce
z%h-&zbVdv!fI}RPIk~IVa&1D480GX5U2W*nqWQZvhWI38qGh{yk8keoOlV!oc%9K(
z-?ST^pPbN9$#-nKVdN|8U%tLRl~<9|VV`|mvmx@n#**^?@H%40&cBd5f_-`q>DSo8
zUl?wcFCk3|9b2zD=u;VXmsni-n@n#BQSqbo`<5-}A?Rv_`_L`P8M6EEt)8aR*yzkO
z)rWC>_}xjlM$GgW6m-quz>GnEP-!zz>D8il%|X)aK{!|QL|TRMv7p}ZI~DS}^&+;u
zz1;`#4LGZz6As@XRz+%Mo{n>lMRDLYNdd)9tm`$moRqHM`<*&c=qt233Y@003fIH3
zx$T|7>^Pf+v2|(}m(_6)+gJ{5H(!}ouayi!#3gKJOh(X8A#N@xvyv=bp@8+cq(GlH
z?-Y4$G<~Af-c<Kty(#UDWKiSoTSay7bl4rcZd<R!ObY@rSe~xYY;iub=|&2T+0;Z+
zs-Z2;HZp^jrPZFAKm0My4!2=2(1Gwg{EB?@5yjjmPlq3K`px-vSHk;aTAXC_*@cOe
ztVL~G$y96hc#dC)F-#m@pm3~C$0XAbUm@@RfPR>Z%HtyF4@^gLj|8uEsmUD?^Rd{Z
zzRZLgowE+js)W_!scMH*kE(4eR@)cYaw()&ivFRiXNrI<z;$&djh_hd9NKP~)R4I5
zRl5D+KO5yw?@f)OWaYc9v!#5xk2FWopayM9ii7o|v`SK9xTV!4E^g`*?@m#-Q@TNk
z<EOq#+;n0UI4&G}4MlhStKSF7GjpH7^jQIUo|uaGTsd3eD<2N@5d^0*Qa9vF>i&s>
zTEKL{Ny4U?1^*?1|Cls8e@~P82y78apVK%96QuF)E^9gq&dlOpU|?qgsdNeOOGu|V
zmj%Lvk~v77)17JR%mgY-4FE@ih}3U+^vBHE*-M!D3H*a4BJ&>u2qb?eh3&8s?8WKT
z&I}2{WG+Nx@^obQ@p1(nH}h#G481a6WT?=yO*HDv$u3NVNQ+FJHRu10{iWq+&zLW4
ia3_OXc?`Vh=M4TfappopBn72qz&A+(0DgxGe*G7tQN^tQ

diff --git a/datasets/common_voice/dummy/lt/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/lt/6.1.0/dummy_data.zip
deleted file mode 100644
index e12366b49dce2dba4ad322a71fd20705ea43d3c7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3824
zcmeH}c|6o<9LL8s#vn(8RBqNXZbFeG4CT(QQsW%Vjv361k*Ezha%70|5`z$Otul&j
z<rqgQM2@W-g*Fw6lC$04(5o5An*D!gUeE9M`|JCDp3nDqK6Z!L*r6cWZ@eVHVfEw7
zgB!F9<Q0U$gt&US6Wmpuj&On4$TIeBv=wxc8^pps$p!+kPXQ0RO@qj;58_FLdHUl5
zg77dM6?K?~nuZ!oT?3}B&Sdnfc^>PwsK8`&Kf)S-FUN4R?F#{?hJbpndN}+d?X;-E
z0s^VA0*iS70q2gwsu1wRRiRSIhHeX&48IlXVTu8^NsTAU=cACmvD^K{!F?2TaAeA>
zBZ+=u)o&$Tu*n5KbZLHo-ZR6cw43!$Wz9qO3ogTJMFZL}@4U_}TfIyf86hcq{(db%
z^ZkVGvekghpQ9sz(j&(MQ_^=GRCZdb%jZLg$!O1-9CZyXSB`Cu?9Wo?wHVc((6@JK
zs&eSlP>G_9p!-k=?$or>I3sUayp=P!mc{AvHN<P^6+v;5^`!14vI*#~Cj+$(hl@Jk
zO=5w?(>*%afPt20H_V+(d~4pcM(-~$g$vDuK^(7sn&o_ee0}i`gm;|4-Uz~hyFE{R
z;Qnw0$eH-2j~WdZ)Ett-l*x8FkNh9Yde8VL_wvoiC@&Sn@J`H`{mjD|Yc_ziCY)1T
zaNPDmkKA`VD=qqbUA|(P$o$1m;<U1&=LHe`r!60l7hWJQD$M7_3rXBED@g`6R5ku6
zG#;AmHa<UW0P~5JpF62e4NTRIKiDZ&J*3&?DEFx`-glzddnj}^e_=U2;QDE(SM=iH
zKKXzdcy-O))Ho%sam?b0=Lus8j`dMzKTW12G&jYT1$9^(ijtHyxUIxeqs6LB_F>NX
zk{t)#ZnMepI*$lOTzn8ZnkCQ|73zDH*nr-Z>1Qu$N6po1C;nssPH-87>Z?hyfa8;u
zCWPiiYu?%Hv<$mf^K969&r;>w@$`=L3)wTuZ8F<XB`7OWa``+$KL6=^itFr|f>z!n
zm%#(C<lMy-8o6XX>cO<NXTl0)J>~x1VZSVCNbynX?ypQds1Qpj)q-jHskSO)iTL)e
zEGj(POPAe}?BVdr2W9{QWd8sxE#_q>y8EGkxq<ZhDn6|pzSk0+4H&8<KS!5laiegl
zANV&#N#T4DS`-`uiKMjr{=^DrBcq30f{Kr4gkon4{w@efmHtcLLRF`<!oxpYvr@Nu
z!Qsd0#>?o>27&IC0drwH;Eyw;g&V0DN-4}x3auJh*=GuAv$6T10bNoLm#dMGg~xA?
zFfos2sW+N_Ti@`+NQwl;{2{g*oeqgj?cHULOv59VDsLG)&#9U&w6~g(cebq7hU;mb
z6*<yJ**}y>#b3X9F0jh&18UId4)v5|^WL{p66sj~XEmb^wJYyjr?p>l*qX#49;5T9
zEhi+jlCRGfs%RHU^v|wb?7SU(OOrTp$@4-%NubiNxH=ZP+O*-rsJ1HeY7>xn!dIa;
zn?3&v%`i=*B7SL?1xi}#zfF06f-e%crP8E_w33eM%!mHyEcPHt?^=|W8?PF%E25HT
zsCNg+#Gz>jAGZX(GM3JY%h=ofpl693&vIEGZ2MDDMSc~BNQ{?j>!EVz*v|g(pBodr
zTAD}A>uZRX9z_1i(+`WwY!Tvl;HiRYa!4+$ts_!G-t$y=i*>to%F!myva)d#OLT*#
zA#a4`k*wDlMp&WXW5>3c@?~f{Rzz=G(h0RQz_VT{OqHa%DZkOWd`hDoVsqR^@xTjk
zjAyCLbdW-tU18PSSV(pgD_2p>>1(YP120F<NGA~T5iZMJ{5@<jRIASl@h=XC-Mmx(
z9uW+E(?=}>A2TUV$L03aWu9o`rW%~CdjxsTlUUZpjg|90CYAbBFM8C*Jr6ykmf2*>
z_qJkB-Ku4v&vpy*7e&HAHpGE!=rCXQDD1{Dvsv{&x{m>FUQpLf{U~{e^3h=qed9+Z
z#+Fq|2XF8k4J29Orc@>nPF;x5>GQHif?+wf0{uG8MmoL{Ueg@qd+s5xT<8u^$UWXa
zEVM(GyPQRdWja*Bve~rJkTQm-W5<+f?1xXXa+~ZUnVxm?Mq7mUb+&h==-p53>j=!v
z$McJ3C%Ost=0uYvZ^ebec5`Y*c%MeudDmCA3OsJRQ(xz3#oy@M3Vv8XF1r!2-4&<O
ziO*|Qq?}JWMd}`K95SEze7g`LQtRG4a2BgAZ6Lx~EL>Y3h({g}FxN%(m!Ch8nXR9)
zbD>a6?pNQ6foxXA@}wo3;T%InLex&FZWoT7cMX-wecF=AmAePVLkh*j$L`)6ru#P9
z>|r#2+RP{6OA1i_>)j764;ySE3+o=X&D^Hw7d}5|(wO$S&U<pKdC&h4P!>cFxMFO!
zKH$Ij(5*=uT_~M2X`II-Y1NYgFq$X*?y>0IvEGrc!GLA~<%hEaF9w}9Jr)3C@TUNE
z)9lcx>-`oZH52GkCaGWD*VfF9W`mKrot<gsH!}esf5&#%9ir7D{j}>ZUPf{_2h(Kw
zHN5q4X&txz5@LkD1~Uz1xRRKot{=9H)RPdVsT)S`zp>X^e&dW;V*`~8l=2D})9i02
V&Sq#1KtvD-3jDD^_4uvs;vYOgF*g7J

diff --git a/datasets/common_voice/dummy/lv/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/lv/6.1.0/dummy_data.zip
deleted file mode 100644
index 2ba4a4df6f13aff548d30afe058271c5626f4050..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3862
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3R5ltC=u
zUtovJ`657Hb1+<8loX^b5uPW`$iN`N#K2%cyagpiiJ5tNCB<dnfSNSVF5%}iJM@4$
zJMpaV5d(qN^UJuz8^X%|h)+MzDz<2&EnnM_zNGg`Z*#L3$o6US=r>8T7R)xErGEDN
zym+B!W=kE+GW3^}HQrRV%)h+UWR1#{?|+OB^`sn=DNuh<yI;lo<A#Ltv)R-67e9M(
zKuX@$LX~&(L5}59y1i})v|VIXbl0Bo#LL4}q1>v2Q8YZReT9>Zpo+&*%ZT?orq~pn
zRWW9BU%a=oPqWP_>$<s{YyMW}R9mhi2PXd5wC?G@yiXRo(>tRRw@O~kyvtH~Eaq0F
zy<J>?^isW8<7<!IoDXT&YMba5S1sX_&JD=C_3~q6)FMgq8-_87f4^Q4mgG==tNK>=
zYs9r`mXw_n{-4S}p8s!o+}l3UB4(*mT;)9f#E*%C(hnj6j?dr=;sHhg8!!U&h>w7j
z)G~s>KfyNdumO)-`Zp0P#|8UuT++2lG0*5M3b`>YQCn|<t~dV$IpymC??acqy|q_*
zKl`Kyi=~y%?(CKmJClARYr##ym}z<bGtEA{Ed8fjs34K?Z3~lCxOJirpZlud!y%7_
zKQ0Kp&z`z-gIz)AJQLQ-T6#VXhowK=i&)ttwrru-Tw(XOXEqu-dGy;w^m%>BaJj7X
zYQdJtN+FA)Uik*ZZ3)x9%Mvo__Tpg1lOD%Aw<z-J`36_bk(Moee2;Ur<@(k9mlrMA
z^ZQj*V)dVz!=1b{3jN<mUN71|tzZ6(Sk!it>AxHFpB@XlRTOnPS8qGB$Abyy3(s!+
zmsC}8c*o)WQ>Wd(w48gfu2H@2_uAi=A95qY+;d)1ka~J+jx8|EZve3g@nK$;n3D-C
z9a2-^ArDIQSQ38f1Y5r&4m_=|ABtEqtg+cI7d~Z6$QJGXw>lFydAdw<UGz9lriU?5
zAu)2({69KBH=miTJN@yJyvL_E-`>5!^~~P9<FBhW-hKIN+n;SUM&+Aa-yYrQQNF8E
zbl%$hx#wc<9SIR_IxlLo*ujiBgyGY^zR2e7H-y67j>;?*o5say6ZX|-)|v)SKfx>M
zU9&7Vd9A#AyCcdgLHWlOlVb^*Gq}WD9tmcZw@DaSM4o^C_4ggmHx}{z^MnFi>Mimw
z+WIY9eUI<9zb%iy4e#&g$IIR~F<(#Kw`+D+gy&M_o{MWdA1kful$si7DgHG?LMK`B
ztg^<6j!F7TcAGA69P(va<@)f*i=|%#lU?u64`MLN@b6R(Ji$0ifL-vw5wC>}f1EiO
zRF`?!bMV{5=RVAof5%q)US_`h^u);a+Nz6xKTLm&6jh+|2|23tiI1w1)Z!9&Ea51g
zMq`O;vE<BE_Vo*}PI&~(P$bkM`6U^tMTDa1Y`<mRVFw=9>}Mi&jf&@<&D1jYm>TzB
zf=qOS*@;a-zcp4Cw@&%C*U0SBGXJCh8HH;4fBPNZ`j#av%D-Tpt9#gvGjGk-R<X^r
zJ9gj0y>)TlB5r}=yDfD|ueQd?wnxRUSYefxZ4;!VsVH^v(9zo&GgPKJKU`xMSt8+L
zB02epve&j!j%NbJCyvSmO+MjzaLPn}!Ol4r@)vffxou5~O>SB-eZ`|#A(KeM6xMld
ziB%8E*5vNrI^$J(`lY%Vx_ip2{_lxx4|m8&uq{1(D|Ge4M25d?|Ap`Gd@}Dx{;rFg
zq)W~3uRFbNfBo6%={D!L=kHqcd;gj`a1#SjMZ9aBU&;*(*?&N6MtmN~%)>0OaAbwE
z=dFW|81S$?h?;Xq{YIy}=Nsn>7t>R`&q;lnb)-ddSMCl5^##wkrrfN4n5tpPF!jrM
z52@Nj51XfYf;uzKZ<u-@IH4kLSM9kyjA!*P=XcvWpBMhfCE+x)C6Ujf;O492LhlWX
zW(#atrE0b0Mue-3^O~)pj3<wpv7O&~`O=BDj=4d7S(+zb2YLKD_UoAY-Cd^^Zd92&
zec@K2ixXB#uk=aP&AzlOh_}#p+Dfy&C7*Pp&kH;&oRj~4*@3$o$}dfgzO`+6;l&RH
zF-DU<{LwPVKm76DV>Kf?Iq{88A8(p&RrqN6<JZ4t`Uf;N)%a!mwc6dAVesZde($^E
z75$TMT>kp@m*OQYF5b^pmpPx@pOKGf-GPc9<R+pi@j(v__{@CNHWDcGF&dMMOmfV)
z+N2Ut`vn-@I)a!eEmT%W3zY-|RTvNk{&jRkH4sm0m9R~iZB>L#ZygsCX%o2R3bYK=
za>eQ`M4muxw?ZrfW)FrXjVwS4DV%V63uqZqD;8)g&e%cNirQF1wzU=5d?V6Uc+-ur
zotTXkWIKNXJ0nEeiB=K-ZN}1U3GfD`MTBor>t1A=m#`9TGomIY>|IcfL#;}XE%jn2
z+ES#7g*aPLi&$h^S8xz*D@K8f&u&PT$1GzYb^}u~Fexho6_FOf@Dc}?)vRnFqq%@E
LkA;C@64(_0QP4k+

diff --git a/datasets/common_voice/dummy/mn/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/mn/6.1.0/dummy_data.zip
deleted file mode 100644
index 0c27e7ab976464a08a66e48f9426be54a08f028d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4768
zcmeH~XIN9&7KQ@>R3Ma4WDHUR66u5@(uW$9DuPm^B?6L!mVg3Mq>0i6G)N0o5D=vo
z0YN~DD4>KU4gz|IVxdSI^d{px1B}C%`Fo$6^JJfM{;c=xy}rHH{!C8*=@<c&-^^W4
z^WPu-7?=RO03;rRxnz%Y#5qczwm1L)W(u1*Q2zLfOaL0X1t0)G_nthM?imESJBYKd
zq_c;&C*DU=QA$=)PDV~fQdUk<R+h@>Ik`@|KJf^Z(J|O>0M=~vy*{6lJnQ{{mpM-6
zwq3DMA`JjAKnnoKP!GU)JGx<|a6Z1jFH};dy2E;^`d=67Y5#rfm<$`>Ibn<OizYA;
ztUK$@d@auwn2MA)uIPY`a~j0t_|(wS&%%sRmByT#koP(Kvt|q9bzyl^iJcr1b?XxH
zJzc#O5q3?E?<)7lGR-^5ULr}gDK9OcmXs<7TMCGuCM7HS`VopA85#SE(}pSoXN=P5
z-<3#POrp%VbWQul%-3zLge>wOw5CRS8!~b%CKMk%5onp#o1%LGUj#qwE%+Z6!jVz`
z$x;~7nh_&MEKO2B_ETj~m2uBF)H*FsW<X53<VJ55{#D28j5?X3+tD(H?kR){|HKEA
zD!AvYV~#za4*7lz%M=c-i;9~9bt+)9gX;?M@eo^W{eI!7eZ!Mn6OT+-_6<*gcs-vI
zbA=(p@nati2(>i&jn@fn7u(*(AX3X`M7nwn?lIyko$zg6ti(k0#nvLT^oU#TYd>dk
zlo_ChTbIh)#aWugR%2@V7l08ezdiAa_%LbE)Uba=AxD${a`b#^GVEBw;L&c)XOl~{
zq(>a-zeM|ITutjWu~^pJ8VcU%FLyQk^s4>-qypEV!JSSGbTE*j<ZUTP!qvucWGNJq
zr65JU6p$$29}2<L$e31^$BuMr84%ff(&c!@;K_~a&|YTt7m}_r5~foVrA(FrY~cc#
z#0aK!d}PE#izsGdvx!6sGVxWZ=bb(r;UDN3v7w2BDb)WuQ}a@DHL>`8H+&isJ6dnC
zamWj%6<wb)Z3n6urGL(ZJ!T@pVWu+`E-naWudo=<gb3Z)&xO!7vPeThhgv<^-BPZ$
z@Cbt7$#5;FKJZ}#Uvap1y8*X3p<g~VMB6DR2F`I^rT}_A5rG1`3Z8bLO$0|GPjjYx
z#p=&CiY+DdhujyyaH(CHzil@RoX>y+gui+i;H{0;m>qqLB{p2~c^43Gik@atYW2OG
zzKQjjRPvI>{-*$I!9=XVYd3@z+tKn*jr!t)+QzC(!)fX6iuE^xViZCjj(M!k^36}M
zM#cN^fFc>{ANjJ%0@T-J;(X8=S{0Sz2Iu{vP2dgcPXmv0gzOu}1H;Y(FU`Amb|4*_
z$2m(PjclKNW{TMg2)?pGvz=Q+rjz+9*4VL?oZDn_Zsn=Zt*;~6ja&~=$Uieo+2((<
z)lrE?l&u~?>V@G#gxJ377s~3T6Wv>q!a6Y32q2JtV7}qK)ZN;ugDg&)%og6nxHrht
z`-RrzKd2>S4Fq;<dCLjoLwlahn$xXDza(Ans%cM>=8`PiSd}zBYgBjSOOA|)SMj>g
zWV(pLf=;H}D=4}pdsTTjLBQ2zL?kw(U;d0rD_aifh3Ls%?)EasP^JL~;R$c8NZd)f
z$oqXwT65O7F;gWp0Bt1P6@ZnK7kzvy7N;^77HgfaOOouE>I0k92jvJ}@iWcxJ7}3=
zKT|XxwK?g3=Amkns%2X-@64sL?lN=~gz%)~XmGc64CE{cWz#j?#M}Kjc};zsg|xUO
zaMtT`G1ON@5#3|P(77o8Hs7Z<apj}gXVugU@!ShljbL*)kx&O2DJrva8KoWLvBBkO
zZ6>G{S_XVP+aTsJEG$pV&qmzQLB8Y3sf7#F0r1SJLL;|0oOGsyYBHHG67Sr{rgN}|
z+b);7ye_B~b?12Cf&tZh4TW>3|8~TWcGZRrP1iRI>~~x{o3b^Q;wSXp7%E;j6I3e>
znY0+15~rJ})}Xr`a`zm6_uCb%@rJ~YDo5FaP0%l*J2bZSLTL*-^&(BZUT`QM+@Dg}
zqbvMhD*szjiTKsktA|`#vj5!)!UN}u^8S%lFiCb7AhK+oSt+L^0sy%lZpO~bzw$J?
z-$36WRf;Zmxdp_ix0f23Pi?T6`xE;PX&>NxB=fcTJi}Agh<4SLHXL79H;6RNtvjnN
z#BVv+Fq42q*iNk8eDM@a$X73by!pAs@WB~|8iQelIn#5YEC`jmTBaVV@II&%T26}5
zG3T23ag=c3s6kiqgP~BPv)1$iTsnpotK%*%5ljf4o3;sr`wqzk%#Eg+6;%$*H<K7r
zVI(>C1p$4`nO>+w16Enxk8hxIuCVulB&;RR=D<KTX5F$HWfkN!KBONLRgIY~)Og*$
zahe51O6JeC@x|MbX0EOm!K5$xq02pN_@ArEyJkJ~d30jwv+cRL0`(!Ik=#41<iW2M
zcDvrqbE7PXZxC@pEOXL72;R%?s?;iMXHurBG<4K4ukT6tNCJ((b7)9FDGp2s9S{r<
zR`ruJM1@ue<d@s7xz@t9Abc@h>S1)KS^#^Vaqn@FlzpJorC;)W6m{+<TV*0rO91Uc
zTdd0EYlnhFt(t_gPK_u|>D78(x*X6!zbY2DBCKc2wL&biFZT>zyoU=GtmE%eS}FRi
zNk;F>&4R-EOtwgu+D`jrzFzy?^@WKMeVEdZipZi-q`tnmVZZN4dz2Y`A_7a6jxK2#
zc9AysScRqXwU)O6Or|w-Hi+stkT4B+hOJJMv*#l~UmZx`$gdatFi+oQ-#iPPcD9vT
zEGI~UZK9t3)Q~O2Dq9ghH&Xag)V{oOLzNeG%SG{-cK17p#|Aghb=MpTFoh3As#=f=
z&_vFaK71?v&HTlYh`i(zqHc#{Ou!|B);v>}HM6dNW-x=C1mw>lt*%%Jo}c-d#q6M9
z4xF3Lw_AYoL%(v7f{vM{wOtA5OcGv*i`(jQ^PzhUS1%2*!Y5-D4vo7tTb6t&Zt!<~
zt$a9mEZ(0ag-|1@jQTZxDN>Q+E5a*`tRk!CHXeW7DtJ+C_i00OG1@x5lK)PL^r<&>
z@8_62G{zL})nGP5X>Oo>s*k;vxN5BdcG_w&W>Xa_D!*|HDXSx3Ee6w@vhxykHn8O8
zl7Av$#Ne!XFjx2SG~A2!NDF<pLv+7CF;KQYH!d3wre!`Rtd=H(BRLuyo&k+^X7e&c
zFG{M$1**Ow>YrQm9jsmnoldQ}m%i+15uW$z`6<7~%el@PEB4BK^dR<E_3y&NOpST$
z>js9mpwZ(&%X2}<E|^jh#=heo5g)L@*Z7s9m~Uo{(vfzWH3jMw^M_vCJa)SkifF#G
zbu_emz`fid&mVj?z=9U#ahF%5|K=6{ML;NEJK&a<?(Ts9;2VET`rb8elNPjcsU-dG
zA;~a`hy2@VZQF+34)Pm}yt4s$p>*Wij&<5|T4dM`KS`$UnVoIwZnLtJ%1XBAR8s%6
zE<a{|e~8}6oTH<f`IiDlCjZTG+VliPi`%o^eH`0KhU}x7ynQ(P@pLIWZg*qf3DrA5
zHFQUtr;fV2Gwh@WFi}nYu21|Ed#@VvjZIcES<0m#s@Z?7ID4V#87QF{$rqNa9$ps8
G+kXMae+m%*

diff --git a/datasets/common_voice/dummy/mt/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/mt/6.1.0/dummy_data.zip
deleted file mode 100644
index fe980849109c5493a1dce430ea598c1b08c6fe1f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4094
zcmeH~dpy(oAIFD@W|dqw)}r*wFig3OsNBwS%UDI{IPQMPMvPEmNk#fGPRlQMU6AS^
zm)u2hVyiez<d%gb)z2++{e71nvC3)Zuirnv?eYEY``7#V{k&eE_xtmjwYdNo0^)oI
z^ZaZ+J$$~2fDj<JKq4{3(ao9ctaga79wd;q9q+`s0)s_BeBdzw5D5GZxLL1RM1i}A
zYY^Ji*WWKN0Ij8pL2IaMsG~6&XbgtO>RH!ZxL$E5kJX9fWx%EkgSB2?0$8;$s6ETY
zrn$iG280h-89xZ5&O3nY@9afVB?kn35-K6hz-g9d@U>8krwqIR1}fNpFV{s@nAKMt
zHc%I(6r^I_3u6X(u=Z`1Fce1^IqE=vg7i=K;;yzxms%97!8=it+x6lMHN`4YXdTQE
zdVW=VAMCJ;@}cXgQLx5;yf_M#y_8oR8=8%lQLUZr|1B}T`)!K$0ipgNT8nP&D^1kE
zx!>+Rz(PF!)vGAD4el?KBGsT=guHzKq*y%O;zx?mX;L(4l}jCzyZlN6l0^04H&Q(1
zQgFGI^nM6uK5|i_&i^qx42zmBYH23awbh3%C>B}f55|a}v)W{#@n9D7UVoEkS*G2{
z+rF5Gaj!8QC>GzyY+8*$`p*p8@Uk}f?v2la5Y848R#lGEr;NsH;`=yTW<_&ddn1-G
zpg9Q8T$Ojt-Q0t?6lZ@&@woj~_3EM~9YYd$?b|sBy@FJ6x#Q8Edyu7nHXO%6aaH`;
z{i^z~jV{e;JxdcySB73b+<k=_X_t}m=<o&xZh#2y3&^xQJC9j1)1m6LAUq`_?2=uY
zgvHz2QHSO-?jW{7*VS%pp^%SK0=vB5yQ`;uyg4PNsgwWO`#=c6;h#9K(a4K7HpawL
zdxIy5<U7c9tbAKsL)3F@4t%=H-V)SHUXY748MVlhoNj>7QG*K(rZTqhLwP>KM*}S`
zN}V5;Bx#c%d$cIOGbSGf(2w0}G*4p(i<;r^Pb#JE7J2Oy&%F?S@kV(5uQQ8}s9u9Z
zbZJj9-yJoiif*a$aOPV`W6kqwz7@R>q`BA_?|ih+56~M4=)IeFy@Q;6yntcg?)F7>
z&L02HJ||KwNwOGdR7Vr)tWSBNyqAKRuA!B)wSs%|zqh)YT1EbnBR%rqR+a-9e|P2s
zlJB$v!;*R0>o)Sa?;DumVX>)GzZjILo*x*Sn=O5mmy>r}rNS_aN|-R!X-+CvHLA8t
zj)K6OXqzit{h}0U5@j%oVcPBgw5+?b!C6Y3ie>UgH)a&$&sGlFbV`_;D1oicZOcHx
zqM$O9PR2HrtLCCf&u&V@uQ$4yl0~#H=^NEbXl_+|QWh^?7ZPIeW^7`z%prX7xkpuz
ztJ7V}zMq}jV)ID#k7VD4>NIAi*T1=k*b+!=9H~92ubdq#Q%s&3NC@<VVRoc%xKAO(
z7_;(rnH1gti?&$w3N^%Tml29cK7(fZZOescqx{k0Gp};4#~;gjh%ZchDYC0v$$Enz
zR5LnKv^7JhFp;v95t@{>5IH-`JXqBZpQ<jMDra?cgjGrhHu^?RJJk42%(n&n$wJ<z
zw@#h8R`rrS<=Eu7;x1=`t#((9cX!F|0pu^nat7FU+Ufts{x6MPLPa2PK-<Fq-qGYs
z_H_6Ek+Z2(JrV)~J$H;PA)q}1)mOytIU_O$5sA6ZjHJ63=*ezHNk4v&CD;n{lgyXj
z`$1@iQb;e^l5O7;6}RbxY$Va;-D&JX*QN?=&)(8-bUGuIXj`Ex1y3e*MoG)&PUtpd
zo7oZKm5nC(=)*a8UXh}`p<oR~cY$Xan~R#9;=L#LIk-;d+K=V>iyY4T*g(HQeHtSu
z8j802DK=E}85lk6-BdNCl0s<L>+A>&Ptm)W%4A6x)#7k!qdUrmKeBb*viH`oheOl5
zDjDLX4avRQ1~(U3mr8qRBU2^N>G8MoIV{&|<V3vV)+~~KccOOY+r$i}f$^?=2n8o4
zc@s5^Kh2t9_!OrkW}R?;W2d+WVXP+u|0Y+lSWzx>pE6`aR1yx3mDIwd7ZH34cGFSY
zX#Mg+^GuB+GY6YL<j*7C&v~<@xm&Td-`LX(aBLX3w0LhtFVeT8cFm47=lEvaF^-z$
z$IP^x7!&|w>m#yqj@RFL_xS!!tc_|yq)AXA^oUA?N}JJ0N!RILnL{N1>hg~$pCbWr
zF9sWrcU2r~e7ex5bR^)&i{zfvoG0d`Ta=O+zmF!3uRlqx^N<l2GjVjOk9EpjN0quz
zbj=3yxXCzW$Ds!v5@l?pt-DEWzwN=BJ|UqvQoOF4`6aA#di7=*)AyM{mtkiy-gh<I
zNCT5F*35;9TMi8&^2RvPnUL=1VZ|6)!MnLp46QVJ!H(p=_ljC`*~XK{U;n{?XP;D}
z$hv(n5K>ABqnUQA=bYPZBtXBeNqa*fP~^NXbk3O>gtQ@ILs6Z=f9y%z{Ti=cm>=S!
zPp&&``=MYb4t3PF&@C3LPH2NIiX?O{39>zsRVwBe+dJkweA6wOPvn;9>m`XU1b06T
zl+rS2VuNP|IL<9cVa^a;9fdV{ckZ)qUcTH}kYnCAp_z{#DX>=dhJEIXg2wi99=VdZ
z;BpfG8wC|WD+L$!n{XHWi&Xw&(zls>g*3MR8jqw;2_1lO68d+SwUQON^Z7Ci=q}K8
z3K;lVrPG?r0${7sIzV01ofRr~LS0P_28J<@)GvAU$INe2+G^%&;7=Ei%&!IsK>ki%
zvo`1K#g)}^Cy3SL20@<5D|zC_%jI+&_vyA8nl8jMbk$kM8<l%PucqSH^Gy9_UjG$)
mtr@e-26i&Al|@8&W`8|#)<O%e<CKN~UlJGu3IzZl(0>7MDa;B0

diff --git a/datasets/common_voice/dummy/nl/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/nl/6.1.0/dummy_data.zip
deleted file mode 100644
index 38399d0736bac119321a57456f4d13df50fc280d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4075
zcmeH~c~nw)8^<r=mRhFbmey#ZQ_PS+QbsOgBJPQ1=~$wu7`z4~k(sHJON(W>G1{i)
zHfgRAZsk(%xRa%mTgxP=iFLxMnNiD3=aQY&%8CAZ|9RnXxtw#)^SQt8_xC*a3(*lO
ztpJE$PYU<Cy*qh-Py}>=fc<p(L7xCJn{2$@eF*@~FmUx1fA)te0uX5t6ab`0!2@y8
zBJhPp{6mobOxE80`;Z&fp^+FA28BdpkZ81o)w6T_7Tn^u5>{vInFT0cv{>x+Vc@Fe
zf$nTSw{MKE#y}wepeY3aD9HhA7MaFa$KDt6PN?Jz3*X69i;snBzvIkk&{L8TV9jN|
z)ryKr)XmZy*??&ZATGhbKARLaiP^5)K{*h~%^XT_7!JF=>BQ1I*G<C&4&Rs1BY7#x
zDi5wspROwNtRg&;Q5|!1T|s{Q$g5{n{}y>z4P-~F6!lU|KIfKTkG$3eIays^LjKRn
zvB26><O?X^LGDcg<-dp35?3z0Vfw=g_u~$4mQ-&<6`07JH{urUXXgakIFStdPmUB_
ztSAwUF^n>T&fYWyg2w2-$A`{v5_aXphdK-+2Gub>KHpc?sx$`)&jVTb|MW%<k9zx5
zv<cHEqOq&8HC{4LjW)l!vXK;;=3+LK?h=>x;)EwjV0+zflwZH59wOc=b7H3j8j{6g
z%YtIBlU(coO2`+bP6DMy$sg$*T^k;qv~>D>AzMww8uQdNgp<48&BZ6-zB7}kaa>M0
z&Si*eXMXHRH$mvU(%Y^*+oJMu=hjWI%H3~~G!3jH<(~SJhv}DjPBxQ0H6gEu8{iy4
zOS0Ss1FHb*fh_<$Z8gc7ceC-ruT<~x4Xah4o7MbTFr*dANYP2RW-PRv)Q3LZWW_bj
zIuMzisCP3~i5lwUkqEODq7Ig_cJ_|r4zyyrFoz)xX9QeE{v#X9bFkFQahC&e*ooBV
z6BVr`^`yayJ8oXbx;74+l%lvNcy-lN_F3-X+jc8IEA8Bv^t7wIJ*>bB*BU$?&a`<p
z$!h=ed5iFyD(0Nj_8ER|p&2g-L7>zyP-+v&r4AwQp@9Q{67WHA@xK1d-liw<>AGm8
zAMUW!w?$>m2&p%EyUH%EVIg;ZhYTFTCZ_H>s&mq)1#OlD*uOCME9?^HX#^j8ZFG7&
z(N;Bcg75Ey!)yDz^-=Ip^>}t91aiD^8!xJZ>g>%d3{N&}$&GI<q9B%gBAB)7@1+b%
z;|uK4&P2r|tl}D<yTCoCUe;zv^Htk*S^>%G@^wSNoRnhG21f4kGVp+kBG;f&1s$P*
zj^a)GwR;~j2F)s0dxu@m7i?oSnfH$pYJM9}ZYGtMTW_xDAFE{^sX&hp)+J0^71#aL
zk@_@<eLq12i8%DyY<OAmbhxD=GFQ-cb)9;p+V#8PIki^C$3~pZ><M;FZHcX#I_2(h
z<zq){hLm>o<;lBlR&P1qji^OC0tHoYt-$oFd~$U%Cw#AhH`l$Rj8`IU%dg%VukGE_
z_{;NcoZFoIo=7+S8O?rz&dk6J?x65ub!3eR$IzFN&{^TX%;G7>Gnh5?Q4h^G)>v|5
z*_3_k51uYEum11p|HxAdQ|t*U*sOYA_Yh*TsT9^1P3?4KJVYr09px;AOua<>+p1@{
zOIk#@C^(1@eQ+1TDp<8z^Q2KLw_^)j9}*5XzFP4Y=45r}L9DF{@5;G1qCXF89z$`>
z@4MKgoU=ba!Z==CZ^CclQD?Bl<`GMDTRb#>jJRCV5bf$(E2H>)z_VNRkncdWdo;|d
ztTBxvb=-Owal|Ye{`f}R*_hJCZmPEXnmFjCp1|f7PidBeN`r=Q%htxQ-Jv^sMfXN;
zh0*wpS<Vm@Qm@XLN`E5#S9;`6mmWJPPrjPiPOQtA^q!{dvJ~d{4=*ESR$b<s^n2XZ
z_$#_^<>3_(y8GhKsF9<F2#ywE@iu+-+KwKE#kXyKzS7?rt%~T~lNF2C?z?1H8!S-a
zR*k^_2JlnzcGa?Mm}3z-;3dXyWRT-D1NGx3)<@CPZP=TR%86DQ;ievq9?GFuOzt>F
z;3@BC6gQ_#-<!NmIu#N9Mk|u%e5Y_!5xc3bA#-m0ietrkqlP*p=%_R3=myEVl*agU
z058(4?TJ=yO?pa%-cW^Qf~jrPtg`Olr`kCESPhx`S)xC(%#LcN;6{JH)|TdkNtr;|
zAev$34+Nflk5(Y;o<IEMO<2e6sY%yJgIf)Ohzta@p`!J-6N&lW?bYSVVbVIW&#p8>
z@=`*GJCu5oxn2`qw2|=T%T2CjdBn)F=?`#psJ^}w4VAC-BQlpwnA{qpm9FSeuoXe(
zq*C)VJHGR$@z%)cH~Qj**L<}~HkI~XSD_(jsyPhh`0>=r40B_mnrI3;uqCcJ{m=kk
zJK{o|TK8MM4F9;|9d)vG+uN=*ZiY?cX{UWMHyWJ@Xts?x<!o0cP+Y0lE#3G>5q#Xo
zQ0t|^<2;f8pGVf$-JNHf0lEKodcXh68a!?tnkh8Qw)eEo#Xsq}@y4=aHipdRUa>jz
zb1znMbKYA+V=fGSVt+nK$PlSj(8Yqb)_eYPKx8F8T@brvW@GnP6odnF1>aiPEG+mh
z5&X-fPXqWINo18HA?aNl2Vvqk{@G>C<->(xd=>^a5s(`t4SvnjY0+hYuzArOq%P{t
z9CcwTolgx0yH`T$hm87V=BM#&KJy{?4@*MkN3#MXe<r9A9mRWbZnX>Z!+dhNjKt)*
zH1Xx-iaT!M18zPvPEKOzyvLnn)P+NMK6U33iK(BA<bPu?He+Vl;7$g&@+<J7uQT|=
W#90h2BP%Yg0KOU00I*w8{Occ^XR5XU

diff --git a/datasets/common_voice/dummy/or/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/or/6.1.0/dummy_data.zip
deleted file mode 100644
index 276c1fad039680ee45ad044e0dc3ed39c162fcce..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4316
zcmeH~X*`r`8^9kK`;uv7$*v*8VP>pHAw-su7|Mv!Fb-oG>u^Sl>|@PRLK2#kHA<91
zb}eKVsneU2PGl)thwzT-7n0MQPw$s^<~PsGe7XM5{rvCiy06O&#Ry^tR$s3RJ<*?k
ze7$f0!T^rw?oP79VSTaiBbICcW7cl-ldB)%?;HRFXq*uMKohi!*_J_|8-qCb$vb#>
zdlG%*RTYr(iU>u7JW^2}iKH`n?q&N8x44JS=pNo*0Nb;*wz_>@+NxQB-dub1jiLg3
zJq7@9*aiRy`T@S)SUf?&*T?VkgG$cQI=PUl_5FitODr>OK=4F5P4fl;;leEm<5#1F
zpW%0_7@cFryahNZNnH7i?hzeK3>>9rt?n4)r<^^P-c++VNq)5A^d{o^V7Vs_J1A%!
z*=*I9GOZtAUP(5x4YC$NDb3<W<=zO{7ja#{ov=}wuTzmfa4NYIW8@T<YbiIOeyALr
zK69e+7*sX3sP!D=;Dg3p8D6r2D4EzR!(B!M?<bNTFchxjVjecO<)Oax8;ytY?eR*{
z@nK31B)xQsViI!fVn-Ldlu8-Vx^Ky!W30Q%$eNm#s+)B#Hcfm;Tto7c3D1c#8;K&_
z*@&sg=bH5#^(F8s|L`Dqy;htUPgebG{Bx0d`FcMAt?dbIQ+<911ahbscOJ3<89N>h
z@8SBp^4A$~%WUGZgNZt`P1LlnV%{W{XvK<ns54G}FvIH4SpOuXaRJWBvbx#UY*ko>
z1ZdN2`Os`B&~FRp==XD1)ihVU5j0mMAp8@gWv*Lq85P~rctq&0$EVW-z-AK24r5M*
zf~QsY3qP9QE*rdo+&$2u)!&jj>2$StGOCK`oEL>E2Zm3Lb@mq3aEIB6^QSIh3-^c~
z3Y`2TjCYlY&Z`^EkFrVOewQwN_GAO>e7IbLkbtR@?pdBlygj052y8ApCWj5Li(`L#
zyWY5ThqS!VYZP2rRZpPHa}Z;Wydq0bPYG3i>`+2Imy6@_F%`6Cdu0eK>gF5hAYwle
zT!TbLRHTyaqu=VQImk3c@!H*N=B-740?DKA9r8L~F%Ki$52Sv~Y#YXiSite8SXgXY
z`7aB(dcbPl^z3b#lyvW-NXA7<X*)Ams_E)r6PD${b12Z@zyW^Pi%fn4#l@n-f^{VV
z7uu*t4bX*QBQB3Qy+>wx>`cg?i^3OAMp^b~7)<)1AI*~j&66_yp8T+Gcv|^!#C@~1
znpHo@EcZlYLJE>6!gUfh2Wp*rn)Sy{gXm_ZVIJ56v2kYS94xB9tV6u4_@*u|D->k)
z-W(C^C&g}yArGr{EsA=mRqZcPwdeK?#1f)9Gz&>2Om)VY09PEjIykWFSlhArT(!uw
zsBSl>yyAr7SwgEEzsw%LnuSqWU2&n?rD3nqTzY>|!Ny4THD`E7I|s>g*WID)Fy`9D
zusioRp>R}wat@|k#T0Y=r9e5WIKNdBNx|^wp8bwua$SG+Ox=6d7CcPASWUM*JS7GJ
zt+@FE-OutX?W&hB6Bhext0%^7JZeuZmc~s)lK&d7xiqQpYH7B+T_AW@d+f3|a%S%R
zo%hWdUL!(2`XzcXhD4}>Eig(^X4r|v?GW?+fQGWbAp1nZPg=Bxg=NkP12WKamPvK7
zl`h2`60&g>R46y7b0O}upd$9Hq1ynHV8q^}k9G?hqa<oD#_$DKr0UaCi$l+zkjc}H
z<vhW5WK)jk{Y}9u@K<6Rb8)?6IQ@=&9esSi>D{7m{$KC^Uaxv=8OfKH!tejx!RX=Z
z?CAY7zimm6j2pChBm0LfJ_3?T=_b<C%8rEmh&-74EgK>h47SQG53M|U#YCHzN|-q^
zVy=-+C61w6U$3nAm?lr_`|qnzb0Kn$KI#<R6<Vh0-bt<{n#H~2Nw*(zrMlVlRA@S|
z7`kv>yk_G(a0}7fUPmz60WQKcqU*!$1%(fybwO`pjB^n~=@J@zgVMHM5%@-n1`uCn
zv*$rCX*ah!jWM43ihPb4dn_s8_=6UviY`b<zA4E>$tBJVUmq=j>_6y2jn#k`JQ;U>
zCd*t*p@L1l?p>INeV7=$+wVT96-avb93?fxPZ4R&gm2FoeH85vnX<~6Ico57f7qq4
zfWaBtzTp~;d4l=C2i7jgq-pit_Y?ax`iiwlY8+&w0SK$Si?NuSr%f*4nm~4NtOCA6
zWzpkz4baNOb9c<-w%j)b0u?7B-znH$SU%ELARWZ_^w5EseYLibi#@+p2r{Rgxg{I6
z?Zte~P0}B}OE;H+h3^j1zCWj%$TYjlG`p(wCoi6`xkPV~!);0Gri@5lZC9Tdbs)P;
zGoj_GAn5`EEe(+hDL1>K4L(8%PjtqhpYn8SXL#-uV}i5oVA4!Jzf!w0GXFZevjN{L
zgLu`ggm@n#YRuJc(5tW33(2k;@PvP;VGh!?4B)(d_63A<?*o=WgISyFqF$7f3apY+
zP?N+#coQ-_4jrP-d#5%!wEQG`{#v$V3newCG}GMnV|+};TV0DR4nw9QwgRbfY;%C#
z4qG(F;UeVd!(_hO&BjRcIT_2+Y_t~}uD{~Ab7fN1X!FicsnB2dTped+bc$2D>p58X
zIK5cuQhVT1@<eXKU8_uMrn>es)qnS<6RS>S9{+Xm<>Ti4q?CAu3%pCh!>M&rt^bjT
z)wcleJ1nR)Kf~&M`o8Z-j*6(8ka#G8b6HLYG!L5GUW>u)5B8ng+coIOhMHm**OH_(
zjw*&c71d_C{*3xd3hBuRH1?A}7Meo|^66W6-8O~@N$i4_YE4Oxus^UW<A6AUcNxW(
zT>hXo;jz}EuQs{s8+{e}ZGUx*_t@C!t_t5Y5-@BNW!!4A5ctaH0mgM!e{PsFn7+&z
z{~{n1SPQtMqrWlWKg=9ICvBcM)=1+zm+2&Zo<C@?)%oLx%UbJaH%=d4V6?0Rs39QQ
z*ShDl<+5n7b+ZVKx}}{p>c%d5J@p2y9MDPq)>HqSxw#`;&pZaAoB3Vwp^<;k8=9e3
z_u|@WH})y($y&^Glh?YIpD%Yc<2Ih2>!Bj7bVJvjrSwrZR`vDNMK-#rn`-<&vA?AJ
m<{I;bP20(|t^AgQZua*TXDc)l%^U!*)7}Ky?!j=b{`)WG+8R&*

diff --git a/datasets/common_voice/dummy/pa-IN/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/pa-IN/6.1.0/dummy_data.zip
deleted file mode 100644
index de46a6bdfe71da091c227e57ad90a818f1b58c42..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4416
zcmeH~c{r4P7sv0((%3?l2#;;7p_#I#(kL1evW=|85XL$gOG!e{ScYksY^5f<l2XEC
zPm(PpMHpmFJ(N~O<sIpbsXX!LdtL9$HTT?qozH#l@AsVFIVSs<nAw5(&*&Ab*|(eT
z7Y;xaaQ64|I(Ep}(a&)+%7P7GqDY%M%>Vq4asVLaNhSbbeo4QWEE@#E7{n<+*~!Nj
z>yJ}bSAi*Sfo_2+!?q~HU@MHCk%?w_M8y?G$2uw-7%wom({?VG0Y5!)Fz`47Z3Y3U
z8SVf90BQ{YY+V(@&({&-t>T9Z`1ZQUl${PUNjrajT`h6i#!N7t2oD^=4k!wlxHb(|
zwR2P}sdvnt*xkr#lLkgS@<V{xBsZ&uDI&nvhBBe4donU|-vn3JlYA!f@8oRWRKIO}
ze|I<_664vO=;}q{80&fKa_A0!H@`?Ej<wTFF11+D1(BPhHVATf@kYu#ls2P^>XDb(
z(FW~^(w8+L2z$N0qcBw(FH!ukHSCRi!u?RWv<vY~r8JT3?516*{6sx#&1>*6HUv+&
z2a^h~CI7*Xs)0z^yaqV5x6@xcgj#47fIS*)#(r?h5O>5PkX$#lKC)S}qHanxTJr9o
z;yRDKmrfs}2DKx2DZ^N!<{%N1&wAIRYgzb|9INbGX9+o`q%qG~@NC<dyU$morFM0s
zChabkdPXWljuKmAd3^UB*!Gv9g}pZ<gV*Py(?l$*JD#3299}!`wMC^O)0OAU=}MiZ
zD+ODXQqG41{;X4_aV?Y|@A{Vi7$|Od;3KcqQG7$`kzPa#sRuZ|A&y5H4M!lrqO2Bi
ziumjbf_Qd`tstw5VDGbjoaX@dSR!VoLeVsLR@f%UaU*oRqIq*qzmjS9LG{Q?%0^W+
zBa9$|>8RW}YTC1VxjDtjdkIf<cj>EQubQ8OkrNZx6d}*d!q;4t*_2?aS5p`bG#<8L
zPtQ^eMw>a)5P790qC#WmhRHBv@b&21+qa%q(1Rp~3cKhEVifRApT**k56n22z3i!j
z<pL=c9rd}cTImVq=gDWvG8?)BHSDy?KX?q~`v-`0l~;ba(C*`;lz;xqyP?tg7r5Hm
zgGW#4F*`nL%hNk_Ox6E(=uph~0QzZcG*bl3_-m2)GiBh$K&v;}Uo5wz7vcaOy*;9(
zTkij1Q;?+rmRr2F>fRXkZMk~cU=J6`0P*)r+3@P==Q<6O-#W#DdIl7<nXA37P%>RT
zTe^B0t5Pq((Gx>&ONX6*XgFVIzo|8^cq4Cq7!SUyRpA!UFsHNEp;kbD)c2SW#wBQv
ztfR;i$c6?f$#RJi7hX6d>pm;J*s{vx5K5Qu_H99ypHAF+PbitK%Zn157>F?!EYe%=
zT3+|*dak)?1gSS!FbR@Wr2GyJNNLvv?Xuz<u<^E{Xy#pWCzjZVkt5e%RZoj5Oloya
z&Uyo8tJr@!>%FnLiMre#Lu37@!f>1$pKLT+_(N3lnw-vXOWyVAmg}5a`Zrz&g`yQ$
ze2>5$C9teZZGr9x#1}oO`G@*Sut$h51LmWyEghu&?!y)8K;@3sf4UCb{Mu1}v9ho5
zV}HGPW#5tMY*KQC{fF0M1+!(O4Z-go)bF2oPQ1XKe=*2ImqX#!So#n|ODwmM^@@2m
zl;sMF6*@vhX&Hdq(+l0)A*uVwTJ`+bk;0H<eR<2XZd_yG!dVZIuwKX&CFe>O1=ItS
zR@1}d1)OLH2X*uV!)MiykNb=KpW~TxZ@rj&D|$ohbbCP#6gN?CO%vBky+z!xce;c3
z&ZOZ8G(9Eigi|BuBOOi(h_Q;7`e4hdeBgH&=l8<`%e347Ti}1d0uo%>hDdrVa9Z_|
z>*MEk*!NE!Xo+h<*6!lRcMY45uq52yPi~aH^;+Og%Y7p*Rh^>C-j9tqh6+XCG{o6Z
zT>&M(%mQEWIs+TKvZS!V^QEpSS{KhISp`~StU3B*Qjec1hL^bAypmI^b+g6)L;@79
zUehxa8wT6im_rOxn6W+6)XR6fK^Bt_lqzH(p4%x{PYF2@Q<|Z%&=1lS8x}qCzBn6<
z2V6itAvP>B%}XV**dTtBGKzB>392D)-mKODVP@0CUO2^t=fy}8<ZmD&6peO4J1kIJ
zgQ==1#n_;bcs7bV@8zj{cNuAF*4GQPZ4%SSFG;kBnQV=VBUSZ{nHnB1UNv5==;69o
zX>xLyC{3Lv(Cndk0>dE!E$Gffim_Up!*J%r06A_ukp$_#bW(yUM&RCYvGK&Gnc*s~
zkQz^YLG1#{XHY500sNl2Ha7Bp|IJUQ>bMyz_a;hv@HAaEUAk;qtEya#_pd#CnF97V
zF(M?uc<fz`+c%h)eYz@ILp<O{MrM1B>zogutqb2ajuw=PQn^das>8E~d%siEBD_Db
zHPu=OeEGopJhNBrE-mH5zV?qL#l<h<Ko!!K``0zce2H&E_38BWMI5VhuPbjNp=8(R
z^h_xDSe04YhPmG3vzR1w-SzZr_&do111b*Syq2*k+D}TU^vOea_RC>iGPv6*>DZx5
z#aPeg^s@0Me3nip0JQCH=QbEf!oA7PIj@+8cHeTw9#u5pJ>5~}F*GVUaC{1jk<c~V
zMjQAmf8&?B@dEuh&6(UIK|5xs?I)G>Pi~%W9`A~|tU7wA^}cJk4yj`lEk#k0Yc)`8
zV?n-{q<zJ`)5)whwGoza{c3Rm-jQxKS(J{1l$kYFMRWq^+ESGELe8A;vLbgRd9mZ$
z;qttn^NGRDB|+#SNsyWdZ{ZlZqQ*khDgU5H%miSk($BodR)i0GZ%)(E5c#|_I#5Hg
zs&#g*3Y75*DaxC;CL}gQrY-Ww%+Bt|a|F5Y0s&h5%r^rpYz*_G^3sh#b5#cTE<eVH
zabBMHz%T0sXw6?t%WWw_-yeYon1s*YGVClY3p>jy0z!brfM4Kxi~;{;fB7@%*X?DI
zGzrgKA?d%n44pT>%ltO8#rc?VlUd-=%N589TTB02@~W22Y=O6Auc3pN)o2mSIG8U5
z-=MdOm4knb>3_!lI#Djg`Y^8s`%`bC!+$edn(UkR<zmi^W6V-`AnR)2i$l$ylb)|Z
z#<O!NHwe5M?vj&qWnjkczZ49kSL@0}{^czAH~ezFX8}+5Hr?D?I9G%Jxf3l%XQ8VI
Q0POT{1l>$obOr$Y2bWk(?f?J)

diff --git a/datasets/common_voice/dummy/pl/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/pl/6.1.0/dummy_data.zip
deleted file mode 100644
index 44dc0a8e87e5e2530d3083554a752a35e7f71527..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4103
zcmeH~dpy(oAIHCJ<G79}x8=CCs3sFB6|MTYEyZwhtIX^mo4IT&w+MBRkh0`jhbY(P
zOdh3(lUpPdBG=rC)G5sU_-)lATXNd@<M-chdwjo-?_cld_w#<eKd;a0Yk>xVg#q?&
zDAx<~>EX*w1b_i96e7{@xC@qy-D+bk1c1_1EuGjG<&+4(2Ob9j0Qep2X0d7!HSQuf
zDgsCH@uK)5wre60+FIIL2&6UwiR7{R?-|Zqueg!N>RyD;0O&P?)m~qkwd$XM*592m
z<63`pg82Y|pC16Ucn6Suu<o9kWMAs1M#ZNYIL*-vzHQW#*j=7~YKiW3WlD#Rs5I3S
z<p+@U)Z?k5ws$V>5KBR#pEi|}qu<*=+}azvXF`laW2;eF#p`9)ImUW5DoF`B(wITV
zhD3KC&rX*NICBh6lTxzWj!h9Vu5$=_JcFUoI_8t;w_sHm#<Li@3ZvOcw9RcR>OShy
zgCxfLE#6ArD$m2K=U*fgNNL<?(|-|ROUt_D%veK|*>pIC<YeQI6Kjijof!6x-#>3Z
zL?>Ivuy?&lC!Ub+abkqLBeM!+DNXKD$(flMe5vGaH6K(a9t$lF#&(I@0V48k9x-x0
zUjEe;+LghGZF41WiRjUAJ=2^K^P}qq7yr)Do`{%VEM$<_!?N7%$$HW?30Ak4SeGX6
z-FCrKf7t7|UULvqv^xDyY3!Lz!J2j5(zh!#=&2r^T|);PHqC|WgGlScB%2VGYMIi?
zbUxaapmYMd_i3Dz`MiO0^5W?y)Yc~OBXQp)M<xSj8}lr75=1`(&)hi7Hwv$Z1ZMf`
zMz{7*en}3wdElAct;^!_nVyU)Bk)T`s#GeSW^z^{RT-ip-)__$szNZ2&=(dFSJLV8
zIO)PLjv^lTh@h%nfS=7@o3uA{e>XU+FB5ecsT8IiGbFFaH|6^-;kMZh)guFW9V253
z!CPL&PCwaGK8mwzJiT*5%J5($bqX3;z1HgBZF}EemuT6uLw+GsV#9;w-ahu410uZ=
zhw!$I_`BBXX*1Y^egO)9D=+Xb_uVwj8Dn)RZ`z90^M2N)!+YPUSb{st4)88tyU(8T
z?@T&Ac8{k5O7!elv&K68y$|~rj%Yd#%2qbSS+DCctj*oRxIyaoy=gu&b4*S_0F$`(
z=N{FWGX2Tj`PpgD<J*sq-W5vQlso1#x2JF+&#fn`P&eH!)la4DrlSWs2V3CRN2Mqx
z?*{l)FJ4Ggg@jq(dUQe~%lUlV2R<}7Z^N6Ez^p-Ye)YZ_A!i>ZgyH4he^6B4czdW~
zI6bb!ZjFEC>pG|K%&Id_jRKCF3FQ3ZVKd^LVmQi7I9%_w57!*-ayt6FWzE0`?{W&c
zqoize$@CS6l93PYqf2E43r&t^g{_nE7J^!Jf}7RaLY>UtMw^)36Bf_`P_BeUtAM@x
zBBZ0X4U)aX!`qA>JL1%7hDngzjstYK(Kya0E@KBdQ4Q_}Q&H;KFV=j-S;lNOhF=k8
zKXYK#G~ns`;CoY@hzTY&J7(Ck^eA&|j@qkuc^>)^x-1~urZ@%M%DaGMyf68ycB{D5
z|F!!cYp1NYeb)Jl>HUi%hD3J5`}{~eVsj}GhLjNKOU!ZZ4tdz0Y%bdfQyi|Em}5e^
z)1dciEyNyMpEJ{X9<dn-*@<*ISSc3=ln0-Zr*uFw62+!T!(9qTZJ*$GE!dviqhME9
z->a(F3ZqB{Y1MX-lHBnrjB@?P2M=(yU>C&olZWC5TO4yGo2=@k11sN}+S4}}$rNR+
z$(OMq_d)gx2}QYqCERr_j6SB|`jpmUaz@YpK?hU>2{57sHAZ)h6Muxusnn6F#ZuKV
zY_fPGe9Wq0dSduVyRl*dX(OxvbbOGxqoMp5sRj#U7RQzixZl^ibHrgQbpA!f#I8od
zrMval4#_6{Hi|*@M>iKYw7RCtop39b_Et~C70}OXCs^#Xgeqrw7F(J;AVx|`Sv?be
z^-$90r@rJVNL`z~V=RrIdeuKWB~aO?f2rt|p-Z~=ser3w2Sc&)U<2qC^TPTWrcR(8
zzH%ZW8`^$)96i@QCCI%Gd!k#4y;)jHu;RjY-uI!q=SnMGWjhk%(D718(R1x-Rx`jY
zjdGXzBTf}-F-02$x23+*Y`MPqj}!+BSQuhg?E^1a8iBCXQdl#hPq|uxb1iORnjSqD
zcQ@)@K}}&jT&AktU@4%*^ttJ;w5?Gm)*sSEB*5itvM1sW*ZJomu5|J{7>i@#FK6^$
zPLA9VDJTO@GH<-v5Oe>7rmj)sjSzZMD9&tyQG?iipf~zF_|<ybcKLX3S&W8kp6=Ui
z7fS5}5%a&HTZaOVDULBW)ll^HLCSd0(~;I(=w!?L-YNY2)WF}=rfI#-H`n|&F8U9c
zl7GH_ac;I~_vVb0PG-e(!RgtGn{B3hS|34*4#UABlGS&j;hoi0dvl$FsCJ{Wc2TE5
z$o72%-+VqX+*wC^Bf0=uGz4xv9MT}(tkkJaoh&gqy>Lh)vR2z^5NqZfk+A@kATVZR
z{!%?UHkMDlI@xtNP$By@rK1!!Y4ZAw{TvD^8}z2U{0&jAdCb)>+va_EFh`ta8!cxh
z*5zH`FY>yRxI-aZ;}z+dk6#hAS_qf@!j}NXQS3*qNDlcN$^S(`HDI~mBI-Zf1^*$Q
z|CqEgoG+8cQ5igvK1FpFj2+d#yR7Bt$Q{%_!&n;;$UMW2?;L$vby+MJM{H+NS5;@3
z${koasXi<R;F0<@zW$iGGOTejM_FHAJTkxW8y5LHvCRU_p2g+Wa;FSVaxR2t@^a?*
z@p9Qaj{A7zgt`jy4COd1d82Y0IVW|S5YN;Vw*F7-&s%<_$9!h9CYd$L(jq*wzjd6|
U&=3K3X<^pi6AS=hEC2xf7w2-!X8-^I

diff --git a/datasets/common_voice/dummy/pt/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/pt/6.1.0/dummy_data.zip
deleted file mode 100644
index ae45552c8dd719aa1ad69bd1fcb0146540c3039d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4130
zcmeH~c|4T+9>*VRV<{@4A!aPOGM1q<QX{gAo#9-|6j=rnGsKKbb%oNklx#UQhQx%D
z<`k0B#Ic_2+mtNT5egHM%pp26>W0c`&Oi6B`#i7TGtXb&_w)OHzTfZfGj^7I{K5d|
z8Y&DzFTH$zhyb#HHwljqclY)ndT9RSC<O3jDcT?AEK-;Vz{Ahv0|5Rv?1$Z&L6lYp
z@d{P<3JeY+g{bRlz}2<2w6)aX+UjsPm(g=_Jy(6A3YXD?h@Svr*~qm%UxK}B5YV0D
ziMALz7AMIA0E)Z-pv66a80>)~Xb?j}mlBndg*-f$j{G`N&B@k;8i;7j<98CW+1FW?
zkiwhC5*y$?@nBxw6g{+9N5X|`_n7n)sR-NF6a$%Al$m?3ufUwX|ADrsnwHyc<qlrR
zXXl@7A=D6vVb?{)%~F_KJ@Vls%LB-8{nTRN2f4qQzYXrF<A+KG?!}kD2!dCNI%Hn|
za)nf7ov}YKnF8v774Tfp6%oSv7-Wc-D2?1WfIbFSy2|qu@b#*-OSNOT6b0;elz>%J
z&=EIek{KON%1r@hyEL!)30|7Dn2LPVTI2A(L`-GAk5!N)X3>VoI&Pp=Q)j7scKxP6
zN4<l;m`S_jT|B&i3>rk3AA4Th(en_y+h9CUrT)abCK^JnNhF*#C)=g$^hR;-^oOZA
zsh-`O>RC?zW;(%;kDdOH?4`kd`n|ECKTdu#zFJGvv~^qp3<16K?Ex#i0tuSfC-Pvl
z1C+NF<hLZ(?vUCS8acq?IRTMJy85(`D2f5j6G0`Fm!#!-S+EOYu|tX<6A^GLisD41
zF@AHp(cPfjywcjTmk+EORB=*!Y~4)agtw(qZQe6nRq*JR(^<TGilS!=QmqX29ku;*
zq<@FH;tPuJKn-2|shx>7T1Qk#6ZcVcow^;lj`A?E-*O()9X1r)@Yj2m8CPGF=z1r2
zA)7Yl^IUgCKR)yF^wb<oVaw4plL~r`mr0dbe|VLeUy|LWX`Zobf|!_-W7vp_0h6b2
z+aU$rKKpFHLrq&XqaXFyc5WVWS=5~uTu!xlmM8kv#!6dHcABNxmA;evREK&5;MgMp
z>-{C+oJ#*rwPTZ-&-H1E#<V`&aj+^nN6GSt0%{#!Jk>|^NxFxBN@#~AIH3Y%^CIR%
z<zTCsn^gb2*TxXLlb8HK{VvV(VITYsX(}>D7R#vl<oq`|`Ox7PJN>;}NyL3+Py<+q
z=#fkjNuj;Y82;kvi0#I9BU;eyF?N}>@;p7N_V$hX0u(*Je@+~9G#)E=%?I<;L^DQ{
z5F*{$lr#}cl6bk`hF8T8e4F85LNDV@h!mr~UEZ(r$=1G+P{g!#*)?kwq0G*R$B(-P
z7G!Zfv<Q0OX{ho2;QUm=zi^l<4n)0PBI#=BL*#m}mSQ3S*%pJ|)pAxcR)OKzZko71
zho0D}oNj!2msu>;1cWH(AMi>l&`~Z$wfoAeI;D?_G>EnzHyuNOfTKs-UmvV<I`EcE
z4HFRdGGDy>#OLydvx^C=`;UTcZjaM6f&N4XnoUREvxhfdw~Zay8HvjwK{|lqkClr{
zgLk=gR_dwAy`DsD2=R-QYYeXcOZmT*Qf{$q=p?%}5r6Lx3MBergMXwYKDqgv41B3A
z9V<PdqwYUZ`A=2LPw;28SL(bUCZnTeqq|<zDXimppCTiVLQWp1Hp0e^l6wNeDl+~J
zaxrhh>-Wz?@4r~QwWu})8-}Shc0<}P?)=#%Ecq76?aT?B394EtIaB26Z5k9e_4|Wp
zT_;ta4SUL!;%oHmMH{+Yt*l*iwnNUV8mf`}0Ge!J+RNTF2fm9H!;V^NF|o+U&J<Jo
zP)CTh-1-^p>(&t8l+I8}cDE}!Hn4F^)8PCcXK10D={b^x3w`cI1N5B!T0&XWj87#Z
z<b-H?_~dn%?VjRo!!gnpaY+~-QR&=Mss}EL+m|b$YRJM?E%mP2AyueXMxbuXN2RDC
z7&zxNxk620D>N_ZP-oZZB*ab~An(J@$Te73x8;*#4h_ANFA>~THYvqyENPrcbUKb1
zf6JP}yWPXy>Kp0bq=9*MIc086W2b?8H#2@qS)&z;HN)y*4wfBy8Q+@qBz&kHF?{@_
zOn9ZQ!sYM}Jj*kf<2F=Mz0X*)g`Q+DUG94pNBDLMuhF`A_7wiF+)-EDEp|Zai>k6w
zkMtvVft?4b+57O@dYxZF_G`R|dd}DIpV{O$8RH`)FVgcrt_*Lwomwz*bV1MNQhYsa
zVIp?7{q7zYIpH`)-aWBj@82lIxW84;WibxkMkz}qTm@fGv(icvw;;sr73$t^<GiUp
zUSki=1XSq_s#Tt_${kjcF&*4ta;I=aI^(=IX1eiL6H^ew$50K}bXWQOt$%<YoOkLw
z_wpFUX<qx?Va<if(RxZjFJJyE)rGLwIzD-$oU3DWrhI|BM6kW7Ug;>JI|HY#<Q7%M
zaK)UX8zqyCrm7B4Z%r`%+1#iRNZ3xUW!_af<4Ku`aSE#TJp1I{Ks0=_q(x*y3$LDW
zBpF?byag4LEVNDL(<zf)OyIwnW`LS#Wy(4fH>o<zPS8RoJLYFc+L+~?x}oDz*pE{m
zmJZNQc9|pIO1r#^d+|O?hYMWo7diTUW2||2<@na}=hC0~;sDc-^Sa8d3w(0xe<Pq0
zupDr~&}4PMzi{q9CVlJMmq|>+TrNpVE}jkJxcKjGYxyKx?cqPc*bN3yqxjiBEAq7F
zw%D*0j-E|jQ=Mh%YU{d^dWt>9xukwEvOi{i>)}>1Ie*N!WPUYE*yQiHIXg>EEiUhN
zwcA)prVDUQUiKb8-Y%!(R-cP2p>gZDhORgzxudQg(krRKLR?e78QFiuUTemDVzVom
eUCNpwT(iHPIBTH=1UaFF**Ae70J7Ks0QeKQaMq6i

diff --git a/datasets/common_voice/dummy/rm-sursilv/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/rm-sursilv/6.1.0/dummy_data.zip
deleted file mode 100644
index eccefac1af29da768c557f35fb3d6af4f505ba82..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4041
zcmeHJc{r47AAT`|?6i?Ag-8v?BwLNGNE6vwMV66uFlIS&R8C0Y8`Fg2L?T;C7$b)G
zQbh7uipFxv5<Vdz%M{Uh=jh5z<<xwCfAe1R&ih{1T)+GG-1q(bp4$e+2N42TpTXOK
zc7H#7z6k>x0W<-FIeilCig(pK<gg0hOI5OUVSNd~!T=9sj1K@HZ^4_*iba$;i?|bE
z?g2r81RQLq1{|iXrL6^nYs291pR9f(!;Paw&7Z6ugn{7*K{#(e;u4T}8giv&tN~Xq
z2t2>xX19>*(r&~90KU8cpvOHBKFHPEUjvUL{!JVu73ngUi2RzkM!N@?8byiFE9z>B
zm$dlTttX!h(StS2iPd+Ab*vwD_Es9pqV(ElhT0joAWspE-^3<n*Gv)j+vd@o_Fy7B
z-KYgQqea-Y*wfd~DW)DvIZZkNk-D4gc3e=^zo4n?slfUKg#;=?`@PE?PgQ!jSEZ_N
z_z?+<k?oNhNg5E+ErjGeaG@htgO}`~BT}wV!RX!Lwb^Q?bY1Oxq@t+Lgt<W0X3+=f
z*!-!g>?lD=C`@0oj(+LA!#n$zD67_)%mgTql{(cG)9&PqP28$|xSkf9tM&3(x5ShE
zdZgUP1abS#ySfh57Q{90h(pygFXU~r-;n$wlj5%%E86EUIB7Igm4y_B#d{6?A~e}E
z1rKZ<%91H74$H14I19IjBxR%eUld1=b|sqo&(Eq&ZSEmj7agiqZdG<vKX>mFKg)N^
zf+haxWB(D9Z#5{`PVNOmdk}vp*TFwA)mjqshjUtZMbW$a<;ugc<%TM}+Z(6@_yu+2
zm_>Y8w|E?2Nv5Cbo}*a4X&JvE`cSq=k2!eEP}gJzH-EOT_EoLy<R$>C_UO3eVflPV
znQ76^%N{9L2+^?uE&l(OvcBXJ1gF@ls}<xJ?~zaou9m1Q?ebQL6{RtSY<;E0s@u1n
zJ{YZ|5?GzQT{@X!fih_FiQY6^W8G<K@1S=c!mt~;?QiZ(de_~QF*sA^H5gh*yBGiR
zqBwc0*{I^UqW5%uas<xokCPoA$XT-=D+FQj(#&L~#Rbz(w24&50t^27V{b!uHT^cZ
zha5S@P<AnLjWp6d%d^b2WvZLq^z8H&2)H@cfn4w6o@=73pEo#FJkVblXKjw}ZIH_-
z>l!VQ3+;Vt%7Ba-?^a^{d7MLjf8svO@psb7)^P%xsC_`N>jx%k5#6PfV^Ei-kr#05
zV(*^3E}fp{Cwn8OlolfB)Gj?}lhCD1`!UtNChfa&o3B57W}<jMsn?=VYdV&mCYPR^
zbfmwsRKC%a9_t*`#_Sgw)p`)-JkoT`0{zm3FNE(g;MZ|X*1#y@`D26DgZZ>|yCSFx
zd2oK^?67oMnKjqyu*jx=)!>k+s0Nj{Oqtm0RY%VmYUDWMF?U}FZppK2hYw3d>ib$d
z);3=hPfF(Xi0>HRFw?^`KP)>z=&|(-4LC13^K8KLs*cWNZiy6i4RShxI(Bq%;3Mxc
z^Q^JN&b%)7%;P<9_%EcdG?o5O`d6g$lyt|7gLYT?-_C*nyq8DN4_S}1Vp%<M$?6|>
zSkl}c`M<T%Cm~knT1Az}w4}&~lRE-|io#UbebsY%M||oWX!HlzwT9wt<URMTm@{ck
z12dHi`{-o4O+)c2;{g=BKoCJ4Et+@_d41=|laNemEHugsu|WY}6OlpN>AU(4pN-0}
zAK}1Z!$P9Xsu=n62$j(R^DM10mkN}DsZcF{v;3?qB8iN!6I4gby2DeIV(gwmG9dX4
zMQ1A=FK1&;ch8r!e?ZC%R^Y6!B_^-+B$t)e$_>ce%6u}?^odWi+rUp?-&Id?$L|iZ
zyL<vv(<7x$+)sbi(=XOTFE|i&`^C+f*_(xyk10q%e3hZ|wteRDd1<?Q2Q~k5kkj?m
z#U4k`fC6}e0wB3}y|@3j6K4hUN0RT6Hg1yG>pr_}PJjqq)8gQ#|4Y&2O|@v5%(JoS
zzkZz0K9ZQy$#Y)#M2&uhLnq0xam2FDY(^68ZaXwJ+a*4%K7PN%2vr_a-jJBJ)lFXa
zc@YApeVj?CDfJ6g9t$iH)Ay2MyfLFt4r~#m>4=c346Zwwt{cOxf<qlCb$-Eb_O8Qs
z1)G~oO!#?P(4x>414%J7m9Qu^1VlRxBX#9bnCt30ug*9zNAD#JG8oLWCC<OMH~P2(
zJV=J>&G81))|l*!{mCI&xsT7twBl;uq+c;@+2zxh_IH*prWY5D1h?(n;B;p_b#9&>
zvhDIh)pn$7+iSwrJ87HZ21G<mGp<cNZS0L%Ew|h3DH?kU&jUSWs+JNB!M<@vH(`m1
zMWSf42O=9OBs(E<hCyKT#aCMYO+%z3K0PEd<|ZN+#wzH!v(~NQC7M;Y)qE+8oCEEX
zLx-C&Bh6<D#r7JhmD0pb7NUn7N?r5cQA_(qhb*ItL$f0nZ@<=MiN2IZS)-0Ujqc`N
z^v{xe2XMwPmhRu=R~}w@zLjFI^yfY#fH4!+BS$EPE`?$)3MvE31s6>Aau)oDfc#^m
zZ{zVY(wIrcPa<)IWe}7VmcP5a<>bW~m6t%l%?f1xCIbGj@wMXe*g)BWGl;&TTFdC1
zxsM&a5}X!XqJK$&KgRwx2eD&sg>a4i)r<l=_)a3SL9x8Lyn4<&z}`Uyl<N+b)4`9I
z&)T7!=OR0}v;f!OY^UVUp>rMz?C5_8a*h7Y5%EtwtTdyRdH|gdTAu{|iA(c+J+W45
S0m{#U5CX4!&}C3z*4tlnySBXm

diff --git a/datasets/common_voice/dummy/rm-vallader/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/rm-vallader/6.1.0/dummy_data.zip
deleted file mode 100644
index 3d474f92bee3e3cc3495d138b69e7c03e5165215..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4384
zcmeH~dpy&98^?bPX_iBY<uIoa${1M+GfGaS8AUyJG%OU+GDawolEXyB>%Q?g-Xall
zo<p^p9Lgz4C^;8H)YF6F{;hgtDtGbM^T)Hj_S^Q)^|`*^_xE#suM2Mm;erCJ*WgVL
z!lxf!9(;fV;7D<E^RaUzdJ<I+Sgi#hiE<W4Ssx0O58&WpLI8kk9DLxH4T4-6#KB9&
z!JX_uIjw?0p;gq?)YVkb>MCgTFGf#EwO^8=+Al^YyQz2)U0sQeB=Tn^fh6p5>ktN)
z&kMAt*b^EaC#K1A0Du-J0PI+Wk|&wyauVfv+UwKSMJHm9&R)lUyLAuo`sNHZn9fD(
z0Z4Xqkx7h`v`o&mQLP?h7ee|gS!SsFr0B^H!LfoO{1Q|E>*GqR(SSqJcXHQaZa5Wf
z$E^*O?DDL;<~1~Rde(&~Q-mj5!KI7LL^KT2Y7Rf)nJ5!VD{%aX3ke?|>J)gcJ-E(U
zz<yhOlgI_HRu_1%A<ZD-m>R)ge=BbnpH%=?4_~M6O~EpAcx%D$XSkBr3}>s^^0te=
zqVG|nxBnTlN0|VdE*r}AhO}Bend4*dGGAn#*qM58=*~&qGu>|_)+hRyYrp856VJs<
z=s^Uo=%vd|T$?b;G#Y%<Kd2eYO?diJERrGV*Ca?IC>{~dkU!Y!#wTQ+?Qp(uHvQ53
zS+)O&G~VY3EI&pw;J`Z79=UN=dzOoJxE7)CulaMk5C}yDgtC1VP#j5KKSdISCNrdA
zh9}5SRFq^j{Js$R++&2y9b0wyneYznn4omiV$m>6jO>wT+oCIxAq&CsmOZuc4CtKU
zHW8~U42mR;l%dc{zNsg_7pagHis86KSE@gczht*hqagzjs=(Ymze(%;e0Da**i<~1
z@D`QQez1Jxj2;drfc8!IyTh19q%i4dDnI}IX6}xzkyPmKA-#LmpL<AbO_SMa)x0S;
zVO%y}VPjz-XRMDc!b_vI<YCjAtDIxr+p?rM#kC)v*lcSWV_Au?(y%VMD{d7k8pS<O
zX3_Oh*Y2ikcWkIyv0t?bJed*T(Ab!v)%F1pt(q5Hkg8g!IqOEPv#fhBuqhUKBB$+R
z0Ty^Cb<S>)fJve~!By#E`vs7IB}hPb6$C&hx_}jh<oJ~X%N9SdMg(3vQWz~9UXd}$
zZ8PKU&CAiU$K#yE%uH!5VPRzaP+CL@k9Vi5`#qD%Y~Cnis>QYExmnJ~>G81#86nfk
zk>*qxkz(c)J<;z{et_LPE_~}ATj&JmZhykwEyTV`%|{7XJW7)%9S->;$&K*Lpa8?Q
z!73jY8Bg5zIP&y6tW5%+Pkx^2Y*weyzjm1_QTui1d54^k8j4dzijfeTpfIz*xl$|O
zfJ3&&6?KySs7|hTX~ci?o1_xa`$l|K_ZSV|g3_js{Z&gE$P^mLsF_pZ8|cWfwUsGO
z*o+-4nriXr*4|(2g$t;mSGK8o<2sCp!6Fs&voeU1k!+%=37!|@G?o4|&Y-={MT{p#
zsdS((sYQNEQ5~EVM#}Is59*MIJsHF)9L7`9<Bmu3D*=JHgRwS+-ZIvg+ofls{F#rk
z3KobjGQ8*ZR|R}<Z7Au#kQ3KZMtR`AJ(1s}LxD!o%QMCfl+Y{cFIytqbHln7==5I1
zz3~2Sv}U&het|J5zQ~?c64}{nt%AKL>9prp?8{Wd|6~83!|owE6m|);yu~VSLU&JR
z68UG`{Rs_jl4zJt!+{%t@YdHaaO0W1fqJ@p&Ewdy;lbDHV`8la<|dnSkmSBy<WPRD
zH3LT<2xV?R6lvx_3F<dtLLL~TJn9;(MC5O=twKpD{bqQCMC=q6j4*GvK935bB&_jE
zQhqM#9&@+u*wvjrQhV!RoC=HwsH3Oqcy9kicu+Xdh%WEDE8lxB&ZqvKjk;wPR_r#4
z6#m>7YOAo}oNSN)GGtSPk+sB->@Mf#jkl)hm9#EnRM&@B{1x}(-mAx<I|+#4KP>es
z{dXk0_;UKkkLHb%5=3p@c1@d&iZ$^wnKK6(aweKFamgyyy2_NfnUBpdDP3B<a%#<-
znXztgW>n#Qsmsm@p%7saVLOAZa$2W+lrco(Cj|wX9_eDYwkW0SnzBS`3CU=yTMq7y
z@2qL`(ONH9KN*o)ySBww+;u9`STjz3UCa7~QGKXP?F)q8_;4As?4e>su|GA>xrM?Q
zZN>BuJ3LXfJUaOm5BqnD-JN$DE=%lcsWEJvs9Knqulm%?E#9{<sx-+0L@xrO*IR|-
zT~2;q@0T&Y|CI(eBpSB6i7ae-bY1A|#a=}oVUt4fk*1__Z;YhTORo5*t=0A~gNR2I
z2?uuTUb=p$XcSXW+W+pjxtpULx@w}hHlWUhx=`@3ybpOSI$}`GIL)+REI91OiNl5d
zP?vDrMqjfXyn?XHcq300qI0wnO{fHEoQ$#W6VW9NQ%lfk#7DyL<hL@CTX4aOSg(-k
z_6?4vCh~F?koT|T>UYeI5$vsRI_;V(8~^tyr|7P{dEegrq@jvHv%IwC#Evs+s$msU
zbDq7&H;FbBTx-Kl(zBg+4AF|6`eATgJFmFSV%W#saBZ~~;O%er6LNregkU<<A!wvW
zG16adM7Ug-rt-w*27y-{YNS{nbtSbyS@m^w;@i^F0Y|FpR=m(gzpIQMr`^@-&oUZH
zuP;#Vxa+jNdlO(2Up$JFZ|kIt9lt$IeumvYJKZs(`|8uZ{kgYi)o%9Q9=i&XUpTtB
zFYRerFuxlII5-iI<)#YwmqY{sratTEk{N^h^Ng{AfJk65;75JKr2+q8-uSuF_tVCr
z5>r3*7nPRI9-t~~_W0rY7EQNw^7yO@dJsqpTnB!!ZP>EwV^?LHLqPduc`eE>?XlVA
zQ@|>)Qu(hP_s_+@Z!g)!kz6Ym|EBnWDg2<(#GA3s>*D&Cb~)@RNb{^bg~gud=NrKC
z>C(%VU0sxS<?3u#@2|-(E%@y6>p@4a6#jSR|DTdru2y|c0zChq0lN8Cp2WA6Yq=b_
SLAn3{1>Z&BQL}=v{{0QNzdHl~

diff --git a/datasets/common_voice/dummy/ro/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ro/6.1.0/dummy_data.zip
deleted file mode 100644
index 292c905b122bef413fe8303d22b5acab4bebc7e0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3976
zcmeH~eK?c*AIE2Ao;6Y=ra~F5P;(S*M$XUE4n`hQVzytDv5XOZE1{z>l6fj4=IJOz
zQBKq)rz46`!tpFRV&tS%XPkcb(iJ;$tbcxg{dQlwci-2w>+`z5@6YG^`Mhs?8&QZ9
zi2oTaq2NDVd_GBoR6)dGGWp~YqF0dD&I3-8AW@cvgD3wR94ZYGfpA1YAjlMOvR^TX
zreF|n8qzx;kP=Kq8t%|X8t575A@vQA`uajf&&?wUJmNMXqXz~o0M_K7S9*L!VAT?!
zXSoD?cEa`=Nf8i85)1<A2@ePg^z!xJ5k#eZ5-OR6_WUyy{db`rU%>s)peGaad`R&&
zsPV(`WcPA{dp^S$l5h3QaF=*`9)&sN!Hz{8r_t@W9X#Trb4~gMjh&P`m9^43mRGNh
z+=Sn#bPX5{kx?Qj>0B^e594iZUT0oF4|c1GIN!0U&23|XcwV&DHY@^pXO;lBmn~w-
zgYC83I~(1ufJA6uWL7HnH*I!c2a%nf<)Q;hj@fk|p$GLfbnr2UJ2Xfbx)yHU|71|y
zAkyn_YFH=Ea>V&xlAX#WbIlbxT<)zm`J-j6RZkcu1@$)M2+rf`2k(My)Qj)vJv^;e
z?O(t!=?DZ@T&wtKT{!gP6v}VU<s&{B;%LlN7vp6Ubx_0?6Kdtsbn3}}MEMo7DEVEI
z?nfy=^0k2EJA{{<NTRJOc<aw(Kyc{b!3?vts<RdF1LuirqShD3i8LD!KN#HOT+|AT
zvOu$O*$hqknYm$uj-Ce_x}lsXPVE(xf^pT1vkFJ1`Q@MP^~^4&GrMWi0~XQdT)F)~
z%~*e(EuG>EGZM>yY(bRhyB4ByQP9iwnl~j3a@zMMr+BJq`_2yLuJ?iT<n-n^{;}I_
z9BR`P4t}{7fs77@%UEimw(3EnepFVFpN$JkPx>X9x@RCuBjIMewa-XxP}&_|E{#~H
zmS7OT+;T#*4s@5fcV8UDJ<p{e-IL>8N}d{yc;$+m9B7}r$dzN7o3JXz)6Y#O8kZ&V
z>eO^uMzN6aiTl%KZy%ZFHy?SbzNokri-6ZCLPZ|}6qf)L-z~i2G%r72pc{~gUv%de
z`Bw^^l!Rj_>H~$I;rLYYdxUXItHOz-RE`nOs*i~ehO)a`u&xwlMz}o8@18EEnI4w?
zwyGPw!#!9$ap#nN4CmO?=&cCOeRwH1fR~+GHdb2YVysruf(w~w`tFj;b6$#FQiYF4
zo{nXkDuEHBX&9HW2jPWFawq<zjr26Pw21C}c5t^0OurI;ancQ}^%hqPX&iJR$2dYt
zH>bQXx6Qdpd1$8TwjXwEUEw@^`tGp*v9!t~_l~-r8~i}e;YA!gJUdU`^Y6|#q;6hS
zmv>itTHalMbxc+F)I71<e>$L5?s7<F%M|O(Q9oUUq=FPhmzi}4b9RBHSO$C4?>5Eu
zA+@9iubscH7;AFO^-^FFz8tsBHU-h3yrC!``cQ`YXv{nlBhRAsCVP5sKe*50FLfC`
zPw1R1*D66h>SMiA_&PYPr=l;BF4zBSv@<KJZ~b5I(ea5hYliF=Gs$mPOEbAs_)G?o
zs6k(nxI(x3e-i&!Nwm;_wDM<;__xiRfFK`I;A&ZX+}2FK&u@X1dt;#5;8!F!(b;}k
znate7VpYUO>k2!Ual(P~9^umo7*mBg(;kJFhWXlZe(J-Mk-M(nQtPPQjkegHIOjgx
zL>N`QEv5M?i=Z35@m+E7^o&+ZR6!f>cd${ONuo;pb)DaHQX?Okb{gfMPVt(^@{mE;
z4<0QKyP<L|(fZN}R+e0|ay)hu_jRwn3WX7R!ols=NXuU&;PXBr8m<ze#)?kdT9Z~>
zgCp`H6jKD7Bw^x1%Mi`(eb@gec~N;iKEmQ;v%+-xMvqa(bNo&-Nf@UMTb!rVpS|B9
z?tL11jWthwz&)AT)aP7&N<-^p%GT#)t*Tlq@(`!}@TtZ+5y5?P%EJ$~7m(ZzklaxC
zedFuDya%qZeI8%<G<(h-CrQb|RA<Yz!efKRWe>8aAYI9uZPG;i&pRP1n$(i6ZxQ$H
zDBNTU#;6b2RYT5`q#6yim8z=G<Z3uqk7r78WIEjEa6`6}w(^Kwlu|@@vg-DyH7&<=
zv&XJf*C`n#woC8Fe?Q`LwKKM0;+~WRKF)zrbc3d;r7UettEv7$dJ>Z9(Crb9dE&?B
z$QFBTDv#QhAEQS*Eyqq)*%ze75WUs57W-QfZ$pO7L?(k8m4_&_lI%;U=N1NcOrkmt
ziO$T^M_{2=`DipaBuEUd!c|b|9eG0!ACsdy#TadEL{gE2eLYEBd>W6_0CBu}$ZW@z
zla+9E`Q<JQ3M)0^U2j9#{F+(UZxa`Gwl~S_1?9D4LCpJ3j||<`8`!h;-^FTuxc5#)
z`mQO1T%Y}VfS;)aQ_AmfOHC<C_)LAylW%~aE8{0?nSB)j!$nte!OEZcFc1gJzZCGq
zVhf)58v<&A76Z;>Ed&GpgFjxKwA>vpk~r8rAxWRSG63Uy<*zPl(Y6UX<pmh9qd@FP
z2=KE+rxlk4z?S%CfVyIK7O8?}aVa$f*vdjuzgWlBnal0cQsy}D`XeOsZ;b;Wf5kZ2
z+wf~~akYZueJQz8Olb1ruwQ+-{2eDaj+R10#f6408BW4c1v~Om>V$;Q)MfqopV%w)
lm<2Xa$v`PHfj@m)g1>Z}mC#~PerPG+@P`0z(9-<7{{m$TbeRAE

diff --git a/datasets/common_voice/dummy/ru/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ru/6.1.0/dummy_data.zip
deleted file mode 100644
index 032973e097f490a7ab00002bcc0451ff0afebf56..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4811
zcmeH~c{tQ-8^=dOg)mvh?#Q0ZFq5e4Wh@P{hDx@v%&|7cphA{H_MHg{MM!v!^++gd
zDoY5Hy^)xbEQP$I&UH}7Y0ls8J9Eu%esj%r-Jjq6+~4QEpRYa=NXr1&cuf|#8-Ba_
z^TG(=1E4%zTzo81)>vz46JvS+FhkhDYUAVS%?O~Oodp5_v}DRff5#vq+k@ERAhvE8
zcTW$9925qD%fMwIFgOGTqcVC{w#~Lj6sIye#`8OXB~y8)$7iQxy%+E@%f?XD->V{z
z1^^JGBql>W0E@9kyF#%ZxNi#;m!WL+B}w_mh3YUk=i05xz9w*Hl+O62;_7)_Spmxs
zt&4nj=+{Jp%e4|h%m7X@2cXw`?HLa%j76k>Y*s0qs9H)}QMRwje2FWU=s)XbS$gPb
z8T>Ks-K&C2NYlc+2c`~2&%-*<CNHj|Z9%O`gihp%-ai6Ygmkr<?g^bV*iW*P7}wFq
zI|*=V+%s&Gf}M5BZVL@hK$^$Ew38UR;H=v0NXal>?t`fT5IZxCoNIxJliHzU$L=Zi
zRmBX2O6W9XY0<LypjgLJ?e7-l4@B3<$PUif3w0!fa?};}-gqTlUcypd(TEr!5AiDW
zyODjZqFvnvAI&O_T>#DoR4WYbTDA*$^i`lU-etDCMTk{PK1;LbN*RBbH8Xjfomh{a
zQx?o!;%`n{@i^$kh>3ph9zYIo`&hALG{QVZj2P!5)}MymIg<B^#(6fo=9Z(%ki$~1
z-=0@NpE4`(NHK1QL8s8ttL}-MVJ^6tIuk7&24BLOIm-;Tv}69rX_6DJyS8L1_mMA`
z=xP|3qDt{s$nrDG9-_`ArQsF+>GTipha#gF30H!w6P%6bZtYnCZU}l)5e9_LA!mvr
zQg;CWQ0f(dvcvsS4>7uWjWVEXeQ(&00rFYbMUUxDEayh{$eP@2NGiV?Y*uU?a1?P*
zLj8B%BAJK#$H8G7mr;le?mGFll~0wW@#T5@QuA@@Z~FVg%26;#5x)0X7`jMY<qpqT
zcp00mgE~6$6x_9p=Ny*F$yqw>P<w<zJDQv#Wv6kFv#6(#eZVXUt<5wksY}Ryib^xL
zvu!*ayVohjCKYp$M`1Qt2Q>pvK8z17OQ;m%WRC3O(Tg|4AiK>R&QEGLm=)P{9As0M
z_ap`n-!FsMhX*#H<8n_hPahQiu<!CZdws98HR-+3(S{~zVee}KQD4Ch^2LqQ0@Z@=
z*18^81&wo@Il6q6S-CTqM^Opd(8nUH*l?hsV!ZXm?JJR9EFm<vMZ{+F@C1&r0-z-P
zT#SGFK_!n%^zJ8C3I30k54;2~<__`CKu>;Pem$q48cn=uX&aIvL|fB5Ql}%d+<+)x
z>lb7cn)q#?(P#Kp#6z0Rnrdg*7<Lw%v>Bt+^a-V=veefUXYGupv_m`8UzOe1<$tr&
z?P8IxgEFA&eNPW!uL^QKZ0+W{(t2cslTYE*z%wC>94*GO!Hjxc;bsCWiD81{Qe4yw
zEy4hjmpZDl?x=iiR4|fMt0rP3zpe^}X^+RW*bLO*pLUCe#lM?7H}xg#Q}@xk+!qgI
z#_bcYXJW%GXr^QY^aS?F!i_@mB<7QXJi|1@V*3ci`_fRxyNe17;vJekqJDRg!akVd
zCn=r)u!+*v%X%cxFb*9@IORq#o_Cy1DAh?(t#z<6o&cqP%G6GY(1<5{>^ZPtA1*K+
zII8FY!(-y=NXhw9^J42QA*Ii5mO{NqCa;aFT6GgGKS;=bhRI6|WE9m*L*JJ;Sgu|0
zyI)LbRe`y*RkxkCc&}VBiToV8K$^}Q%6dYB*khb+d5-)MSPdDz58W%Q`XLq4!p+r2
z*CXUj`Xku!50gw#oQ#-{zHts0LGJ?Z8+F}U{iajz*`@>|BjI?w!Cl!3ypKS-Y2Za*
zZ%uVCUtIF8ytoVVdF)Sy_wxhBL3uEp_VpNpaJGn2anx{|WQC7cpvQUAT-g_3(K0r;
z>PbgUhhMts<X2rI*Zfk--%FexRZqBLuQY{olq(6hfosVIK965!VXo=%N*PbG<$LjW
zn(}px^nelXvq1|gp!IJ<Z?i{i)sr;!dcxXyVE>ZR4nyPrQu^PLlKYvs@;XXuQu}uY
z5I3x&9p+bB>M%SPbyJ3Y4QD#aq0S}KR1g*c<IVwP?L+&MZkKl_YfSqOLQrDyN&8M#
z(_Pk763hIGrQv8;Tu@r_j9Ms_EwI1v26Z3Lh@o#REITl`u<A)nLB{pDof$WP>@!a>
zag|d?Gl`h@o`o=FfjOR<-V617#s*D&j875NMWp1OZ$2Rr%bR#N7Np35;G=_gBE*<X
z5lPMFd?iwQeMHQ&Xi4xybE}A+k{cJ3Lk!;ug6X6qI;BV_nxrP$S|k_}Vwyy_+l{i`
z<>&j>+>x8_zzM*?#ab=qgs9N9(W04FatV2c@gYCq^rBTXd4SljIXECp7#<}y4mYma
zd6D@S!F`MMzO;+<_R+N!wJQ^^4Od(q^WCe_jADLQPZS(iN*}TP0&97w&p!2{1=xlq
z&ts1X5ff_=ZmPo56a3LVzTD3opKE$byfrF6+@Ph3565&T#N>W7jOj>PeQ-Rd-aO<1
z*LX!=gN2dTyVs}Zy`T9p$NPV9UT&>dzpQb|9@MwX`$L&-p`o+F$_(&r3`Zf$L!#v|
z2{Q#IH7^IA!2n{AX7-+zp3ehHJXP~7W2?~Pt1EBmpFWaZ;q3GOq338ns@z>o5sC;!
zD00;I9JK4tQ*4LU(_x6j45)%EPV2$MGevH5N9vsn`wYqlWyw4~7m=PLS=k~~V;qOj
zQ!SpfvTZe;sxo-Jx9!ygAQn9BH~)q3WnrqdUyn1W#QHxrb4eD0B}Oi<Fmm3ekQute
zFDF0ofrZ;OaKF!_ahwfm6-$JQflabm-`*5?Zh?wmg&M$d@OBqwDY@fdIq9b_9Mj+4
z?vndYSEU3SG*SCN&dbz<d&sonH68mB1IZgX`{^+d!&n|+=d`^?u)cPQGSKA4N@<iM
z&>}gMkEPqh{LH9V^8oXkcC|udd{EyZ2Vafh)$85imYo59D@?O!%_B-411pE}*DjM)
zy7p_Ggv=gW1I#feh`m6QR7NfXbm~^ci^;Ka{55mD?A=R;N&A&D-J>h6KL(Bv>L&e|
z?u+}+R`5A8wI)UR@t#^OZ?;J)ej=*F*Ej`FX?ufQymkCWc3o7)?#8quOQwpIyF_#@
zH1^6Z5AO2TsBRkFFM#uA{LM7)%9Ntw^7FKZSZiXvCS@m%MGZW9U`@=A@BeHv;`hXS
z+)XbMlwL8<I(1uMG0=NWvTjn1;Z!a$m^gcMP5on!1@QcJf$*=j!77`UY{S!R^>s(6
zSIVC{LAz}?HXCC3$pO;r5&-UGB)R_Ng8;K?8@JnRCEa&h`7Z*B05$`zsUfxp{0EEq
zYtqjKbCWcymQ5w;o86?qHtgo#rnPx&*lsqz!zeo%kP|>l`P#BhJElc}ZLyvd>W<af
zq;5AfTd815E1{D5m%aHl^XCKiR^~J<)yy9n7lr&cM{j-Ph88!o-F`INN<O}aYVzhm
z?bqpU?6~bid@J-CJ=M@HBb_?x_KC2STEs{-^{4spPweko{^uU^olQ|PMaun5RI`8V
WI6I-~c5j4cpuAlv>d|E0`1N0%c1P9#

diff --git a/datasets/common_voice/dummy/rw/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/rw/6.1.0/dummy_data.zip
deleted file mode 100644
index 4c837eecc194e53ea7ed4f7d0175e297dfbdf83a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4150
zcmeH}c|2768^;gEGRB}r)(J6L!q{3yi>53ovTxZMW0ZA<F=?^H^vlJSeTs>$CE3fq
zbR~r*m8F_O3RyxfWvSb>T)(4wh2%E(uirnvGp}>z{Pn!g^L(D?`F>0dIUu~i;x(A>
zWB&Q&>jMTL0m2Dy@6$&K&J<@=D@!QAaYfb)w|Jj81q0Y1V;leg83!MxYX(8B4C3Oi
z;zA<(obXfGj#gLEP}5LTQP)sWS7$MLMkap6C(5%JoqX~OKp;(ft<M(%v*rPwW#G+a
zV`$3!Yycp~4ghMb11Mx?cOshN=l{7-@mI8QGZ(eLFI01^A@RH#pB`g;O9^$$Ug;xn
zGIFmIFVqAZIs8^4kN@`&<AMOx(I!S23e8B8sd<0uZg&Vwo!S1WTCzsRp><Ni7oSw6
z+YLm#kW*HAeYbGiU>NDKU4L%nfaC+naYYY&CSGKZW{G}w;>L_AQEFlX+eEtMu&bzL
zPeO(ITO!u;wAxlTkEaqIdQ!K_LNhf|p9(`=TT`V_hPfSG`uwRjSkIQxAfk+9r62Q!
zduC#At11VkeZ84waq21Vn?>*6rR4L7kakZs%8Zr!p21_=(t0tM9Xtf?S1NnS&)Gj6
zXC$KIhV#T4Um7LYn%Bka&fFc*oe*@-o7D_TQI5>1z6kTHgJ;{@{GBxmUu>JDy5BZ%
z>@fiAjsxqCX1(qNSO1?^{V;JrT2Q}zKqw_`P)h9$q*SW8pfDq+LQKNVS=0JB&GXTR
zHz>seorCP_%*{66+8-!Ti5*FziU?qYi>I*AgG|Qs4yiKOZ>`gvca8AZ26B0@1amJj
zXW4GLY>X;h-<xJO-;0dCAopr|ocQ~lxTity#QW3L4Nkh4b_yqo`RMkX8I6r9O>67#
zpj!(ZQq*ai;aBm{-xZvj+3l8<Ebd$*(Njf4+LurstGLUnF_Lb)<GPxgV$p89>wR_O
z0o7mzJ(~VT*(81hIC%`pk9wpvS^m6KUm-mt%BtnWQ5@WX5N^T`pG+P+!=A}W%s>=k
z!k?h~T(OB3f$g94gBljZd_wyMV;lmFCv8-ko_QxI*<%#UL#H9hf8U;WQJjr;JcjUd
z3@v56=e?9aaE|gC^ID+ECn9#R30BU`ax$i?8h5iT>2>f5-n?B~P0<U~YaOW9R@U|M
zclL4z7m6$4n`(=5@`G9Nj@=c{#PWspcUw%eRaDH(8#M7q%hThT9wz<80S{dIso|bJ
zdyMzD3KJA&HqV=nAYO^zn9bVY`^)?mN!iylTjZY)D{H585`(Xw-E5>R=L93Mhlg3b
z+<E5$gIN|etyfyoIjn1U958C)Oz1d{d?M%v&BaZdZooNPVK`&7?Oc`e_(UnDR+@@g
zcP=I&%r?FAENszjm9v$xu{cANQ($91kM!mjlC|Ka&Xi0KRbTj2k}8r}V0ACMa*)_4
zQZ?!SNG&uax|$^2#q{sMjgV8%F3df9BX>zBcxP=eX%h06*hQBBYi(o)v3^c2$`s$=
zA1T6-L)Bx{p>?)}J?s6*Pi~<FQ7xffE!Tt-iezwnwW!Of#gE}|n&_?edy0Fx)H7}b
z7oEpZ{O_mh)2Qrv+6vq+stIJ@2bL!2Id1Fh2{@w1c`rUef;4Y+J)r(0hh%$nP?(KY
zy&c`*H9@Q6!01N9h58weC1n>k;BsYESyz_g>PPt|>KY61|Dyh{L?sJj#>n8f?*4bj
z8j0fOO8%LuCt?kwk(>Ci?T-{O*sbhF6I%PMJCAi~;M&v1GwePjTeprdHo7MB2v!uS
zbmvzZ_g76{8$T78<G5a&#yovH@H+QcUSZ6{j)r8WQRd@oMb^2NmF0aGu6jD5g6fkx
z<Ppi9`6&)*G9vbdjdbREOwNCdng<@7esHIv$>pJX2`pKxrc(WRIPP%55w;{3=>#&s
zn@*#~C6M?n-iS-b*6iAq^;UXlf8O<nCAGC(N{$tbMloR2BJk*V+V!&Z?%BFgE%BC|
z8$&}{{n$N4y>5<TM;JN=H`~yv8s&SK_6lAZ%`FGmA`qKHgvhDaBIt}KNs1OwZJ$uo
zd9k>3x*MK1bTsh-N6T=Nt1JpvMZMbFfRB`{J{X`FYbfLxR>)}cB5z;MdHIjM#RUJE
zb06|!YNaGbemlj(uxU3*nN8SQJ4}%({lg0Pp%gYCu;<DU)eaJPzjMZ_FLSi#mgK11
z7Jj7Wh|^4MB+bvhXJKeiV&%YwXLY?rg4!ZMZMU;Nu<pdw%Xp2sZH{a9u2<tb-}+XF
zE*<MNZ!1d)Nr6Wm^V(EQQ1FKjF}K}xl2=4bx2-!AQWpR~YhoJgGipA%3&IK#s!G$N
zP!(AV3QDgUO+U^lIiu9gC3ASO0|$MTcy{y^`W~C*x@qNfWE!pClQewT@VO}umm{+c
z4&OG@M-k!5-tFNQC|XeQ0;9%bP_W;#E?@r~3~l=medd>Y?+=~Zmub$HF}kzpMRj`l
zb2l#?>5Utp%2N(kd=chH|8zQ8F<M44nR*iG_wv`>IBHcr5Bb;2QTAE8_Xl6)UGH&I
zI5`?67H39tgV?-kyv?bpS_IwFduJkBX4Boci-r;0_vhyS`Y?%b*i?V8HpX2h;y^W`
z{}Ny8M9t*JPvx{(wzIism$~1?<+<EKZ)07HuS&R+R(i-qIaZl%Hg-vlwLH7%SH2)H
zwr}xug@fn(;^6;9KoqbPaDJcO%7Fji=6_CF?d6w9WBW2$Bz<=FAZ*du|1hm3f4S1v
ze}REB16&P-fPc&Cv}Rf$Y?;dkscWjUL|y4>ms115VP%o}&Ex)@x!Rd6XDUNjW`4IC
zK=KbfnyKMpFD_-f(m5<Ab8)duUh)w?Pj_+RR-SCjp<Uc8LzkU%tWj6)+~w3tD9hAU
s8~C5tUq*iQj`_j{I~i<c9T>~(?>Ej`XiiW$0N@3GM6i2GKmY*z7ec?#umAu6

diff --git a/datasets/common_voice/dummy/sah/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/sah/6.1.0/dummy_data.zip
deleted file mode 100644
index 24059a67ada958d92e28ad720d825f8a6880064e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4569
zcmeH~dpwi<8^>?UA|-R2EJSEz4kPANi4Ha-v{2654>`n~Dj{=Dk26upVM9(G7$v1P
z5yhjF$6-7~p+afq5b>Mp_2B98#J|7SZ+q>&@BMSVzu)_FeXr~Km|)hia{&vVw?&?&
z-yVKkHUe7#oUgn48C#sakG<ki^9{hdYuk_5E&O~>Zv@!bN7exV`zY&TvSJXa<v|?%
z5RRt^p1$4)H3ek^QVFSqP(~t@l~)-(hvcwqiSSiM_qKQbPT<Mfz0&#xSli|VT5}vs
z>FGfO;cNgv1p$Ea8VEiFdlwG{A8)^JuPX7{Zo4n3yZ`>Gni8};`jsFNt^)#^dv9eK
zhR3}W9*-{tTcq;t>`<#nIGIfdDXE}K9>r}7=R3YRo`2KZjrb^30pUEDjbO7UlEFI=
zZu_dAc00Gk$!&kE?Rh8lXJhV1X9T@ln`=8LX3cp)uV;6a0p<+@g6)+%S}9VKG;fY{
zkYaLbXYBp51DqT(<%v$9uzRMU<QF#1u2&DYh7r}EU1o<04Q1=OkEN?;;r((O28sE^
z!BosGtks~~f5!E-E)^V6kcVGE*JmXE8MPs_P`+rY3p9Hg@6>N@hWVWtN01esqYggr
zm7t3H-F*`19MOYgW*g8%=$_C&UPC7~h~28ahJ|71X%gGr#tw;eCOr=`a#k=9?Dh=a
zB5iFxdY$T&R8~1tfB&l3CnarD5|nGA{e|@8*yYoGuJUK;WxN<mot-y65YJ!YRUm1!
z&&T|6F6%2M$t~(|=BBv(NKfb_ipj6M$+gt>A!p0U=m1!fsNdsQoRgO}-2dXd*H+14
zr*mBN*Ww|sdx&Q^7V>pb?;&>Kaz~cl;Vius*Pu5J@Apf!O)wrbC5UxZem$EB{>d~d
zUw6(Bh?ctk-0#`JVtADeQB+WVA1QU`>E3;!dg>2!bxO4v15NcGJH|NQTL!;r*Kdw+
zLxW!lwKK@`Esb_hPcW?J^UeZpKzZnnyb@^{m^wBtByYW73QszrLUsLJwp^}^L&x`e
zgd@4G9&O=NAje;RBB^4TcTw5ERD{E-Q#A$6XnAb;&QPvAX`8!EUW>Fe6b(GyeDU^v
zEMe3kcmEa)?@kGZIo|HHeWhJRL7~h!e_ZyS)|zS)IjE(icu_QNZs>T|mzy?k=$M1k
zi4HN0D3JirBx>?t*OXp;RR11Xs}o%os?Z}J^(Tj$?@D;;#m{H`9{uHU<%mR>Ocl(S
z5!ToFQTE=gl-i3bn&avd!6WEVzX$y=j{x01Ia;mn!O}}@-M*0@;G84^Z&*^7T8>kk
z>k0M()DxpOJXG-uy~B4tMuik%B8#dX1YewDTh!9vnuDpv6gGy&(lV8$<*qeo>1Xfe
z!YXrk++T_=oZWvozwYr`9(3hRbxv;ucI#$yFg1UZI5MlXzK`SN9ZN^3E=SuCx2DfC
zb6Ard*&qO1P-!Bo<wy-lo(cpWivMZoQZ05k#v+edZZQ@*aHy|_Fza<YF9jY$pQfkD
zNXe)wj@wA-V=r6*=fND>aCOZ)NoG>#A?vI}$juFojp61S8jc@qu}pjsk+_$~j=~%w
zlBt@fkb?47XmC`WV{~n$VTd*Ww?%J;AS_%;(Fcf=omy|T!`>uLbVLr_e^l#MF*InL
zP*>!mgQaS4N>pD#{B}K#`D%iRH;-dJAAO%4*Aw7sdysiMaC)YnnLXz=Q&Ey%T{V>X
z-sP2=b)9(Rs9V)Ux~qEO;hQ`ou}1Pi4(}4UuT;1PWQX&8I39t-dzNiB_@vV(=WI=E
zL$RkD;kJf~l#+#|_yl59ajvAJ7LdS(2K|iUW7((@fNzT>9JM7WlC6w_h8bv(*tV3h
ziON2T=Pkz;W>uJ6WrpU0jIBsd=ON+S9*jZtdxY!qzf^U%>BUj{JIBZKzchAK=+LEy
zZ#beHwLjn232wD*9Go_M|CUzZ|3~K7S9?`0o3gBKmA)E$tM#y>gK3mHdj4CpT0Hzq
z6Gmwb3FCwJ_W3JUR;cy=&z1kJT;azg6rW=im(n#32B&<S@q}L`OVg1D#@m%4H=4|*
z*m`mOwR;ZpYfQM6nySGPbetu4b5OaJiLr@t(fSFPcI_I}eMmp|2(w-!VBkvg<@8fD
zGSNlry5yB<zs3Q9N2<V#4y<bzqS4ZAG{5JDI2(+rN<%OtAW24-dw1-8cwkpJbejX*
z5Wp8#%!%aqqicJ~Z8;{o=`a$}ODLJCKvdNO1s1SF$VT>t@C+sJT{~YdM_D2CA!)Rt
z&nKH1;lS6!&+9DGKA2A1-G7yR(JItTPw4VJ4?@+^^VF_5jusIovV2ghO6@1<ui1?H
zmy(FFv}-LO_4Y9en#ZN~CBN8M=&-oQ%hExgt&^Fj-o0jUS7Bm<y&?m;KFxLXE0o--
zQ3&24<=yOY*0}j@)tl)&pK2bQ*3$=rcN9;D(^V#{`zU!Aic}KJ_&H80G2#r7)J`v7
zI+hZu`Ju^Kbz*yK#XbZ3=93RwP~w(3HfcdfLQh!`-3Go-17U~ptB-la9Rdro%G{jK
z41O{PTI?YU^B=zpe2WuH<^-0^>T4(oE*?L3krgUKQ#{5Lqzs8@8xzm~@Ob3SMwjc_
zXV1&o!F9Vo!;cAUgnD#U38A5!#AAn03Bh}ze?Ce_ANKz|nfb1HB0%*jB_Q^<aq_b|
zxU+L~cWkGBQ&?T&uF0BAbN?t`=b@np7uRgu*Shz+vx`*ro{>IbO)f8ukaI%1>r19I
zJtZTJ%}lMfzA7xvpqw$zgI`V1Y$~F_DGeEjjO$2=2Cm!`zH~lgX}NGSODx({h97wk
zyYF}*xcaVK3Ohd%`<|vm-?ZLnw)<M{goQsn`sp*NrpHXXWaRu->|D}}k;N3L=ZUb`
z>@kb5Gco-`I>of)8EcEa46k5q9cv+;vX`!D5rP*+-hrjr8Mnq=V`bWvs8ThdJ3Hoc
z<4?e~9U3juW=eWy5TDaTQ8Xcy5{G+ZAC#D`!Opd_Q4QObhCw3jo|Iw|JNs_QP*KQ%
zrnWNyLK|o^^1Tnqti%b_<52b7qxg8wc=fm0;Jdk5&~>S~QCowqKU8%&wS2f=koG4Z
z#0C;ux00O_{;>cEFrvHgxXfIEzniNy1e5|61Af)rzdYc-7_DEEezsYQq!C@xDoNi>
z7mK!Fy8dxri(PoR<@!!zoj`yb0%qNoyr>oTwLn{9z*yK7^;yI&pZu3%nXI$FTI^pw
z;MdTfN7SWID*Ni8e=9XC^gj%%CYXhMT-@*SVP`4&^ZM1J7e}98Z}?k1UDh_2QpGl`
zp1P!ot`fVfj4#E4H?AK0Q&Imn_)4|rJDioztc32~w0ij8E6+-7FiSlE;9`CKSSe)0
Hz3}h9dt|%<

diff --git a/datasets/common_voice/dummy/sl/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/sl/6.1.0/dummy_data.zip
deleted file mode 100644
index f1a939b8c4185f5250411a96f422b77d014f1c33..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3947
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_I+3Oq=Rhpr
zUtovJ`657Hb1+<8loYgtV-?d^An!dR1A_ta7L*hvX6EUY6qkVmYSKKrgkRI_&;#o1
zd0)RH20W|}wpzOKH3ZbDd2V*{xU;w@C;h|)v(Vc!CtBD_9+_rqZGUsxJ@??LPbIVu
zcO7ScV|&7h>3Zp&t<Gf+%Hr$XZMf&8Cf;P|{H5yCTQKQg>AdQiuAJtK_Q*wIXAX+^
zZIf7Ds@1o`MdbLo$a*8i)<-P83m36Z-|-~q#<G1vOByZQ-ZvR*CMPuOyea;YVY=8~
zLW56aE2mXtIoEQDJ*S@-2h4u{ZAG-_v05qLFNqth_MTbYynW>=-u>$~%Y0g6_i<6*
zvTN67Uk~4~;NgyPm$w&wHa68Z{ns-3<^H9w`^_t*SKByGFZ$8;s>pWRuTRsTOM_Ae
zB6uC<mi2xH2JbIm@ahpCyeX+=1Os=XZ{86F9=GdXL@XUln;e)d-T4K6_Dj04ENWSH
zU4L=e-BR;wzf+<=P1aP`%=KMUmcd{8O`+AL_TB5y*>k#HhyQfl(Y?z%(NaX<tLId0
z$7%aY?^o4C?-BpfUEc6!hM3diM;x;b7(H^4_<BZB;kH4uSzgnLwk8*s^=m&CZZTAl
znDOSxpB(W^Jt@74W;w?n9Lgwt|I>1h#tx0*88;Uk^iZyFExrBf^&OW@^D0g6>33iI
zwe4+KZOkPVe*Yb>3`HzjE*FHYnfv{VhQH?5Wo}DDeKszAC!Jb-bEWa>`qgQ5SF--!
z{4nKVPQjJ$3q#xDzWr&xXpRV4&v{8f0ulb@M!;Y_2E-=B2W?qmP9`vmr>4LI7?g6c
zq~6s2v$@iaJgw&!YsEAg{+T^LnCpY8EQhg7P-MnZ(Jfsut-Fg7<=8D*PFPPoqWse~
zR(!Wz-)WBR-+qXGyLls2_s>4r`L_>$GuZb1_5DqAb>_UWdLuXMRY_av<!!g_SjDDQ
zJLGMgRi)cv`uxe#r6(G;Do=C@dE>ymgro5E)<quD6B14ZE-WbTnbEN^%zu{mX_IAJ
z!`V0Pe0ynS(6u!tcZ9Bmxc6RM;K{gb!R@D;Y9H%m9j~=jDmWFqe!6t){POBs&ySzp
zx985Z&)f3O|J}2PjmOaEZMKEM@mm1}C7MfDyB*i@dy!`M()-<^+B9v+XLn?6kMd0|
zi8!<7Qt5{UM#fb#y}K3~G^a@3Rpie#j68Ww=zPk=2^L)LEGdD{PyPSsJahlEz4m|d
zZ(D8L|M9!~?*Bi;|8OJX2UM~k$B#bo@l%poTmp|890kj0)KC^Ry9@tW2LZFq4<II?
z(9bW)NG&21H@W9e=RPsuVSBJSsFj<8X|HDKMDKIVUHzTHx21aKF6k1P7+3Z?>a4=?
zy%SkH|H$iZ&N-f$R($aB)i;YL_8zH;@tgm5!rg-Zx9YrYx-Cy1D>C&kF*|#SQ(Q9q
zdFdAZ6OLPcamlQV67he*b#=;`T@p5eKd+b+IDT8%am+!{;%m>w1*h%YG*(4*thpND
zq_cLKn-j}YgNezCTMex<UTJPOd}3m!8og?BPus6u-=go#e6mGfon29U#k<VCET3!M
z=1wfFn!NJt>5{4Y?Kjy<rmwNw=k}*kAoQB_3i<WSJN2I}WDw9ai#Ik2m)^ccW+!7}
z^AyG4AdbeRRo`S(S123_tKHM88)joAG=byfEr;IZ)~K7F>$v<DZ}*h-$~m~o?F!2)
zCI@NdjT2t~_Ylx&v3~iq=eL=>k*<9GzURkZ*3L7FQ9qV=$Np(V)$9FE6LZbW@7(>$
zkD9GEi?_N41LKW@iGjh4_-vJ#hgtC9$X%rq&i39h02X|=v|=1KD11F%&E>()+_fmp
zcH`57H$H|<To)AaMB(_8W9}a&zF(}f`P*Z&@Xt>kAAaMUA#Q3vZ+o$IiFxVoH}&58
zR`1MS|I0|nNbT$<PJOGG@1-ZUpE_sSzE7xPc}}!q!xb$Htw1KvHDM>%$~0ZhZoAeH
zcr~-+Tyl=%B-JefnvB;3j8ena#jWaT&e`nyNO8x62+h`>hRQWLdae$9IVsWC-*#Dd
z-~0Bo?WEvOmiMnO&&;nnvv=N~8((L9smXnA^8KX0cxT<rdFylMUoU4qHtTWmmuHQ8
z;wsYO&qxa>GMfhVOqv;$$s7IgNuyJmqpRN1Co@899kp&*{19Q+Gm%eul7ERwqmk|P
zru-QzO>z$(S}5ij;gV{@%@yMM!7Mye=&tF~5Bz5Rk2yc`-wyw~@5Pr3EBQNGm)F*P
zSp53S#t-Y~|INDpQ|8$JM}OHsQ3z>wf=WhclM_st5+8-o2+Yh!ZFqrV5u=65$Rx)M
zu@JMND*?4%fZ?qph>6ndWrZ|*Nia}_0b$@@M^{t>@ic)6+l1KwM%eV$aWRoLft$lX
z%RtRxtlmPj)=(S65X*pBjbTY63y?wzC!F2_T87j#2HJ`<b`Z9rwh@tSO$O$1B5j4Y
z7zx{n*?2;>b04sWL!_N(B?{1HEX}9@Z%|r9_!hNUfNXOdE73M18VQ8G3(9e*6&|vs
zKiP=36sh_n&Q{c-8rjyZ97NlSQE=n48&??vu^X6@fl0XnsED)(hL<?FtY!rUA_D^#
N5azKkFgypl0svRlk;MQ2

diff --git a/datasets/common_voice/dummy/sv-SE/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/sv-SE/6.1.0/dummy_data.zip
deleted file mode 100644
index 9dd77cf834a9bf1ef8eeb6ce615cbdc08e552173..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3993
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7LleGm+1z(
zLhRsQV28^GB0!IGFkD@f6!goku=693_neV|!I&gFN{SLQ^YluJ%fP`kX`WreuW5GZ
z!F9IZ+wZ6WkL&R?(VGnZnRUKyJ_)QIKHA3%e{0=ZmUVfPV%ec%9HPo$4eoZ~*C+A&
zD(f$m7G4waZbSHkR*Al?*2NEHgZ@R-h092=#R=s%xNPMNR-3aUj<@(I`;;eZ`FpvJ
z_<5$^Ua*uiXyPl*srfpA`9eh*OFi{Gi`p(~q%R5e6;>9sbSi6HIZ1VXm2klJgTLam
zxqMlB<6cVyda1oldT}yBa;@w7hEF!n=a;)b%Cnb`kGE2-{je`H|MRMcypw<Yy5sUl
z$?f3rY4aT#TdcyR|7~{KAu^AD<Nn1KI~N=X(wpkN@>!)=`5l+@m*4hZGY6#)MCk5)
zeRd@eFl2?97#Ivm3f+{{GJ;_{VQ=AK1D@9G6fV9AKRiDA%N+2%#j}9fGvL}uq0&Wf
zbe^nvz`7+OL%sA>NyYmH_8N}ojnCFfE9vZ*yCB4|SYe0GUDNy!g=gn~Za*?_!E(z>
zToO)iTZ;KAmRVi(58bywblq2`rJp^E{Vr-rCtZzkS-I4G-UUnP7C(`S$I-1prh7R1
zx^E@<TrrdK?=^g}BrryJ72Cz?cTtzsyd|<WEDTdoWsy8+=(*bY>;^Yhj>=h|9^X+6
z2w#^ke|+7;TN1iY5~EaP(x-j<9i?fw>2%b$+dKB!>5640tNOp$a$id^`kTI4SpAve
zg<JnUVc;$B__-<1`R~y$589q{{WyQ;*U7(qw%W@74!r*^_<zw)Hbk&{&Pxhn&!6O*
z1Ppc|V6dB$6zpY*IhnvRAT<RZ_@G3MC1K}YI9;fs$iw!)xUY%j+EH)bJLSp+48cx9
zZ@2A!xiZtnFT-q-XU<IB`7w5ip$GMAFYNMoy}b3R<nPs1ZpytbwR8Nh?>)V+;(t!O
z>bXT}nL6UFE{~T*l^+T{wz^XK?d0_isk0`rCvePEaewmW&J=H5yQKnZlP4F?lQa}z
zKgfBE!OYX}kwPjDW7G7khz$phL>eqV!MewAW^hNC$E2e#CcIVlFzSxbVx1J?`g_~k
z{!;Ntmka-X{q+1vO;Pfa%6sSH{!Uw`|NVCL;qJP_k?+6M-TD72`TiD@y@}D&BhP=o
zy;L{1F3eBzs(pKo<gdN^`F8x8ziQr+4o9I?J!Uhb4kU^csCIW=4QX6CD`9c_p?g7I
zJ{jsO&P`6`N>+{b-NGnf5N>aGGJVCps$d2l#nk8;T~l^P?#?U&<wj@EWiH23>X%4~
z*Yy1CUUmL-p+bML!1kptxBq^->rukDKK?nOu}f;F|7Au*DX5%6j#49%qO>HnxC9=X
zIEty!*rac4uAOG<2PzjifEkpuS|`6GBejT7be`=$oeL}(+s?ldky7B*IXP)5Bh!~_
z&g=#vlQ#9<C`!<L=X6#ja!t;KV08^nCj0ph*gw1M{3hI$y=Jk-oB%eV?b-X+PwhJJ
z)%gD@iwP1M%DGBRAC}Ar?=b#&Z9~DYMO&P67B8N1VbvAEIl__~1PluA3eDvV@zJcj
zzjNE$COhpSIT^{voW0rB%Xu=owb~x&{A6qD7UNx{ICqx7K?!fR$z})5??vSXl>0u4
z-Y2+0@AR+F?^v!(|NHgvmF&CQPDPZ)taIDr8Yx#EX@2W@<_h+m{+j|OPM>kQ^~sSE
z_lp}g+g?9fpLWTAS-Hea#p0Zz&Pb(DOS^sBnwOl?I`{kWzLT2i?|NmXmrmUNOZ>Am
zYBnl$`fybV81M$bfVU(m8)fEUmW?=a)7go;K^0S5b{e>1@~R2VoWxl)J&b+g1fS?d
zH6m+$o``O05a(XwJ=aRKLB9Vmdy8h`y4bEaOkYogaT&0uT=(9tD;h2~ef^);hdOeO
z$ZRy=Xw<s7g=NNsUB_k~Qoqcm*lKjoRAbMlqzk>%7^^i8dOyFIU~tWY*^l+^ZMhbM
zixuyq?<#dMDo5P@zBhM^a8Ptk$ri!p?O75RJY9uFJDHd)<X84?QP}v=!lXGlapv=<
z8}qnsti65uc(wXm(QDh^ChjwN!>euY^iX<A$j@DO^I0o=U5>Q>F5h|W`Q1-dyZX47
z>%?a+tx1@q8P>n@ic9$n`7=54=l^MM4)J<5_owiwb0uwO{0%<2$UjroJn;LsE>rx&
z9##7{KFd9lYyUETX9q<Jq$vt2+mM^079>RqG)gk_Q5#~Qc)@75GBU|A<7)m&K%F4K
z@YWH;L}?7OLK?#)7^uR4Fz~OVE2@Eb8pece!fX~JY<lasm`IyojbxyGphhxQpCKA?
zs7+*ueZVZlu%wY4NFfCkPM<;SLuxPsEyfu~2#Zl$kH{9M0<$mi7Q@?;gssJFJt13r
z2iO53-deQs0%$pwc2s~jC|x4_j9SklTfUx^6w48{K4DLT@*rxBglumuJ1O=e)lbA(
zj9MZiTl|fK6pJy+XMEP<DtjQ-1JgDzad!h1ksR6Zk_eaWtZX3Dxqz^Zg@NHO*dYLN
CIBNy~

diff --git a/datasets/common_voice/dummy/ta/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/ta/6.1.0/dummy_data.zip
deleted file mode 100644
index ffa8dd08b0af6c275765bcdc931c7b8bdecea182..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4385
zcmeH~c|4SB8^<5Z$XXLBl{H%<Bm2G$+2e?lt(3jSzQ>qEmKGGz4A}+~%933vAtIqH
z8Cy{(j*R^zvb@h!Z%9sa{(L{5cjhzCJb&Ha`@VkH^}DajP>+h59w2?*=eQevefj=i
z0N4RL9~T$@({|S0)`yS(Ne57+@E)@w{e1iw019d%6#!6AfDgkh7vbBuh^;T&*3Hx1
z#|thkA_f;16&HnziNnRj$Xq==!)C)41<73fzuVt|Q{HO(tl+6>fzEUrBaskoj$IT0
zu#XY|M9Bwuds-u1MZCRyzXs|?in7%rM)~JJwX`Z2)rvxaR+Jn>&}yz&IJv$UV{+w;
zjY6!fVTUS&LrUW;E?NhnmtUlX2$pPi2P&3djQ16cosGXG@?>8Yzq6j2I(K(o;zy5k
zc-^#v?4|qMX01>p6`NW@XeXPyhP_i8c1}&!^kaPJ^*qnQr}7$6I{Sj#%EX!JZVGX+
z3t&S_^JfTmN7LQ%a?kgeWVbz(Hyg5u5#mjjG$;$z+4j!TuphOz7pFfL9_h<>FiqY5
zQN6qx+m}Ga<UWX_fHNLjnj2a%*0oPo#Ut<?dU&aRmSd!re`H6>q}-%`;M=9<S@_7w
z!FIplrA14nI66aaD+}MmJBKa+@z52xNot|SV&lScXj8)gE?qfxUZuYZeVUX#YvI3L
zF>R&_hTj<szX<u^x3l;CZSbR&4C+LoVNPCGD}iS!9_*&<{GRng{OtnfM2ECYz(h@3
zvd@{~sSyxH#SY{TX1w)VxHF2>lo*<iTH<VWbpBFQX6L%dQWQy57Gx)N_>YI2*gD@c
zL{q0s?aXeAlTa$?O*=wxf;B!<pM6IX{+fdNZK|h-4$-@Mg(cv7gAh+~l&zF|4Vxq^
zJzzc}aUry$jPb4ECXiy0J((Ae;tGpIvoq8zYV`Ay(hbrGWzi3yLfc?>td8;J91Y@Y
zmC~09ET<6;Fc!BgNYV{_)eNCq5&z7?C{v@7xd;uPeVZC`VaE!;tH578S0~zMrTw_V
zbM9Aazd`XWY2A(S>t*RHL2*n*w)zp{-PNvqG>Le8tx(GL2wrA*-~(^qY!CmQ`9|Ut
zOOt?H|9OSi4O5tUv#!|M><9YQ0*qZ3dRD2|bka$&F;aDB@ks)8!hkwSlCP7mwKEd*
z6MMTKYLUw17o}n!qwmTp1`X?a0h2`x3t?9cBs{T19lef@3_MR?0Yxaaf^_bg7`t~g
zlqWoQaY3>YDRxTVM&+Ag2F~>VjZ|@2Jh<Ju0&1#{R}!3_De#vv_)E8L8nyb6sgJJ=
z$<=Y2wz&o7%X`Vv@kC!*qeO!0!|gD996xV?N4TndnK&d>-4hop(Hqj(Hf%+uu|1T>
zVhDD_dKYESWwx4FvFsa=J@T<qS{PjizA?Obd<j8F!3w)`Z>e`T4?X84Yc$IBB~l`c
zi84)M`^!rq%%$v33+7EHN4iSm<S+OgYdR0VUaBZr`0Cut`c$U{qmlAbbk1y|#RQ?_
zUc~3eY;M?2TM0T=?#zf4&$0cv@8}}qFUnGR4#NtIOp3E!ydS6Vs_TB@;DX;q3%`=~
z8t}`Dn|*;)&#f@25WW-9!csb~d5pKgGKiX633gRT)3M4rqom%SLlstZgq>YyQmJnB
z&cTScQ%BwhHhPo?$%^=|esKHCHHBy4zKtv~c?R3yQ#RHYHdkAPADPeUTUB*h&aw!U
zM2#?|vf5~TVpz)rl3}jTgu~=#g15bw_m3pl!Y}`y1pg~Z;O|s7a}Bfu%YS!kx_O_q
z_xxQJxI|Zxtf1a$1ZL3ZHPc<ZGAk(<p!hh~J0L)J_gNi^G&;*MNf>T=TC2o=w+ekf
zgf-1dFT-uAhp?n9&xlSsr_P)udiv@|!c>geBV}U$QHj$Ll9fs;G1f0_UN+R39)Ol6
z4vHR=bq^@zIuWq57=cv_y*z4<*$veZGzzXHj6dU_L?xK@q{wN!=+zyPDrcf$+@E^m
z&$pI@Q|X>|$Dia1k2XK&(_^jCa;gq#6br+%->`Sl(aL*-L$YVtzYg}3xy0djuizX?
zA*Cey{o@ojNB``Rhoyn9Ul}<r7&Q%{=FFOm;R7ERd}r^PPm-*n%YhDR)Qqj|R4ieu
zhnP1bP8L|~{>03q$6@B+T+RHf_mnkmG;i{YG>VbHV|6vC#@{C6a$GE8kuxYR%0MR)
z%3XxWv(Usu!xu+645ttc-QG5d$#hJ&LOy7(j!5k@?a#;j!SLqdCl`$`ZF9aIJ`CLp
z>hM(4d!6U1<o3@I@46ClCsqUqYN)Y_mWb8emBbZ&!C5cms;|x7+I9V2H@P<w)ZG!(
zU7GyjN4jpl2)6J5muMwVRs>_%&?gw0Qg?YkzQv^6qB7-faxy%ix=@GPLrq()nkF2#
z?PApVo67V`Ax#k01bkLj`<GGCDUW_8^Q1vE?*yt=%VzphNq|fXT5!hu0va}y@-*do
zf|~~47)g3X_jAOC{!=hsV*?6=Q^WIawqTa{L5{H`OPFEOaUmWn!(KUod-Mg=lZaci
zz^J{D0Wy^`sAe!BX?nXbzY7(XS)wnxq!Gh_)9pVo8KZq-Z!t;wzu$n;HzXgKYKJU1
zC?ea&9>90x1-djfm>kXvbZQ#%$>FkC?in+7BSbA$c8-`5+cEm5IP+J_Qf1!h<)nT|
z2h!@$13MWxI_@9X>BVX~9xyj9F%mfrZTk$3<r6S3u)C}YiQ_KT)HQ%K8lr{M^yFy@
z1Qsr`GWK<Iz-}Fod(Li4o4iK>`OKFZW2|NyB{*s-fzXN6R9NJ?j%$0chjV#;K5wO)
z^6<R`mAnl2RL3p%7Ie?tY-&eC=i_UE#sU2kuXcr>w0_N4zFPemvexL6jDP*KAVt3R
z-<3zYZETlG@^6|eC@49pwi-{^zVo2~5kY$0Fswkn4J-d*K|Ww@!DWQT#s&XjaQQuH
z^VqURA|f)#Bz+xXKp1I=`Q@?JigM!!^9=^pD1Z;92LIN@Y0G1Quyq3sNZnGMHR{IB
zeLXb<GzT)NKl=FJGdFjk>zRkB$!7j!MIiYX{ivZHDHqpHyRqw8PnMz~o4nQo{r<S5
ziraY0u7}Fgk_}yV=aNU==<(~R;$Ur(DeNZ4|0njhlHcqx-`HR#gQ+aSNH+Ut$Jq)E
T0hI#)dhqKCW=}E*0D%7hnHN{u

diff --git a/datasets/common_voice/dummy/th/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/th/6.1.0/dummy_data.zip
deleted file mode 100644
index 335835de4b15a91afd63fbffd6ae2eb1c7497c2d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4467
zcmeH~dpy(oAIHBk<rXW7Qk2VFI-6WV<tQW75Oc{AxlS7)!%C&O7s6UG=6)F^QKD2%
zN#z!EC$~^xom_G)ayehsBUDa1fBpXXZI6BT-Q%&x`}uu;Uhk`+E<1!9n1A)$IBE3d
z=G%n_*aX;nIy-w=+1q&7$eWsQ0qhA9XzTfp=V=~*4KlzE0FXiOVz{gk$)!f@h;Tbs
z!bwkexC%l6uDD%sJ6u5#uAso8^^{cGC66f0qID14R{(#K#&VA@2zt#4G^W@ZnV}vV
zI<bL2LC3bU4)7q@;9U?N?!+$#MM==Go{G}=@t~^EZ76mHsCT$kx6lDZtnwsHDmLC(
z=)psJ_LL*t(Az9+%6=b&ZXFug-}%;9&o<HsdQLEEySqbgW!#?*7-UB}HeBby<|Mns
zz0N@VrMmj`K)b7|$H!F~y!U!va~GCyzBHu$w1m0!)AjQ`_5Qs6l~U`)Z79KhLXx!`
z$^joPdW;E#Jz6pqWqo~vlxQR@Ij)CW#s;Z`P%knSk4dx%>gDPUszVC57_ZzGFKdkw
zc^qJugdvD?R4Z(KDOjd#+8E%j%Z}fDKvp0{8c#3@6DAN}_qVp}Z=UQb_M|cxNNY0&
zBYynd(LXx8sZqnR1%vG^XPc{R+jJq`>Z?sA-rSh1@l0U0wmd)fq$Q(Bq(o)S@RPGI
zZLBO$t>dSaer!_hzZDQ)?RI)5<LQ7F=0p9452hCG0tcgfi5CRW0qET_rOxl-;Yr*Y
zXMmZb0+UV8@C~>4{QMJ+T>a}}_CQNs@SYvf;}2U6=3{a})GMyV)`vk+sVe{g!MdpS
zSmJLbCBFbA6%1+~wvoJdCB}=k^~y71l9F*h?w&b!<0`{7+SkPW_MwL<Ua_MxFuYse
zD}6QU+@7~{m1#t0tEMPD&pog^H3dy-1Z8-{!&g-o2YLsx8zlDYWakf(adv(by<Mr&
zhyopb+(*I2V!?JGKjjvR3oHA^NI3Ocqg;-h?6$+3aj`x3Gkw_jZ4tbENm~U1YXVom
zw<X2&04uX9(ipPaX})Cz4vKw(++|AZ2MvvV57%86KaTW)r906O=kKXx_m@fuacf_`
zJ>ww9Ips9wsuoAL4W1R5QR0(ksxswk&W@=v#cJeFszq!g1<oqHQ!O8f{Ciz@z~%Fz
zE1KC*0&HZTK~g$Zz3<`JJ!WTZ=kp|mG2M{U5wjBcF9h8-3%GW6E9;B8VV;SLbBzLB
zJKo(R7e9GX_J`k*4F4yhmkjXv;S)aUuO=IFh-${4g<gm)sCO{I*2oy;w80%zPYhH~
znRWGuHYf04?ZVoB*Kj^1f035X<QE|(3eceDm<yM89h9SH;3=6X1FchuC3w&GSnKF<
zUWo@#?RFVIp@6ZrhSX?{H^qTm0S)`*rmbp;5})5VKJ7IOr}jDu=1MD6^WEqhMQ8I?
z%TJJUuy6<X*{F<kXc*ODw{c4a(|kzBs~c`i5Xg%bIEqT%iP(x{*yP$t5a0m}TXH3D
zFbt0j4qb&u)B#0Z_x10;4BD}3f71gC%kcyl!`F69w&9`*y##%3M1?&BWA?nJu&t`E
zK!nmbOu$+e0c?r=d<^D0`b2Xp$&o~#p-xn{l5me}XWe|o@<{dPb7t<{URjB;+ai?w
z9I_e$Wh3m}G-+3LC5_@8+SsJG{(0n<{Gf}+U+-^Sb9WNL&gZ?~=X{FKGanc2^dh%u
zbEY~(OiM3L>46^PE^6eW6jz4B<eo#dI-G;G*E<nwhw?~ABt=a%@T8hq;_Xd`I>$Y{
zgECZGAODTdyOitn_~@q@<SY3;=3kS|v-}GJ&)3|=0?V^5um{%N<GZ%Yl-&Po`#;t;
zmN&<-3{2h%FnRw&KXvuMVF|w#ms0H_pa3nY9(=kbXD&PumRtHp*@$wpRWKW3nsSoE
z7ivUT2|2U(0Da4cM`2?0yH~jRvdo&!O{l)<=oF^%n7cZJmNBM#uUeXDI!O|}RL<^n
zVfsBEZk!UH+If5DO^JA=2u%EmohepERkWN;buO4ppHCY!ZY|{~;g}Bo=DH>hHtwqp
zC6b|hWm9a10Ix=7lI3d)$(+l|s<co|-7ea0CrznDzp#o6_F8qcLM5bHLkQrwIXK&4
zb@<w+ZuuMA`VeG?*!mc<&bHKs(8d^h>>0}u=W<eohvmpeiYsc?bmRi1lRAwV@h(5*
z-ScSoDYy%x4e>ata0vXH=WMON>!0tF2kzc19h@GkUSn0^F>cYFDbz_Vgd11QSd3H;
zdNC6^?u<S#i_Q#N?b(nq(PK{*zxK>>?b{+BGKEJ%L0dUa_ZD)c0UEPz-4?%8Vcwlu
z=rcKTM$&WxQBx$pHaY06zn*_j`>w1@{=w;YSKD%0><Uf@h1zvJfo;kQoJ-}n$&V9B
zu#|SQzCS={Be{{%8im`Z18JRCryuGKZA=U59=v$Vtz%+b%iwNlh4DBk%N1LEWTSkw
zYUmkhvw>Mp<PXg{H`ysS1dNy!-~^_^dNshi{9N#tDGF7WtsF(bxJhg@{sh5F-OR7U
zUph?M<lpQ>a~?IPBHLjRE1LyTb%u2?2#ri`X{$ubh25#y%Lj$pKI)FGWvaSp@PxW3
zj-J3jWG6;zV+x91$xDm1A~q>@ioz3+yH}ssxL%{suIc_;`_ZNruLGv_uyWPN%nFQ*
zfZugbbQJ9L@yNpp1hkB4TalAXfmxvm*5k0FeEzLN5jQX>*$^CkU9J&DH#v)nGBu~&
z{>X4cmr_b`rH}ZUMDVG#h2aT260cu!i2mvBT%YIF9>a5G&{{lAF4Abq`Oa0|&k>wv
zoDnZcngyDld{0K4Oz@u->=r8CTx`It!YL%Vr(U07F)Px|Uo2G&FAit0dx|Kw+$k?_
z@1wk3|9Qub7Oa8FkPc@e&d)BuiDs>(*daYvMdh9q&$>NFTTv2Y-GMGv*S&#LBu1^D
zFpqgJ-TNUPcY>?oOxLK|v}+&$={nI>xaxD53T1>}!bYch@)++qvw+k5r{*g4vnzEi
zYtwxzk5=?!h82rQ3#-F?A6UG9>|kB_Z@S}Mmk!|b;{UX%u&oefUvA+M{>Fy_1Df-<
zOV%Hbuj|i$FenKu7@X1Ex76UjtU|vh{k#Y*kOnkUStNZ~i$K`?TJ%e|7Ut)rOVL*t
zn4v(LF9iHrbWY2<1;Q4sNFa4tbrz^gr}o9v7O<JHNc}#;|DO5tthty;g|N*0p%8)O
zUre2by7RHP;O){OXfgTpN|wnBBhl~OozJ+X-FY$8gp+0HqF&7!b!mlPObr5alSN^F
ts`vlK{+jYX*O;$tFp|Mg_Tgoj{bR*h4$T272LRmQ+XalCArJrn{{U;cj+g)d

diff --git a/datasets/common_voice/dummy/tr/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/tr/6.1.0/dummy_data.zip
deleted file mode 100644
index 14c0dbad654231310ea3ca5afc1fcb91b3a8bf85..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4222
zcmeH~c|4SB8^_1k#~$KEj3Y%g#!_UY>?zV1vdwE76l!KHL-s9{rN|aV$QCi;SVERT
zvP6~;vPZ|RkdU02>JyUFod4cG-e*3~GtXbw_qng%eO>qOf@@GxfdHgy?2#Sf>&y2C
z1Aq@;>4ZkRnp&FU%%K-_=>e4Yg|y8`x04G4fP!j<5&)o@B|hN02HC$oh=nu60_$Ms
z<Oq=#lYmHyONv7zBq0(KWJZ6Gg4*_p2g!_%bNB{eNjkaP=W`HOO$X?FfI^7NIx4bJ
z001B$03c320Ow$CgAv0yI)6=6#Ql?It1%~kPE>Q49HvH`Rb{GK)CH*g`9>mpDkz+n
z?(-4!9~{E)v%ob;3J*$|%=U&7eQP1~Do!dSj&L%OJ;q6$;$j)UnGH?IX_tiS#Wv5V
zs~1hY()xv_^Qf}T%CTqaBa8ooynIS}O(71houvPoR>-nd?KGF$GuS?186}wsm2j;?
zqXky5e6jhYOck*!iv-XFUv#gEfR*lR8{ue2<2jotWLBRsy@2bGW|1L2J6}+8z<n0(
zmsn?CT7K^Y^W}Zq$3hVub9Hewj#-)ER%<7$vV-`F90yw7S~oQ<MW9PeZt3(>Tw~U&
z&98D%zE)UWIVO1(pD{D#|CUW&Ea>>>FW~1i9QZqDnGNOw{gLUGM<&kK3y^lsX6lc&
zs#^;YQ_n+8y%_nax3qHped6VbiDy*1w8Q~ZVEwDR2-Aw>=1>(5oO%5?nxnxGgnKca
zrXy3foYjk)U}~K3q@wp=DGSuf!~l5(_w~e6CWOCs&4wyJhzl4VTMV%SJ16NR1yP_X
zEc<<0wZFg?WlyS_6HYH4uub-7>7s=cNcsC6NfHaqS8I59#aEDfBJac7RLMFXfmG4M
za538pdb*<R6}kku6o0!qIGl$uMpy87K}!^&iLS9PE4WBRWYurDb-CthKTW2V4kUp!
z-wrhxb~h(ie?;oP#c6{vyaMejb@z&T<gM2Y=H$HmqkP;ad&l#br`9-b<SwIaD3bf7
zo>4VL9{7m$91EM1y6ITsxFYnz5CmM@r&%^FIaJbI53S9eU+F??Xqh9w)O%8FN^#~s
z3ZWi;JI9MCg#=NGqvT8BY<|Ut*e<Lre+WXVf?rgE)$MbbQE|qA?nzM(Kc!cd81XfJ
zaFPg&ldsP=#d65xVvO<CL1Wl54^4tGwV290+uLRJj5>`|1z9bxO*>WhnB3@G@_3Sy
z0L{Wz-q|ockq`TTaMe-Yf7Z!}{`?W+=Z5VE(LgCgV>sbf9gRLDJv1d?GNik!ga@Fp
z9}RXrRT{ph6JC|y0<btF3<HOhz~}2_Ypb<&eD3a9mDCe41FJyO;Of`L+xDL_k~(lG
zxXaZ<U`l#Kd5}3@sa3S^-O80t=$pK&eCccN*4MwtGI?*zdyV8Ub-2}ZbU&}a)ZDyY
z4*9Hnv|!C08Q3lzBwbuAT@YY|ZO3`K>#6xhbjvKIigGea_8`&PlnMo1y65v8aMBW&
zT2gWc477Dwfmto|(Z1a0ME%=rXt^U=BxvxQIH&NoK(?m_KKy9OC{)pn5sWip0aYl&
zJi@|dS@CC&N7;O}J6fM$>&O|H$s{$HUR0b6IyM{NjvYLa;9Bx<#eTT3vawSDxAC=M
zZ<gIwt)b*=jk9vZ{g8Q=hWCG&|951Le%yT4mRQnk|L$PJ;;gM4ey20~r5uJ&f{|v(
z=wyJF862D0$Q1g<Ofl%KT0`hX3AxJ_o`;twrmq=lA<{=#=9dc77LIEryrK>hn1oS=
z)lE9f%42CwInmHj)1XUwm-uVTs=F>`7!?IFyv$sHjZ*GSij5pbF?3baF+WIE>s*a{
zs(~-z>Crgag~atZ*tSvc<${YNFcM|K1L{jxz<B`*N(VAfXOuXwQnb8x-wQSe2%@ZG
z$_aPlFQAd#GgJKopK#L$BXbzwFGk($C)X<0Ej{PfdY5|d$j#m@qQ3g-u$Y@3lehe#
z6oq|IosJ9xm-P>s7WynNid>Kgh}k#<9}T_-UXX_xpcKH)L&>1?B5^*xOqYOmOlN{E
z@+)|56_rMTxdf|T6po2RoE3)$a}Cm|Xeu|bf@&M2V#P}~hT@^Z1c5i~>@4L>o*OPm
zy6Y9?1A%zY?(XG5xhXeZDQr`4cUZA+Y30Y{MAtcnB`iBSYPIPz)n=O}nU&Eu*f%6i
zS&5oTlV7?vn4OJ!mkM^e{Z-k(%PSl(-(=dQSjmHiaE@bpo%tDDr_DU$qA=y2FzhU6
zuR5>v2#W{DC*K0xz%8BJNx$cjZZTE7d2S`~Cd=0nFZ6q3HjLs8o1b3b2L<`-HfcS1
ztl9ji_B#E|^N?{=UsOqSY$&u&8~xXx*g!;P6alEh06b04msAw)N9((Xi=8z=gHBdk
znNjg^BcLkXCnwVdc>ig$x_F+ZT5#Jz5VLw)1%9o&G~p92H1ZsOw#K-9xy#ZhY*t1|
z-usK!sA}$$^5Lp6uZ}bpV>{Woz7YIFQ~MK#9>qt%0^L~I6WLKq{?BAEVs2H9u^EV9
z7B+&Tx|eOffIYLHYBpZbFcQw6T`!2_MT)aIG-xLCglOV*!7EYeu{X~}YE--$dPwod
zJ85mqu(wf1ao4)V+5*G-*7b#QZ0}?tRG+VOmVr4>Zt5`q^1UayZsUY^Uj5Hr-WYuU
z>UrB({!}+-rSS%E(?ye%-D<^;k+1A`U2U-2y(mfC9j2WE$WOVO_vil3X9dhCl3uq>
z0@U9ofqxNjKVUQ9x}xg#fd4QR{GPOPCfFp+D5j7}`Z^&HVWbJ+m&@8bEw|4H-(bWF
z1f=>>5&yR2Y1d^DVOypJB6U}FHmTdK@m8ugvGmEL{xHhFXYTZXTbUp#vY9_M8Y1}@
zZV;|P+KZd3-R?-Xl0Q<DP2Th;zh5q?;<lfUTcQ25WJ9-{mgG^lckr#$JbJRJJNo%Q
qvA>o4&K~oPP29=EtsKHYHv8v}vm2V4h7=k^{9%Z@XO##50R9H=IscRZ

diff --git a/datasets/common_voice/dummy/tt/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/tt/6.1.0/dummy_data.zip
deleted file mode 100644
index 12fd52efb3e419825ad08016f7fc83c624754a2d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4270
zcmeH~c|4SB8^?#SldMA-R7mz+_6*s#F=S~q)7TjrW8arXwnS2x>?6W!2}kHvrj$bz
zm8_Aa6A_Uu%Nf-tBRS3a<NfcQ`8>}&^Uw8t?)!IL*ZsSoMnEbCz~*PH*c0~k;ropd
z00tnv-QE4qA>kOf)Co&^05DU)+-~#o_G1K4P)z{=0IF%?4Z3R(!R<lL`$(MkKzn+7
zNhnInNXSadN=wMdO328N89g@-vF#Ow$&8M{d;{#w*4XX!If$#K1H8;d!1^OP$Gs^4
zfE-EyK$?621`T&cNn*TwzHU@<riR^0s>aV7_4JB~DM6a$V)r<QGT`=y*B7pKS*{!L
z6gl5U*qzcWyk=AJA(dAwwwJCKR@-#r>?Ii;1c3sY65}X4*Zf9~@Ml6$StYV$H4F!T
ztJ|D3cj061>~#F=arC^GWw71~3ONvU<b3&6CZ0Sj^Q{$y0^#_cHcp+Mmx^7vZ8}M!
zqSX%V`9(15^OmkcH$@pzuNR4?cw-%`#*;1Ja%o}%_~cV3#;IGyr4mG{%dO~c%8BIH
z#EZ2jAaJ4rN7&QilbiCbEKH0ox=k8T_NP0(@J*t_%?67e<L;}A6V#N|u*IHxrkvSO
zaQeViFldIYAeuVWm;2T;AAQ3kvym=u#RSU)gl02Wt&c+zFYVPo8Vy<~*AQp(!&)2H
z(7O%uD$ZaVROR9OH>#32XJy<j>)B}Y3=a#$Bn&ef6??=XzFcsj?~^9fV#mz+9+@ki
zj^=TrldIbM#%67R1?5TLi-!6}a^adL0?+VV`y<Qj`3m!JD9<|eW_E7Ln{f}x1t!YN
zM3h&Oe0h=fKED;WL)jE4!$C9Di$m0Ejs`0~ebVYB3v)PZ$W+D3Ap$c;Q`H^N)Yoj{
zbQ{xBI%3bPx!7nDkd@^tonOP~1n*)hb)kr)#oTf7cuXb6(Of?ap5l1Mfdv>L@0DdZ
zX#}W72PjOr`U_PzS-$3uqHW_IPZzsQwWtF9cqy^hFgMI7508;lDyXHJ01II)j4Ot%
zSmEXI*yDv<5k~GeI3aQmYGY3u0Yx?Q?+e!O<qOKHv@6jRB=LU<QM=dY?_}W;gXo8v
z3a>@vtloIjTG!(7_}1aZQEl7nCOzY2t2cd@s|ylckF)AJzml-9zPHwbg=uYon%m1i
zJp*U87PQo(vli|9R{a<JyGJ{EPgxI{S?EPu;4Fpi=C6`yV33J`snksdm=ZPUCtmX8
zYv2QSb0+p8d*lx#HcRXmWo8dGLUBNt&UE7r319lKa@VC*M_2N?t1KcQAO&NW5k<{d
z(+|uSze3rclfe$xU=(ba*_g{5!@QWQJm%v&Jk~s5o9$@s8`{&D#&yptI3#d=@)KkL
z(s9OvH_qmk>eD9S>LLPim|3OZV64Qb<e=CJh+TUjGQHz&PBI&P1*cF#lE@NOQqmJ&
zSNDoXbQeYc$jQzvDU6X-bOv*xD=>~RXM^GH3a_R&z_~D8FIls$RmN#^Orv-pze`wh
z*JlhpDzJC3w4*QjbKD#=uBEcpx_QCjZC|%46nwC<@XGN179Yo1ChA<vJx_v1FTJQ(
zJRS3?@J4o&O%s696o%g``aI+ctQt+(UVZ3R&J&>+Prd=Y8JjAQWmKm+m15=)aCI*q
z3yas%o{+Vrk*<Yu^)v2?&2Rf&uJ{M{3}z@UK5Fa!+%vy88Cp0MLb<(uNVSt9UptJw
z7v_g-yL89@%l5yMElhJ%8$~R)sDF2=d0?FE(ZAD;<W$q3h;>!<F|}Bh5`uME+SwTa
z+4RM2a?wC5s+ibL;-OZ1mxZ0eXyJp}NA<)tjiaSa;+MtD#t-u)v4fl1sL?9?Sqd^Z
zmN&SLkjoBiPXyb7lnys58z(!Dw8;;Z8YGN_SBo>IpE~DY96*p~&*cyhy3^$9Xg_Tz
zPz6HgH?-o$1~~-Apyjc12ddtg>N{v1ylW+Zx(3B}0J%~P^1L#=_Ho>?g<H0B_nu6#
zWsRB>7jrxCpQ1^JCDd4xQnO5#VR1wSA)uXJO_R3cJzEXOtD=Ep=6-|69)JeF*q^-5
z;4{-HCDL%8)=hdrL%#@L{+{0=jv#zCOu_B1^$?q|2MX!$6$kKQr=P#Q&B;3$){SBG
zkUvuZ8esg3nO<ODjNZ!&B`ipFO#zwYP=k^o<B4;KVoW5y{Lvmz-@&WQ=I#}P*aGU#
zKpoaKS4*Gy3`v^0a71*>I_scapA>C#sMgX?<v5kw!j?wV)rqL9BKhU$jM~{ucd1J6
zD@G_D876G^pCUmKnje!)&qMP^nTBS$TdI0!T5_p_A~1cx7Z9H83iv!j`nVPzhw0V=
z=olX3=f5!U?rGHr12=db+fnf79*0?Y{i>4bvhFj#dVgQ|wMU{JN6s;7BJnnfbdzni
z#o>gw6xEDs+R}2Rzuc@D_PtkNeOTg^1mZPxMpOzFMbPO^-cU0*pnVw~uVUNQL+hET
zj}^RmjULE)Y(EgGtf~?R(V5Jfhoq&*?>~h;7(OB$@7~WhhfYtuEti^D53bRDA=TLU
zQd_-2LvkWyy>0rxnV`=xW!IN4Pc8aen_bspkSHi#512^vcM3wJKfDMV^}F9GgjNeJ
zH;<ebDT+S{JdQ^i4#_<%)OyYmE^DZ#;U^%W>|l6v9-AFi2;Mg?)xu_*5%KAjFpMT1
z0@bBFSyVDtny+JGP!LK`USwXZ=Hsy%Su$UYTh7sVs>+-=hk6cuSHH|Y$9r<->YFUC
zvc%7LzMjkpuQhMb?4jw6Y5WQ2-hfe$R9tb#qPt#Ddos{>*y>%Khx5k3Ape%@-n6Zx
z)?I=8MEITt&K}#n>t-(OF!>aee8Ami1@7;B7Qhr_^KskcK>clU_!j{M0b2prAUfLv
z{=*dUd(zGsVv95d$s?2Wb)q1`HYbW-E^EthZl5Q<!H5M5$PcF?{*dg`uFE3ANTv%S
zbysz^sM~EWDRqR{DafS$FulKL?zDBJOh+oRnLqU<BKa2v4{EfTi(9MRZc0eW0yJck
zw=Bx<m%CYU+mBRIXfG|<P?7_dJnHs_PfD$%C!4yX{r?mDTgmV2G2ht4OeUssJtNud
ZpF7TOXlkNz000B=iy~%^DAVS@zX6u;Av*v7

diff --git a/datasets/common_voice/dummy/uk/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/uk/6.1.0/dummy_data.zip
deleted file mode 100644
index c9001c18749712fe79c8d8321eadc6f80691b18b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4550
zcmeH~c|6qlAIHa#GYl$cjxZ>rMw%f@<jR?Gu45)=jLeK<43Q{TEB8_?CI(q@m1C15
z5|YS~s~j;~455fLmUdK+#kRKD|9*e`W*+nXe&?U}^ZWU{-tX7v{WdjZWMT(wd|uu1
zGW-1S^}+!V0l4^KFz3&>IN}`-TUxLI7>SYyhmD_K00)49X_gTHV49;{Om_?-wLOS4
z0pg7F@$&P9z!abm#UqMGAW%gJ6iR1wQjXKMSCpYMy5G4k0PZaHonC(rZPl!RK9ZA}
zeG$)87y|%M2m}C*&=0`-IHIu%cwfTjjY=Y_J5aBw|9zu65QZj_P!KZ6WPC{P5#iJD
zp*Y@;=3G4b#}BZT8#{pAVWHv4dwH#|902l2se}bbsI*xelUD{psx9X&`wk713%_?%
znq5{CPNxW_i-#sM!i4<~r^^|o;x5LjYxOerhb2aFoXu!5fVpP~0ticytRcUX_7%Q8
zi8if=rSvP7)|IDu=Pe}&&&S9g#MN;DJ?wt5os{6~*SGUv5f4jL3{CEF^Bd;?fd@!j
z&y&+QQaY}JVh;M2=_n+FxC-yx0h@!f0-EiQioozA^<i^wYH!^k$-8EsjwHu4w+0wp
zUD+4!&F4=!SVAVYx`mgX5+Mf-w`l#aP|+a5iypO_O>iHF?8PMpH+CA|n|pirwNx)r
zrOhwkUSpE5nL#@~igCR|O!;!3T>J;6%lXplSMv5HkB+aboxYjsu+q^v<X?H)LJX$f
z-GU}sRrpbx$h;jn#{Cqy%sKCDMjP!kPG*(`^iW`hj$ThRU&^}EbaimtnwyX`JoR~^
zB@27~y#=w2T-_8KN{*2ar-_|I6I+3Pv0YpV-<R6X#10BtBUpR$hbGLfE+CD6Z&lWo
zsz&Hf;aELBYQAF03mMS}IVpZ10(`2-p;off7||!~fG$cq>eh0!^`dgty9zj~lT*IT
z#BPRIwV#G^{YOnGM&ZltD<hS`yum^&RQaM04$+P?vk9hsViJE?Bw&3kEq)>WT=}G!
zy)r++?yxIj`ZyeAsl$`P8OzopaPFKQs=cSr>OAt{k8%*sg2yR3Ud;P5`k(hj%i8i8
zkPA(%AGd*%t|=C_cp!!Lx+;ybj>YL>;ByV;IKuPZ^ENPJOVN06^~)O<TA$Xw(i_c7
zmq)%ev|YfhHyuKWaX@k#(lVpDYxLIq<8cBXBj9MFLeH@id9KUUs~<v_Yg=@=YYnUc
zy}TcH%{SWJ6f1oyIF{p{SaMXhY@$X<Z)LBj62szaS@M38NnB9H3bBMsdFmYg;=?)>
zQ`LaR7bY4~J($QOh*X_6A<SBJec$s=RU~&W{!K`Oy6V(=!|4(Csp_Clad<W@VAc93
z_~>U}`l7Wy(aDTo!sdrqn${9Dt(EB4n&9Y(ra7Ri%U_Cb6!t$9w`-y?mQNEDF_3Bg
zgn@*x{`3J|G*{^1+)TdBt2)G-FnH&un1^gX9ysy{<XF<>8<7z%+7xG-jM6X7rJ}|I
zo6|>}bez%&r4Nz90b%mXFPhixl2tXyO|Nel8SXP21&u|A4`w*2IobNYu;s$k0^@b<
zoTHx!Abi>646k2ePi(u=!-Ol_vz!pz1u8!4m77%AXs|1RQzd~%MOp{!Q+0`N)xEeJ
z#X@P|)9levtj;>-hO@9@>Q{Lp<s}m7t?nODJ2)nv9M`BTxaxSTxt(loeWKrY4HaJ;
zQao8dUqdaYOa@L*U$-%uub~w18=UI)2$3ui-y_M9#hDxZUgL?5+yJOzQT}JE;ht1R
zsDl10X%{zTnLLClSEN){kM8jUPd%j(rUF21c0pd%$USCk#|0koH1L}uE;HX^^Ci3s
zk#)7G&_!iUXA^J2m+&KfXO?`|)YSUTwF0r!fDvO{4YlA>aQdim6h-Uk{(P-9XdU*o
zFTAzukC2?Z;@7=HYV0blxfksYuY_!oe#6tZN`IJs>G7_<_`g)&!P5V)`v0uzV$L@Y
zX3`4Y=)axhIJ~>7&-Y}<BpPCcp}Qjo9?LQN0<=h@`A{i@5A(Z0T}5nFEQia`u!M})
zb0#R&t5jp;*I%U4ff2A1nr2-e<cgze)2=bzL8dAU86R2zyH<@sPGIscG<gNpe7xeA
zGIS7k<V<;jS-Q&%hJ9wZ%F6lBVN!xYN|CEsn+hgEub6|CtL4`W9%&qs|E@`=omgX_
z4N2<~s}RtK2ho;=G`LdWqC^Y|J&>8=ev&D_YxO0cP-exVLi=U*h{+%J*)SQt@E-1b
zy59^nS9%Q~UcvaAfPYfAWG1CRbZWS7mD<&PgPPX4L_|$hUqpW#?8XIBrug?$Cha>I
z^{BAce3*BZxn3#0$bV6KsRTI?h?%s%uONVPyC+hTpIsp22JsAOR;XB(n|@Rt8R^8A
z%9}Tl7KE*{<g6DcA>kh=vw}R|3wmNN&6omZ8IFogcJ9e8OMZDoJ66-VqLV5wqG3+f
z+#``E%kBhK2t8$xYQIj_KJMI6^D|M@t*k`7D;_>LfKA<%K?ySVnil%~Ct?)*eSK+E
z_B{WZ>NIBcT?pb6aQhj*Bojc2qosi!Ee&Aw7e5;Nt@H2T0hol-SaT?-WnfNjl%eqP
z^5R&p_Tt$V2@e!+cQ}><*#NwKvTTA&P70L$K=6&QpQt8K!li8?=DuL?eeUZVD2yA2
zned>xIo#|myUi@DoPP!~CTlxZa`A*?r$^_ZiAi>lPQkTv;nMBTE+j7i<<a09O7D5h
zvmbEP65>LUZO2m^)&N>n>Svphi@;9tqgk)i!7m@UC+Qh_l5S^30d>w^Ud$F#77sYR
zOaB8{$K^VPfn3J)1QrCBwT~wU-l@?b&X%}Y<-eh-pNUOJ^~EFADpskL#A#<Wza>32
z!cz9Sr~lG3RMl_Sk!#Dh5ci)22QB?(R6jp6E?;A2Yg1XbcOoKJ(S+^Sk;;2lUgl8r
zWmy)6R9wI;-H6+1<?V^gGqtQtnx4|uN~$U<yW_%`uS(>LUz0rF1v(~lO4ADPdP4G~
z6r@ZjO<p4@H&d=fh>hV#;Z?CeiYmS5Vm_fCtl5jbh*~bGBR=Aqoy%xt3K}x@mwC^<
zRHJ=|e>6B~Y?Sqan!(}<cfk0GsneU&$cFWA9chlyul`r<(YWnB{YC<OGyE_BMHzRR
zp#;A2L4aA!jmK>h)vhlS)qe;m1=tL@rm3?%;J-{)-zR-LTWykNHFM}BeV(*vu#HLU
zAD6Y+dT*bzzQAZz3dp_0M7wR-ryZ9?gKe3*Xw)6m*`#i7__tEu(d>av>R;{u_nF@|
zo?DqCOms8<W=1sfKeV8xh8wxKx!Uc0&sOpvGu`CPF6jHq-Ke<j$L>~WCoA31Er&3D
z)a@?6mD)$EO*)1B#`FJ;{iWo;^_VYgS|-y{xpz0+?7utCPUu}U<p2P7+80a9o|Bv#
G|NaCTG?D=T

diff --git a/datasets/common_voice/dummy/vi/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/vi/6.1.0/dummy_data.zip
deleted file mode 100644
index 3b3be9d43dbdd5f6d8712d6a43af70ef1518ed60..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4094
zcmeH}X*iU7AIERYkQp%~*|L<ZF~|}kWEqbojmp?!G?Z<|I;AN~vL|9JjZi~L_EQs;
zQ=G_FmX55Q(_%@DCCAgqdYyZ!D<(P3dGow_X0CgA_50la-}nFh{l9ir9Gp;q@f#@#
zKz)AsdV>KH05;U$Kg<n_3Be!^9pnKx^0psvXI!BtVE`NF1P1_cPJ%bPb&JR{7dc8$
zIf@So2n|-zR#jKgP}5LTQP)sWS7)(0rND#f6XjW~PVoK$;J;$H-scN}tKI-Sr+A=_
z-DtonvjM<%b^uUg9S{<P@y4l!1QR|tDlN~@eLmCh`$j!Y)wgX>gU9*?L-T?D=>^R$
ztcSdCSSdQatSj?id>T6(vQf~)kE(cUON-5iZBD1iU5jLV;T+ffvABbe$$rvQY)-UP
zXL^o}ZgLCW(qJ<7rLfq3vAx~PWWYntQfFI;Aztgug)Iu#j@1)eqN&zO?r9K7b|-oC
z2t>F}*>r2oEn`s{*41&N`&~_~>H(f62sSqCno@u;c0)VF-$<i?3bhL!?s3GjXJ0hK
z#29TntdNX`ODu%%$rta+vfl~ATvIo9rHdP14-G$m+c^bsk*{c}b2xn2G?BK*rKu8!
z44k9*b8(^I=Fe2E8h8GgzoSK{<)K#h-q@wTtSbhP`)Avf%X;QivxY2Mb)V5f!{arn
zZAA0TeRuXx=uLUIM$3~Uf)}!8bQpVTMU>G+<Ny>D<vn<*vMvhNlkihXQVeV#8Np*{
zuZ4*0n)Kz_5P@Trgkw!rX0?0`Ja1oZFp3%*OO-eP3FLoyZX;!g{Wy6x&&zzOz@{V2
zXP$FVEwoZOc9;5i^;68{lic359S=21MSCu9dTA`Jlv63H=vU)X7UzLL(LyU58XgxO
zR_rd*D90A28tC?OIhy)*dbq}1DSdq>O^Z_9r6k)k&@A$6A+p`Rvp(jgbwZnS4$1mI
z7ZW92gU|w(8ssR6WTY~`Cf}oz<%64a;vN+HI*3mFF;bZkKz)+p@jTZ{u-6UG=5xc{
zqg`L59Ud_DIzkX_x+v4^ioo6TT!!cKw=Ehb)JV8l?wkpp<Wy<OlE%#z6}fbN;2&7k
zamzdp)Zsm6L>$0wzX2~z);mtX_<4i#!V~+g?+hFKzzY6$>Np8?-qI0=aTt37$@JYO
z#Xlq!()17k_lcDqSto|s6j9fZi9qoi<&DEq3+c_qi9(o0nYdxz5zWC<x=-G>pAHMy
z&(Wten%5CWBUH&34aR-Mw^{^SI6s%%n#6lu-~Tsxar-9Ey899$If()dN2HTtxzHvv
zSH(y#Vs0n(nip@*Pm0Vg9MyMh+iZYLCaK4Y?1OijRyjBDZmr_J=|$Bx*0M{3TU}Ku
zuDe3*iKOWEXL=IT^^>-i&tP0qOn3b1+&(rrJLr&?`F4pOnA_7UH4_qca!EUvp4EYW
zH)czl?wwttd*?~^N6p|GLJ23zILj!a8}3<pHIhw`$v5aEt>NYvV!WXHK`XPg*z7CH
znpIaX8yvolo<|Q+o#=Oci=;^+YzD`u-TM?DS@=BZY9hDp)(FqApHHc9F%Vpy{+Jgx
zncQnhAI!o8M9j@qe0s@jo7EyBSQjzGGdSd%e(Q{~|LgZZ(637DV|WZ`K&gLs7U4sV
zdj|bfz*GZ<0nL5qg!GK16Xwf;Q)7?^c14dr**V^?<x+&4*d!aDR2Rv?iBv+MlGBM#
zY$#f&dopuuvURh^_kOrN9=nY%>|LV`Y>z<F5atAV%L4(=2f52BcaL^WAN=HmrQA`Z
z?;0_vF2VUtL${9ph7_$*KxykWw@OnS8jrKzWXpz)iupwu3DsS)Q;<!r&lsz`8QB`y
z=145DQMw~$p?zPhwp?)(b^Z`F$b@%?bIw^W`vRTF!!UAF;DcIy_>}FVcB4u@(it5|
zqgRCGJ;?XHQ}&2JVtDrOvkEsvC~=t}|E^=GyM=#gzJyxrmM<5UJJPoFQfhE6YT)%W
zPCtT-tv2LQA)#^NGo7{Pw{XAFj8}G+Iq=^br99Tj6@lc$bLVA*6*2~Q36sj(eXQK0
zr+3K5qR*V=jkL|M*3vbHkOsVFQb`py*L>$sKa~;PobJHqF`b=3A5Pa1acr#S+`Xax
zA@t<KWfBd4*X?2RbWQFQVi+N1kw81j@eI$l#KCOW#I_z+2~chUP;PD3?dpwNJ9XD_
z?b8nMv{MrTUd;tUrGP{9TbuOXz5*^T`N?OJ3w>TtB%`AeATm|kL(g6?eo>%PUzUpA
z4Yd8fknyJlP3Z9pIezgRVz!llIka-{_JU%Z=wis;<-$9#&X}$^6*T1+9$Lb<9;pPU
zFp}(D3V(6+b?wy~$O0ae68`otNA!J~(}Jj_%1Lrg4u{S;waEzvj^q}XCUzmaQ8IV+
zphYI}E|}7D0Y|c1?NZx=Qe4Duiifo8)PE2QH7$v{j>%S?l7mhdQWbcWR2zNzkTuR1
zjyC;8r%jwbwT(U(U}%t8Q#+39Kb`xOFgs{iZUmT2{;pj_ky1tV$$zT0uGQeq>8so$
z)z`O0z98JA-`0#IqB3RXcuuN<`&Goy1GbeE%ZP!iqp%k1T6|T)8_%2t8FH)%&1~$F
z9P4Fok*|C>Fk#GiWlG|ZFG>7g6qE&43N9I&F&F%YRQ_|)+DyJenlLV4k@PvCgD^%y
z|KYM$q9AiV{{jO&0~AJaf{#^wT6b9>Y*ktZsq3n<LS+u1tErbjv$9D27Ds>1TpPhw
zGhv)8Grt=RAo&Mj%+89j7gtuxoEuh?J0UESS5m~!m&<S*^I^9dn##>Gbk)(v8kKqG
zuBJxtuuNSuf&Yp9#qw)s%ojGelfkVFgR#v1e&Vc$hHx=TL%}}|+&!m2008_Avu?ye

diff --git a/datasets/common_voice/dummy/vot/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/vot/6.1.0/dummy_data.zip
deleted file mode 100644
index 026a291767021b7c416b6a947fc0e8cb12e5ce05..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3339
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_29c(h<(EKg
z;9p>e%l#riZ*wqQU6d4Lb;jk*D<JPCBLjmW2{x1zC1&R7l@ynOgKE+|yM$lU?9hYi
zZ2#%pBL+M!=Y2%iDNI<SzdABNWUpY3x7!=vhA&UK?nG>H)O)G0XF~HshR+q6LH*Gu
zH(dP4RxKVB`C#K+yIYFZPrgOg`|skuvox)eJ?Y*24JON6t6uFZ`?<({iO7v?XS3=n
zhA%o!Nj`0H3v%4JW6j*l2Y*a-?LWB5JvG-z$kyBJg-4T<&qem2vQs_>Bj=pZ*rwxo
zB4ZNg<b!i$ih`2;j;X&|)_ov1_V(ty#L8*=GJ6a)<m0cOJ#cMW(VL*Gna^L#sPJCD
z`Q7JusUCOFf(8El2luNw{&n7T>Z0Q}qstdwIeh-J;`-08?GL0ui2@O*?OP&lIsgOJ
z7l`#q2-K9+GD1PB+wI8JV8Fw2{<XTd$F!)9%*@qw_ht5+J<wB?{N@sm^S92ahXNkS
zJ$*Tw``Y~<fm(r*$2bGe3f&8xQ+ZYU>kn}1g*eo69%}5HlHky?#GFiEMoCS9I~NrD
zqbUFw-;ivDoB|9;2-uR;;u3h6;>cChO9G$*fH7Rb<pa>&H%Kf1@=G#OiwHUOY`<?7
zumEVi{!X)~>7adK&i#em3&lC6sV-mUDPuHG^Gn?3w`ni-u9|Ym_zP!?oX4ZuOua4J
zHZHi4R4poQwna_odhVX9ZsLt!YyZkVc1c<!nkjCmdQYU-w@_{0EU|6=j?eC;pWUQZ
z{lZv6ed+0IZS$htHgUYTApLf4x!2@(sbOLV*jpt;Uu5(!7C2b2oMB5XnHWE@sl|bT
zxzX&}Y8E3^o);&-Ex1{!>A7T=&ex7;A=!oQqHPIm_H(A4?GWD;`bO@1Wnb*edyndD
zeO^Ybji0ZsxHF<g?Vo<<Z^g>ANqr5E?cd#<8MSzCO+n39_7A+M#lWVi-tD%)U<C!O
zISGj%GY>O=;YbU)6FmL29R*yjAJwd33i(_hrM=wf@5_Tt+a~T{Zg==3x+nR1f&QhA
zv(rzwu5jwo+3O+2KJ)tD1J5{KH&=!!8+TgDet(`iHU04E(|>-qG|y;0qvOxkVUl^Q
zH9_RAl~30DS#LL%t)J|)Uc$O9Dfgx3))ki`g_OnD9lRdZ{rS=2w|a`#p3OM8uw8&D
z$?@Tx9qiiz9QF&fAJOQDywDyZyHNGfvCtrw_3NUv+ookk<?UcT$y@b4D(}>VcgvFv
zUVqzIyXMnRvp=`>cBkH}-D|J6{NtCbH6PvF&3mSuIJ$d^w&t#mw~sdWPf?xn!mFe8
zrfbGgrh~dSytJQmCWT2X)Z<;UD(u>X?Mi2lT~Ks%UKz5oJ#0tH%e~yJTRN^eZfKlf
zr0_rRkoDR%<v%qI^QUL-{@{1c{O-EjrRIkU?ydX3{RbPOCI@8%U><-2GZJD88dsV5
zsMR4T!Z2D6j7)OOxEdf5P}>C<-a3MqD9sU8NOOb)163Fh2L5$)MKusl(}b{1m<<zz
zO>Z3+6KNBqnF6#8)J(zZFGPKc+DL)e2Fz0oOBz{$6jDIp^cToBq$Uf{TAcBNuok%x
z*M|lt#M&q(;;n^O>Vz%Dtdf!a$_#9L66;sA0t9F^mdZK68<ZZg`_}-)zcDPtTaBo~
z2zwZ_0z&pM8!Pd)B2`DkS&LdYBU_uuM!dBcMKwOlAvqtjtbte#OwYgs&Cfx+<>)03
XF1uNQ9%5kN0>Uz2Jk|qkU|;|MQ{-U}

diff --git a/datasets/common_voice/dummy/zh-CN/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/zh-CN/6.1.0/dummy_data.zip
deleted file mode 100644
index 9268b85991f32e3fa532c62558225ddb5d60232f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4204
zcmeH~c|4SB8^?zkWW<>|LLG#t45P7RIkuTJ*@Y}&GL{*Pvcy<Y#B^|yEvYHXWIZNZ
z_AHepDm6+W3N4lx`zuo+Ix|OaNKVZ^?_clC=XvJ&>-s*|{kyN<{Tow5AP5X#eg^Zr
z&Hs7$a^nFA1F(J`9w#tZM}p%XOABrQki5gpfqD4_@&MRC<3IoaG{Lx;u3JQMbrENO
zn6sCUx1TReSxz3N09Sy+<P~7@@;_NUBhzWsD?)#=dXO7T$7G4|OPbbt9q=<6-vH>y
za55LLLi*dY0RRWs0RTmo7z7_jyr&$&*Z-g8N+4@F%w5v@cDY*OADc8FAa@^+*0@#7
z=k$_q&4aNuH5{7_&ZbF{x79=TQn*kOje6|$7hZ=UAOntL0&@a#!_P~7FH=cW(v`_w
z;1ibl*y+uQfl^!#4p1^oUuPw%@yg_G#2$t%#%kmtthQwT&WYYo$_-9`&4EKdue+})
z8-Gqyst6{H6nIA&-0?OD?O*^GG(t2BlO$p_Xkw)5Karej^^F0lJB9T-t>g87gI*0Z
z?Gk+#bFNa0bQ`B1r&jTcc<JRdK2#KhnrOIXKVK;~o5V(<_q?Y)g=*vZd206=NSVCL
zGo)#Qn8)?LxZU>APhUduaeX*N@~dNoul!u`LDMCzg8eNH*~xMEam&*t7p!&^(4QTo
z2MW+;P2;@UBJWxg_#j1yY<GnrUXsJZ&<2mNUbCsw`jEacy?k}AxsoazyvAT>Th?#1
z#a2+q!ZR21S3SH>=XcW{T^O1>i2mr><Hg*J%Ti9!BU*Pcq-0}ADbJFWSe*Y4g^a5=
z0Lt^DI{$-7lvMOe$nTx!(Rn*)Y&T4ZDgYNoSr)k!^9x$a5XSdy;&(5`I4d^Q!fgW<
zL%sYa`|T^al6l?z+9I4ix(iB8;O+aIxW;FS#>^wDrYUq8kC?+Twz2Q8pf>71lYOEr
zUFMD|vKvl(kSW<FDFXG@Feyuu6|`j|TB!|)LzD#Nh3<)$IQ-g|q)QZ5J1bAxn!Z2X
ziHp;yp}CP0;o}?~dnD136AcDEI+2gGyEtH_%gMj1zDPD|`lKgNTC>n=aw6LtEaI|X
z9i{Hn@h)xT@uyEAdne~_2%c84YZ2@q6xnNiEPXTgiJlqIAlWb5b1-$mb!Qjc_+(gJ
zT*PR1oBOl{@6L{v%-U^pP37A$A#%DMW4)MYm8-R<Fw2VQlby_!WjTo$h9dopONAvx
z{2h<s8D50Le$|CJT;CZmzqp6)iAYFzXKH|1ZeC4Zb>~EIczShpkOEvKK_6r-Q8~pq
z0gB-XAPT{$5}q@TRP7PFE>3G=lTCZNT*^TbkKLrpxwjQ1>i#)*x@=ogJzR&9Q&X!e
zP82P6Kj`V{dDdc*LWwoL(L<5-K(|=m5oJsAnQ4-_22~lyLW$(_3g)f%qlMj%S?yB+
zZrRq~P#ic?J}czhSI3dgVIf7;w4h4a80>dTP;3{<XZrxDIs6Cb#jW{<?^<Xs<u5*+
ztuam3)VS?Cxwvr?1ya}Bp<GFuo_*8T`=+md2B$qUN)M?%r5GGC-9JJj(D6YvpBIOV
zb+|m+$E(zdLO1<G1$6J*MkUtWDc)d#aT3r&n`x#)*bPUUxDqajJLq5wtLWh4MG;oJ
zoj9l|WjVXCC}@)frK}+#A%=7fWwZ%s&XX~yZG=#pwfpN1Jum(0B*y5mRu8)N=Iq?~
zcH*mLJ(*^?Qcnez^d#VX316jL$1DFY<^NF1y%m>6WEo@0&+@tFMR3FU{7B1|IAa4K
zb1cV?Bo>lsWLo*j>u`c~rzG&omH23IAyLfc&ja4lAAwO%!i5B%qzx#zO#AYv*8?ld
zl&_1npKOmTh>n6b3m;9+ZLWT48rJx}yBTR?Myi@F`gp|n%;7U&T@e?^g4EHAZngu5
zVtTq^;;<3D-wQ!Q$|a%O*lDtzXei=gYK4S9ggYS=Dgf=sN>mEld*1KSR@8kn@y<FF
zRHQQA^e6!xX*I9hulfffoo^o`y@YZf9ECt#6O6!-NuvRZSp$@+TQ#K6*g(j=dXtxY
zU8e%APxA`Ix#oqO4w=6_rrmj_a~4~j^s+tEY{8%2tL{>#nwMmg^FYJbU^WR3n27X%
zKCd0L5Ocg^gGt$Qau?kj{YtxKXU@5Yr04gB`rgcW1!}LJu}cY<szXe_WC)|i(v0Cf
z*Sh99qt;?-Xu^)<N8bo|LY<kJaUFjt6HlH{L9lP!)ZTcp78E0PcE6GV|7J7ieV_8<
z4{AiW0rPJXVM%TpVP5o+?A+;G3pt)}A84ML*(F5^mW`i3t=!376RN{abIY8%^*||2
ziC-yS;#wju1D%EC%ZH}k@J70YOK*8Pq(lHlNWIs{F3J#wpBD!0ApUB=KByWM^nT!|
zIAuzb2RYo@(eOcd6PzZ`tD05VuNZ#34&)|*h!D!((OH-U4%Dmc>%yf#?jA>MFgMfM
z9(+65GX4csR_=Nhdcygwe1J4&rWM<zCui5}doP7xydW;>zpFrJKrDKUIy<Z#S@#a|
zHbdkCS81vBq0z=O^#iB&;H`|x@v(V-6?=*5Hsdw%B2@p$zBhqZ*RBN32V0cqW(A1U
zqJsxxBXza0GLyZH3uKscP|ZU3i5Imh{W&)ZQnjUmR|I3GQ?5+csw}DaMMJ#T>TJmr
za!nLxV;2Rkm*|DQyaEI;j$}Tr%J&>g`JRP>l7Quc3rN)Jg8!2Be@t4N^p{EF$jqN4
z{WteBc+A}Y-OiSy;_B4D#AD1HAS;ZE@v~y7*6nPGx8gql1H7(A%iz@+btU*IW8Qxr
z{57%u7`rxzt;AwMtYE(}7zX@1k<8SPxi6P{UY#0N!j(8#fiGu>A2-dM!PQ6HO73Sa
zR=6t;yq^QFw(ynU%iOGh*BJW0;g@`1t@kY98GD<tx$Sva!GG&S>(MzFDgppt#y5ho
KnOYeP0N`&#HvbU-

diff --git a/datasets/common_voice/dummy/zh-HK/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/zh-HK/6.1.0/dummy_data.zip
deleted file mode 100644
index bc6d96478de5c17a91b4829611a71ea48b5e112b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4210
zcmeH~dpy&98^@>3$cl-@%_73%aGSOW#h62xGdY#Yp<xco*pNs$<WM0Qx3e4`H%hXI
z3MEBkL<i?%qMSP{ZlY(q^@Pe@`{ViV`Tbts-+q5x@89?OT-Wve%+vq~;sP+QS67JU
zf4_WrKmc$6&WAwox5GJ*9TY4rHUfYdJIw5v%jX0Hzy_KG0sx>X#=~^oAQEeX9P^bw
z=IKrJA<6GXAmvdzQ9I?4D0w9E7o%tAIIj6b*<Xx)5-+c3u*~4Oj9zaC{EUrr0-k3(
znip8t!=Y>dzyxFMVu?Zac5pq8Ad`ImPS>RjwEcV{`rCBTTCn;+Gk*3bZ%GP`)2Z7y
z5!rLJg#^vQ8d+GHDwmKM^g%KoFeC&gp(A$XZA1k`t|kjY{D`Q8%!V}xj#mrr#~;xM
zInHUr5f*<_g>Z6W`)(Kd^c(%KE#r~(16qP;hc=LMO_rsVcF)TOsCJyBe$jxq|9Ktv
zLL4r}R4Qa9g>y?Xs!duqpD!)|E0660!K&h3xTdQKmDbo6C-b7_DA|&`8}y<$G2-3=
zu=j0(8Kt;=yfkUpR;>e;G~X!|doOgChMKJ4Q!Uf`JcWtc85*PIvz38I?3!Q9n)uJh
zd7^{af<;QEE_b=vTZetV%Db25Jb%eLw*UQ=z}bre<*DZ%wtFX9w*w=W*yq`p<+GCf
z!Oj<O1xDf*7z@c#@^MbSKTUnCk@0;~{<HNhRrvWJS3mn9@s!!AQ#^Wj1CWxJgmku^
zB3MdjGf+yx&=7v6z)Oj8^Un1x9Eh5|d;X<5Ivn)Gt*b(KcU-AYIc85n3DU;9joT#t
z+9cGldC0C{Jn<Rf!p9{18CmS1jjt1n4$<?e0d>S&W9*cS2{Cja-xq)xsA$vDo6=N?
zHqY6bu`9o*DA<&X9dyb2Y>Xu@fInL*iJW0pr#B-*RIIa!Ha_^of+PIgz00ja(%HqB
zlOj%+G`HwBwA?YeKInh!^Ncj^!e5O7OPFUPxrPci=%{fX>Ov`eb7Or=rLsbwRd=U$
zXDUwg@x%~fL1^@8r&Rh#ajm0LU!O7O4vIy&u*ByibLuvyk5uWth=-1p{==h-K1+dR
zr^Y<n{2vZ34$gsBl)+{=n!nXM#%94#rJb>qSyIN=!NZj?MVxS7^<lQf4_bq8(dZln
zvnjk?$<M4%wjkFVSfj0IiLK5EU#fM5^+Z>AiG}0{TVE3*kN&O${-D@P5UuqlN;=c4
zXF%_C{(#s`J3}DdH}7n+&Y3%;uGZU}LGTTNdAtu?RK9sD;#M&&Gc0e<qw76*+xd=^
z;(b+-$PG&IW1B7;;dKyg?uewk7~08an~D_aF=fEK6vyTsef=b;de(l~jwFM2yraq-
z_k+%jHn)Xt$<;UH6*xRMmwKghpTAe^wR2h5Csn^Jbb!}i1FbI)Hg8Da(l|^IgIC{G
zow5@N3fxom*|)k{J*XoB?kGs&i8!A`b&2W$<u!6Qmsqtl!K#w!X<_FQ#LaDC$J{|B
z*Y{>M=JtB2Z4XsO6{Q~pi&01@K`vyDW}#96RJ?$MDX871Pmpl_a2g0j6!Zwg`-4=c
zi%rU>x&8e9Y*+p>HS;dIde7;l`Ph2T6%m=UZMBFfmP8~wk;q^5TW7TWU%&sgewH^m
z#ITG8L~LO7Z1N=IoxFc0Ag!fazg80p9vG(|q0mIA4Z@sgosNmD&5GEP)(}5Su0vPx
zx`n2>dL=`hXRKuBcMHdt6R#W%nC$&j+q5$iuOr(Kq8B37GIrT1R6lHkFwK4!%>weM
zH2qUzWuB&yq<Ik#5ta^$wwMX0+Lm?HNycc|$#Iwmt2e>(lyD+2=*g6dHac<;!AEg3
z-p-Hk1=Tgxno0-9aj5T$8YjLX6*@dpad^aYM3>Vsi7!i%to6V~x&X&L+;Jeq6?2OZ
zr7@i5xV=rqSYUde7P~F|)$=ld*zlI2+JuUz;N+vp<9?(Hy9trxQ01q+GW)cj3a57Z
zcLtzxf|CueG<zkt6gJ2a0ic83_r=f03YgBE;``(xJs)hGH{yJ!a>C}7OSQ|<5#1oG
z=;_8c_sE`)JGJSp3ZtjGWu`J|Gp$Vu@?B}eBOiu%W(C%Eok?|HY!X8?QHE-&EOnjh
z@$ZM{y3N<(#*cy4wWRx*&G&yGR|OAf|K9uj`5oZi8wM!Z6DSTbG1+sD4|Ke+dO8w1
z>e6+U(dy0BXJId~mHhO#mN5DPvCp*KBt;@ID>F)d>oc*ay^kAubxmx=<)?g3ckh%$
z{u|Mn#~G<6GXmK}8Y*&q?7d_B@Na-7UUR1#K!Te)`J%q0%mKYaV1cr02|OtT0Wqp!
zJ~lfi_bMf$YGTa8<74BP#VtX3d&?bW7Y0IODO$~V89^!_3_Ru`=bm;?w;OS-k8n|H
z)<yYvZw6k5&FxI;WLqTlG&=Bb!XN#o38IUYC(K0dikKkUyIUfZGwb53s3bIe>Kad9
z3`f8O_jC~Ph^PkqE_|a*>1!97b3YL}2`il=*?FAUMlr(PHsq#Q!64yOaCEu8kd%_r
z-SG=iHE$vj;81R)yB#v#$p9?6Js^iSG%+oCb~ye<Oi8{}<f9tw%x{M8`q%|5s9aDp
z%g+&3=5IeLeR#9*Zjh>WN^wF>&ID2Sl*5T$n6@qPMz2?yM$po2^pY=a|FcEltT3|_
zmOVCe0I&AgDlC=4mqKv$T<b8I)$pAmXWJkGT+i1Fe)$9_U=qW8UE}iEmt8&!0VMz{
z0T(g4YXkm+<NrD7d%wRzn#AP%BI&>GpTT3g{~zvb#eS~!{>wZ@;{tMnIT$~yQnl{R
zmU*lG2{6FxYP14g>w8y&!x%&3*TG+1@XxW|o4VCl5fCfbZ>AIj{sVhwYQU_^l|8RD
zBdg&r*;#?FSd*V`n%RSE57E`!91d2vtB%xP1Ft;?R)Y_2WCi@)sqjzu^=8j9o>AM3
e;>JN(!GAl6)}yn7nbElzcQ~V%UN9H{z<&U=p5-F|

diff --git a/datasets/common_voice/dummy/zh-TW/6.1.0/dummy_data.zip b/datasets/common_voice/dummy/zh-TW/6.1.0/dummy_data.zip
deleted file mode 100644
index a825b9ac44eb7046081e28e68ae3aa8aa7cc924c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4052
zcmWIWW@h1H0D-5g3xdE5D8bDj!;n&%n_C&5l30?c9~#2Rz&uYmFad;1E4UdLS>7@;
zFtEG>5&<|3Q9(5%xlA`Xzo?+JSl3L?P}j)7$UxW7NY~JiNYfWBPC{{_7Lle`W$1>4
zL+s#RV28^GB0!IGFkD@f6m-(oU0jlpfq|Eafx(z0J4%WYGxPLHip#*kHEEt*!mnv|
z=)rZif3FvFpg`;EkFisJ|Nku4^t7n_-JP>eW#%(_9kUhNLZUP}6qGb`bahy2r8d8c
zt`M$a@SgopW`<?4oy_$s-&|!Ag?<V8Y+fkFnKt>+lGmAB8)t9dZxq5Jx^-foqQWy@
ziE@c!Q`WCiyKQ@NPw&MHoz^g)pe08Z8NRr9W6Bf()nX&7NvnL~y!)p~FV=N`r&K5%
zxn)k|G@V5o{3IT(aSva0N&02S6!WD)o)bMAlQ(D@$WC#dy~RzjgyGGsWw)=bo!GVc
z;=hN1iZhI?_S<Es3AJ^fK7adj;!de~@Ak{dJ1{iY&0Bq4Qa^BW>r3a}#gWmX5#N_@
z;BNfyv#cy-_Sf5KdnMjYJN!9xp8WdthaaVWuKQfO_tca>`BUsc=?D=E-CeiW3jkx`
z6A&Ac6bmV-Wd!5k{C+QHM*+4ERkh-&|L>lCTg@@;_?tI5MH6;&KQl=v?eGooKI-J5
zE7sJc(cz!0x8#%P9hU6u-9HodCDxg&-0||1R)=O;vY~gQbW)*_uK(@6bfqnypOqgG
z>=ra{>}M5P>T8(EsXixavDH!A)_d-5Thvkle?@4tUdwb*c<glL){_8($fgw>dy+jl
zCP=U6bt|(oubiT@^`lJ8F~fj&T;e5(O;Z=lVN8zom~OEup_5~6yM$`W2|IWF6>DFY
z{myfZnb-TBzu$|M)#2fT1@oi~@9g8(H$Qy$a>4!|Rj<SUF1_H?l{atS`bit^WgS_k
z@o)Vx&&`+D$j4ppKYjI4)n)tqbNWKrUn{r1{QbWFsr(vwM9_QAOA2aPb$@>%FxX!L
zu{lXWUzV7Y2`n#CQ{WK*$^ck0LGFazezJ}{ZLc53HrZ^KJzwqOwR4m0yu)7U<^6}4
znz#bpyMkQ1GBi3eoE&FqY+wFx!UwUL&KiYJ?|*vwU+1;s@}=@!H!l<=@Xfy9c||7o
z*W=2I&cV+nzqDLBb=Fir*ZWdJnRC-@xz}#>U3*OJJA;;>^-%?mr5Pm-jZ8%=#U^Gg
zcbs$fz!SlPg&*!LJ#&@ITQ#(E8%N+N0YhO1kyC3Y?Bx%c``*F&dQtvFWwTb-My;z_
z(^zv%jQSt%*=*hMEuDRR-2RuhceXG-`nTg_+V7vQ=YQY7eY*MW*YW)J`tSDNmr6gN
zqki`91&-sgXPrL36!hK8b9!ZERMNKvDi5Yu6r{GV5Xf0EE5tUM&0EiW|Dy>?zui3M
z9+|Op6F-lDi-pswL#*5;NoQTA7*BdzrfZg%XV3gi&c?aj?5Mz%P8L0ttAQ00d6XH-
z{@LGrdiHTYKmUBYd-cCRRn&d|{_3u9WOsT~Jt9UyMH+I98j%#EC8@<F@W{kbqK!r-
zT_clERdJaSFl&-tKjoKXq!tm1&9nUny$(AFw4HywTVwWJU$x(IO~*G_-`v)yrfC-S
z;YxE?F25+NsHlFFi0f-t9nn{(Q!angy(6~%-rSvM?EjxJkT(;aEE*^@d775glD91<
zFHG1Qwq?HfGT%pgCjXtbD$nC?%Ep_jTo-*OrS+}Kyc1?yTExuad^pu%u5`n*h6bxC
zmpImGerI{w<tb*g?r_7$1K%Z^B`Rf-9&oxTewf2Lr>SMurCG-}sjX;NOY>Q9KKsO!
z_09$&HIf-?9lg>yp6=cz&LnW=vPt~j*$X=Pq-Ni?p2*&`J0Uu=TiAN{=J(FOts>)i
z-aJnDd11Yu-`gKccgg6^TlegG`{(zYU%i-Dq1WB#>?KpS%Sq*R_7iz!ubpl&d*+7k
z%`3gFy*$`jW_tYMqD!$gHPbdH->%>GSGwKpf5UG^)V#Ky#YCAG7(MTS*pj5YmYIiH
ze&Wb-XV32jH6qv^RMm(-onu^`_g*nV@43-Uoj0m8t~Sg~+U)8v*^QTDrHe<hs-3@?
zVg5tU4O_R*lKfFWLq9~-G&as{#*RFm%OwHA9C>O#b8lZ1NwUta3lA3%x#F^mP29=L
z=_*@zV3ug9)%g=Ic?=UKI7ZIboRqQnlTFpZz2A;4JSsR@PW15|DIvGSl&;DA$GeKp
zXy`6YIM#o}Bf9^b%Q4li1&c3BMRrv^(p&V|%|LeR6`RvNQ*SNfT)6i2w;y?u8vCW^
z$HzU5S-k1mhwiGT4S%EfKL5V^EcyGF4S(;*T+4U1XJ=#D70EP(Gg98M;MJjewoc)U
zh&}Ja{%(kPKYji0=f@A9epP)>UEm+%FL3xnTCt$A4!IR;K~nfbLq9VgwG9V~0*odv
zBa<96u9mX|)CmF%ZyiBQly)>Lq#aFyfhr6L1OGa@q8f;&O-<M)%oa7mrnioZiL?pU
zz6RO{YF}gZ8KPZ>+Pa3=2h1r9OB&gM6jD&(^cloHq_#HDVw`b=uo$&LiEME%u%S=9
z#qj1OVQVoPQ^?kGu#jRcTG0Ts980q*z#Eh<5q?Ik_mM5X#Y&3hh*kh$PlNIxY88cS
z?{aoh>_w`uh_e{A@I|)Rjgu6MF^Xb**5fLBAl3uZHZXA?11cgujiHxBxNK)-1DVbR
Ngl)jYWWf#M0RXIGm1O_`

diff --git a/datasets/openslr/dummy/SLR32/0.0.0/dummy_data.zip b/datasets/openslr/dummy/SLR32/0.0.0/dummy_data.zip
index 4c518d6a7ffa048f1fdbf5865e9fd6cca1420616..505fa794f07b31c161afff09248005e00de7c883 100644
GIT binary patch
literal 12652
zcmc&(2{={l7Cy#~i#o<k87i4i=6O0qhKxl@rjQiMREAQ=oR9`X5e+0`^4G7V%tOXZ
zNvNbmC4{7byFWVF`*gVUcklgrcKhslp7pM8t#_^UeQO!(Q&2La!0Pw&+&cT?$6vH4
zUX+8ktE;b#gPoV%Mzd}7s1+uE#oLwLinjxAZy#C|8RY~S3N<UhcvcV^g+>VKhJ>*|
zf^2LIaGFM2EjD6p=4B#8RzPBjMB3Z3*gM(owjD9pVm`9UJlyy=c?=tc0EUxnRG61~
zbXXii#igkccxg*aw2=CH{Iy^PzQ&ZKl&p9Q<CwFV@remZ6wD5LEG)`K9g@%1P4tvH
zJ$U_W8zX<CwHFFytiJ;7D98HP3epg0Ij=vb|GXh6*-$8Eq#>}Iw%Tm<I^f|rt9}7w
z33KQHZzA=F;&Fy$>dc|fv#g|RujG0uC>`F=eJRF|{E%B)3;N`48r$6#6Y|f>69=@C
z+yhuNpQA*=i#DDQI5{n*cJllnhG8<L4r`*fiN?1&y-hFhrZ#0tSE<dg&!zo4pS-Kz
zX>v}px}hif;>}ZDo9ooZ4sWXSP|c1g%#~-()b1(GOWyMR>t~sB7u$TTw|V0Nw0w*2
z`@9Podu=#1=yUs|Wt+<(H>O1YNsS#h3ldiQ(M~;nM(g+V%A}c~$S2k@jRSnopMDLg
zvK*gAJ61m1Z9O3N%F>UA?|p32P(kPw!4=__JakE=QF46&l+_|lsd;#*JEMDEt(6aD
zk`JK=|G_F&)f+Tg7s>Q5dkH5BC#CO>S_&M7MWzKa&uiWs(=wTs;Cd5OWTdttP^Q~@
z|NA>L0mq-JQO0;%j`Dxs10)m#pap9me5M3`V&VIgt__F=d4sf{d>{|<z<D;$GKg}&
zx(u*wS&X~Mi3$45qF#^M+RIyVoI*Y{m=brMMcis&j!CrPCGs+m4nU7N4WS<>W!-b8
z;7wR;NG5rD+4$LEz3e=&PJVOowJ9LLb}~QS4m?C`TC!B1z5flVUT(i7l@}IEzzReb
zu-ii{^*$6wkDv#<Nm>A8m=~~AwH4w8JJPN$&Tfu2&TbBlK3Ff$L$hJ31&Pyze6I6r
zmN95laVr<63+?Sxp(?;?y(kuZoXn?Ra3u?;X~Sif?_exRZj;e5P?^aq_$Ufh8o$cM
zk?Yo5ZSVJ8mxs-^Zy#pz*OznfaJLd;6g!@diMl9+9p|VekCxoK`>;=Mv3AFyLURMT
zfwyI%xAZDxhWCWX=i?)(Dg{*=Z;31EDY3g~NU)*FJk%ZuIF$A?B==~``Lzj{IaK3P
zGHK}o?UI=W?3oIJT>Qf6;s$~}dtXK|cK089K$XVfmbbyNe+ycguk)KcCcEoZP7r;X
zj6SaWbc#*8Nr!#TZBtFl4oz?DNNneujVG0w?(XF&Rd4G2W|>~T5d>8}yOZlU<Pu=O
z`JjTEHjW&HS`V4_XYxh_)go42PGm`li>INSc~hJ@C`?Ku)~`pGkq_F^k?j_c=Fi5s
zQwLB}kG}f0>wZOGVA8hxF>Miy9YwFcovSbM%9E^e;N>eR;SDi9d)8u?>CU7JXU<rh
z$>hS&Nkrdd)A2z?X<n~0*wTov8KP!1prgyw@yOsmq5l1ZdFKOcGJnS1UiJ*F3^%Q*
zyS(-ZF8EFBDS{LZO{kvG%}^LMbKJnc!#QLsmVM6#A>E8l$A`Li&723%`-Z4Dr13i!
zvOW&2+MpfUAIG|Z+wz(~AeYDfkN*<GG}L3R#^#MpQ=h7F^KSp}*TYXAE2u;ib%bjk
ziee?iEo#^{Vv<ZuO^sdBZrHeF$0sCO#HPesSZA*o$E()#RA%s3`mv`Cs!yh+^KVEe
z=N})VH64raj2I6(8W<Th&eZxci*dqJa?G?(YRuGBWLz(zPX5j%d>#6B3{3+WP0G&L
zC*AejW>1dB7AYI3H5_|de*V-B#*-bn50eY9S%t~`x+h%wc4s*X%p`J(T}bc!=4Ohk
zOJkRHY<=vlr|jFxKK<n8$;;_|M}qn9Rqcp6uC3zVF(c7*Dgdi6#sjfr1v)6eD6Ilw
z4B*DXDTx~ykcdP#`|%Ib9g%EAx|{v@bFvXhZ}wvuvWZh4k=}^ZH~TRs8zeqWC`&Dx
z0O7JhQRG9~8zCE6{D4@(LzG{K5{$rkCG6OZxYQGgouwYRCG^^+h9!;k#vI-zlIyFY
zvbe+ieVB}{lz+WkxjT}R@8bi9k1`jFFIE=$Qt_Tm_oI_XQMH}njyBF4uNlC-E#kD_
zp_Y3=xJo;Em3IK$A;%L+^`l4bOgFyk4^XzOoY4MS^C|Dpn!5abyQzB8JGy%78ah}f
zR*A{3>S6Cr84j{z3w2iz*IC_>ZWU_B=A@&@qr7IoTr9h3ps2OFIeDd57VF=214-Du
zoBb6Z<y;7h6Q9B~78zLwZ+9B?`rIzSSyg#3%|mF~sJLl{zWj5wj&)-#jVfaxx6tD+
zlle~$qpm7^i9f#On0wU7j^RP|f_?YUY*iUP6oBUkdb*#}Ks-ktIpB5&rCt=WJE`o5
zbO6!$1DOB+bN(Qr35l@u`*G0!t%C@WcoLEMC&iNp&LxWnhX}YYftLjqiJ`gpTt*}*
ztt=DAY;rP)(2>Rp^&CXG*=ijm$Cd~<rZR_z)@~L!y&1zGc%89S*?D5^&{IQ>E>4uW
z)jRqZWanzCIP9}&!#JZSUUw%w{rjd<BmKVYls^8A!5EG3snV;3y9~c_XoR!346njR
zd)3PJud$-5@=<(r+OV@qar0^~T__fy-`%$&8^YWjMVyURC<aVC)c8h!QuKMsj8f6?
zObDhpilyuO-(^?%*j9VFvndN4c|dvRo2I)A8|Q#dx{DQMa6*I<2!df5G(`nO^TmTe
zaw$RZ!-D^}P=I?I+R>sJb52(RnLxOzEtZghIO3B&KaxzOBhAzEz%uH|e}EtzZ9aSd
zAOz`%^YV0C;x0hcIl?XkfL6jtLm?99Pd^|7PxXg0!26dHhI4}gPirQ4TSQyb;h~o8
zFQ}<QnDX9b(ldB5I8eS-t?uQ%uWMx5J#|j!a_kdSpzoQ9?8u=fJDwz?Vn#QsP;wn7
z-=`b9j#*Si@(Z80p?y76zV>XdeDT@1WAlZtr~?W1RtE~lW_;U3LtA(3%Dt82vg)*>
zXkF$BmhrA5_mz)E7dC%M8F+hNDO2g|1^ROF$gkSlD*GJM?bxbV&U{*7w4%_AwvYYd
z;V<$}Tk&q~Wrx@=wo(o41^@2Z%D922Om6t>>R#`i%4-aF=G%4CZ)veooDn$oW+v3G
zTl`FXt4HMX;7gko_kYhV`k-_?eXfA{s`Ozdy)IeawxXo#Gd(g}zy@E$k*^>lj$OF<
zK>?Fldh9~+Kzx%19{Ar>vQQXpK)eyf7irqq?eUmn<`GEGqtFB1M2wlg`M~1~2N_NZ
z$;^dAn8Za0l+Vwk@9|=%3=Av8>9jjuGe5gY&hIRsrXPpqa}N^EQC=?{#iyH5JLt|X
zo5Ei=+?gAz-S-8rX%m@m9{z!SXSgr^$p?|_E-v0`c3SDc8x8S+gQK~U0y!~P^G){;
zQl}MeH#*xC*2Hc863wlwudz$Wmp7c|u7qpOhlCb|ZSG~}>)vzM>K@FwW4sn!QC`=W
zJ7aZU*8WEDqec#a!qvs53M`cLs~xIKY0s`Y)}JS48Bl!sG24dQkMvd=)_$Pr_dr?o
z2CY=R)<)f-AyO2Q@KWGd{f-0IZ|X${4fwyqHd+;ON&Vf|%TdfAxbB<nWgAdIN_C~%
z-$MQZ1|@CkzF$qfa4K7#srgOn+`7f-l<(P;dfudA?#jPaO-}#F`YQ#k;s#OejZUuZ
z`;Q&Xz;42<)oP5a39FM+vGD#Y=yv8%_U#6LMYOytS?j}+W~TKQCih{)9|ZWfU2A+|
z>9o_cJ;+3JY&z`Pi4X5AK1kVC+(^1c=PR0(MeQSLr2Kpb@8GL=^Pr8-9CZ}>Dfni@
z5=8_wE~F_LkfkUeD3p_v8oze$>GZxt9&V#8Qe@GeFY#_x-0SYhkbV3agASI)cr>ua
ztgh54XWi79M?6aWp;zNnh0e9;bss#s_35X~sSy9W$G;9Y$H(qw#2vv~`FVWw^$eVr
z+H5#Qek(_L?KT<)JNJb3WC7{T%@szW4`ONdw-;P%&1r5JmOgg###^b81g0D4dUpmr
z`4iR;Hco!i<w>MuI+i~|bva)6G}*0Mx9E!$H<(9iB&!Sa)^GJDqcXwrux&o4o@_Hi
z20U65KOSxZ`3(4tv_}Pg_2|NtV|gAGJB4#A=f!Asd=TnKnFUKw7iAW{y=HyeG=6VO
zq=dKa+L8#5uV-ZM6(`6I&+I(Ydh~|X#%yyvj;n=DGR)Wc7_%wU4+d7SxcT5iq&C$)
zFDT#Yx=W3|h28PizH=7}w~r)^Y{A@(V_D})M|X-XH{n|yj~ty|p{}5PEO+XMJ@tuA
z;dL#IOk`!J2P20Jf+TheCLIw$`A$}cS?XWDXl&qCA#T{y!696#G!^S9++>_Lk?e^-
zKXz1ay87IjkiV|R);@J_>*1tuOUUrO;MOTp#?qTVW;KaZJ14aJ%3N1w>I%!mGbmty
z5$DYdPk*3lVo`|R{KDxESvf)eBRoWTa412=txO%G?NrYi*W#jkcl!uEtAm~}R=q?}
zNL@REap&P(_~fex8CT+X;=Vhuck-GroF3M_y1jlnGO=gP$?%oMC%05Y=m<CG#~PY$
zI!P(}=22eN$90MIb?t@z7Y*zAWj|uoKDJ8~H|EqTr-;?<5ZohY&1Lkp{Dq=Tcf$=e
zp4=#m2HICkTKK5H8+&k!nvOgV*ZW@Kou`d#TNCW<R`}nEFpt!?rAW^USp)q$M_AmI
zDdYFq1IBG-D?hC|D#<q!YJHA=az@AP(@b#8YHb76hhe62lwxI8?;0gs4GTxnIxm~b
z0_>B5RCZ~N@u}DR^_dFrnd*AvrwPd5$TKe7;Gj+_KnDMh7rme76;ODHYbCJz|E5;@
z6V=Kk9kGDQhwTn|&K1o4kg^`oa|n8fh%!$~c)`5>Ys@c9$^~NH2Lb~R5uibg64al?
zY})^txhsu^^81#v_i5#D5tPnRX7X|Dda3HB7_YQl>sHgQDL;OXYR}|Q`G`VND&vsu
z`$FwQ7x!kpy#1#65!H$86sI5^Ch;?4Pw8F>_#IL>o<{k$GtTIU(=CH6VUq$=md*X@
zo+I+<22JR=$aSvj&%&BlNlREpa5{_Pi(1i?!M9V5f@$i|hx;C9nhV@s-RUCG{y<)%
z+_|4GP&Lk0F6PAcbx{wyN`&6M3|aeLa(bH5A@g3zKtcjjsmZCmEfc)9)?b<}*y49+
z)-Z~+3}rpZc)LxAKiDNxk2OqGRqE7Wm+fml$1aga&90AExgQz0%4+JZ@xfthQNxU-
zO0GXgs)86(4%eV@i@)xa=7*9aoe7nf`tIkpue;U|dkhd+RcU|4ddLmH6{MY%2N7BI
zH;BB5zn0VKC39Z`%}_Ym!pWog_;AJ>bgIdDS=G0eg=6CN8k^enMFneg+v{{iu|i^o
z2D-YUx@IfU)I!-$BvlwTHQCa>pl*itwE`V4r8BS#+6mar{62|EkAwCJYgaE!##z!s
z?6;AXQ4qHgS5T0Zm6sG3m&Zc?F*`Z{u|Rl;s7W!a5TpkR>W0p)d4&!WMd9)ll2?$G
zvXPRMQV^GwkdjyUsgOOS5aC&dLcaf4AqfQu1ser9S!rom8F>Y{p9(oa3K5=VDCG5z
z6`Hk>tc<jzJfxD8_)mp)K?)I`B`8Ej&P=iJ4E%>@QA0TJ3&ttL_?qNId`>X9_JZL$
zeA*0V;)FGX*M)xmtS-x$kQ?jILx8b0d{7F;)`XRc2fa*~?#Oe&B13Fq1o)s63|t8-
z6@L?ofuEazLlO0S0N4VBkN^gR!mA7T29ra=8p3U*Kx_wbIh*y%q7BS>hcAwjfkZ*v
zOa2_n3=#!#R$dMYaf=ISF*8bp#SjT`76tUV7M#4!(+ms<;iE$^dnK$?d=?UgMCDjy
z5=(GD2p9kZpEZFwBw?lEsi_y@+9HS~Y83$yNG7HNnAZ^25KbA5Fdu?&vmZ<7fh|ls
zBvt|v3qiZtk3Wbtj>JNcZ}wwZSi}@W?#q}4VM7E3XFuk!01+3QfXx#TV8N#}d@z5d
z;<FdQLMA@|M|g;e1%V!bkN^&z{J`jkuu}2sX-N|ju}9)70YLqYObFlT5Uv$MhGkn4
z#3VGbe={W_;Uwu0a3r|-N8wH*;YiSP8E^zQ!p9n5>O)wm_z@%=!h3TJ{Uh7~6BEKp
z#S5)m0(X9(6V=isGKe@VkuH4(;lmh2V$%d01Goi8Aq!_W(5xk_A)Lx81RQC?E^Rcj
zRLDz6NYbQT+NflykQ5liE=d!2X`_?*ln1)^@V-8{*%MYOeh(6ov>%cDU1XF3&wT(F
zp8LSmfUr{WuNjuiUx-M=NpO)F0zm};1#ifJZzQKKG9s)^dh1IY2h3X=90=h3YcLQX
ztW-RT2?4n<jY*tH78wr^_6%|rPypU;MJli#y0XB+c41QgGX>xcNu&ZQtOx}brugqG
z0K5)w(1E@uVWr}eRwEQx*y~8r6FQ*ay-2|a*l^N=79e4z;_2C!L;7Vl1PLSnaKw!b
zc+3ys9ztrsNU*R8me|`vvLD`3Li%2UV~My6_<n(2A7SGJ_QQKUpmRl7srbj7%eVg`
zx-4=x0J;#IKxp&e5V}0zTB0%jNv{cR8bWW0v^qv>maEQhbeiDm5IRew)p@aOb$+AI
m1XqX9S0b&BJNIu{XL(&FV|^;<hKfQ7LVvbFwTYYueEl0JtAcI-

delta 1087
zcmaEpG*@Br0)8oeZUzR1l+xVX%J`JTl0^N<6M3YVBp4<i)Rh7;nI#w)CKoVE!h{#9
zfrUXT<P{-IIgnxnpaK~ntqfME3KCM63GilS5dj*{!C>P#CCJV?`-lo71H%d+HU=x0
zoS?_Id6ASF<K)Ra63lWyn?7*MgA8H>v6MhYF{^?_l@yDM^Yu~9JLqY0^%c-`N3eMi
z9rnUPlM994fXrh8x@(~t#9a&3;O+u36LeV6-1c*k=+}Edv(zDOgD4W3yiV)^%qWm=
zHz>1#y$NMAS3!dd=v}CgGEk@@1Kp)3rhn-O1e*L9Vmi#IJKCa~?}!95PEwYV1;qjc
z2r#^L1hJ6g<e!p^05Bq;0Hi__5<@@<K~#x&XnbL6nF<OIsmYiUry<V&rzFRRYy~4D
zI9@^O6=C8GAiHfr63UbHG*Q*pfP~d`0=yZSM3`|!JKP|KC5`*R8bFBv=qNdUm=wqg
z6J|(S0QnEq@&d4o63BKG9qS?DhLCiE>IAfu0<#F{gilb@K<NhE|GKQ;^n>aDOppYo
z|5reSfr$$v3{aC7+<u?~Uc)qD+AqZpOI6@FLk)&7xFk3gqQ}KlxU?}mMWTiYYMMlL
c(ha!Y$-opU7U0dw29g#9!d@V&iW9^G0GodGv;Y7A

diff --git a/datasets/vivos/dummy/1.1.0/dummy_data.zip b/datasets/vivos/dummy/1.1.0/dummy_data.zip
index 271d0bda93cbd52190edcab860f1abd1b45b2caa..1c7173a58a3a8b23fe87ddaff4de1e4157d29cf6 100644
GIT binary patch
literal 14710
zcmeHNc{r3^8y{Q9Hp*7^R%NH~CTU_UX(2*I_N0(yl2*o^ypk4*D5>5`rBc}<N{Ui=
zNehxFNtPC=@SXGMG0%))X5Kg7_s@5(>v*2IT-S5|&VB#Rea`va$J~sAQv?Uw&F%h{
z$jh%kd^kCr>sBwX?GCO^Bq!C?RsuL~$twv?_pT;5!ROXMJ{%k87d9LY`G7g{As!qL
z)rVe!kBNd0IXEmQ7_79kRaJ4Ie3R;v_rYgrKI!5p>f-L`?AWz@nRS<tb(DE9dz=Kv
zY`i2}_guLN-JNQ9E-8a0At#o_#m-s$fqW)RSiT}PIrTz<t$EzR^9f0b$s8iC8*FWh
zyIoTbmVc=)axZ9Uu$nZpVm%3mGdJVranly7wFd|xN(%n^wS2>lSOtbRmD&+Lep|eJ
zNd79MKoZH#pM(se=g4FyY2Xt&ezA`$TFRj=LhKD0^wD>cs@(nHkZI<lLmpB-Etg}G
zrV*E(R53JM+LorfO=Z%%S?`1L^}9Em*{4xH<88{z-l^4ApPXO1_POjnTwh!>X^~TN
zRj~)}&Yu}G^Plzfr2R4blhpfON3)k6(+4)5ByshqD!kU5*xl$ImtAq+#5V2CspTKa
z99_5eF5j0EHh*hB`Go$0WzJ?nyIO8VC!JJvpFiE{K(zJin-jhyrq={Hn9f@unp}BZ
zGheLGCh^``?$*~kZS>AXK2Nf&Xxk}xNv*A(=xG^Ozq-@UZZ$DBTs-j%Ij??qVi0s?
zNzKAn&A^S&Vf0-Y2>v@0O9cuM04v7wDofP3B8d<ECdz+*aWEUV4?iz&h=!sXiNJ=$
z+fGero(6ycp-@0>-$5ZhrvwfsLLF}vLrVus(iT5AB+`+9re>K5PVh-X*N#NO(z|LR
z5ra00oFrkpoQJ$&IRvTqysfxRXVoT3Yv{aRqJ1lTUD2sP;m%iO$N%gzHa;KzOeeNL
zsMzMd+4twfcyXLY{`m>PT#j){-A#P_>mp+sVnZ8S6Ps}fIwi#RM{=zN`!*-vRNhq<
z6q2-Q(WP?yrZ&FwvqLL&mbrTryl5_{^H7MsD(EKZ^6~5XkEyaULO+@>9DeQYUY@hC
z|7WL_2+=|dXR4vwzTCj#kb{YxtW-zatTlD1m3>tbX1dR-mtR#=IpzQS$&rApvq`Qh
zYl~kSd}}3kPXoUl+761gG-e)lZ5{~vyu1IhLYb^lkFynj_j)`jYUjnxFCIAKvXf5j
z<hftv*0gBRcWL-#A@=C2D(l%GKjzieH%tNCfDWebdKeVhQTZ`43Xm25fEkM);w+ow
zZU=MdM1*k?j*6<;zk`UldHAAk>&&Mj2l#=*(PTvlYH|d41Z)ARz{yX=eaBFWEV`WF
zR17+N(h!6)?pK%;;|shZN&QvqWbkwH@J5*T8RVELAje3I%)M^r<x4pT_<X(rgP>xg
zW^T_}nVPQEc^hP3J#cuyu|!bD&z7^qfJff=(StjmeM1H-gPb2ti|V$juC7^KQ=fa|
zuk(KK{4(yQ8YlJ&@VT5@RoL8q;p@uYh?v#So|yFS)zr|_+_I$f?%c<RE+q5UXe~V8
zVRb|BkT~AhBIeB8%dr(NE(;yLRb=2+5uB-@`lt7~6j6x`xvCdWcK@w;C`V*#q?mi{
zxeT+p!r2pKYkC}B%b)f<RT?dEMWX3%S(!a=ro71K@c*(?`~pw&`aM#uCYSw|bvN3T
zH8d@IM|S7qXFGe+-~7+KTP4+h>GyPAN<N+-wg0VtpCpW#QMhv17Z4=qL;Bu@teOS9
zJG=mmnsr7v`X7+%-=rORY7S5_Yb5(e%|Ys$?tmQB=r4^YRn}S9bP4?ZDn-sE!C5js
zIFw!f`0hACa`MTY^2TLVSFAtl`Z`BGInDd%$KqW^RR>*#;#2lUMV3#KuQE(mNbENd
znt0lPQ)pM0cMS2${ql!7@joJS^(N%?tf<b-ZvJxfe7UbiS-q!>bI;S}+M{OG8KgD#
z7Z&Lk*u41C+LvYQd$M#_RZ3HPr+%0mPl%89t2pcZPwgY*c(kJ$W%)gu<|bCk6!|I|
z-I;L1NA%%4@gnsMM*+#Y;?-(e%0@j!i8-e0ZNdKvceoL<C1pag^amX(-VTPy{dCb;
zVcZt?m6yOBRC_Epe?D7Sb#iV(u)<HJU1<h+WMw<9a^hnAr4k`X5ic&?W<fxYu@jHd
zC={WNg2Tu-{BN?5%otojAhVE*XqUg)48J|n3!}1;u;Pvyp6DKOfM2O=Qb55H?^o7s
zygS%TLUD6SR<FCE@Yb>X4_jZpkGCh*O)hsRImvaGeQo9TRXyoP_i7$Ets^MxJD6Lx
zmtWW;y5j4`nNv$AP04SRk-GQShEutp3tDS9c$171b&389PEC1K-M5aj{nIX;o@Ubw
z{etEGE)F7Y9pzeQQe}63-qHJ{z4e0Xr-S=Qp5iO=yu6p5C7IfI=$dA~FyGVo=!~+o
zpm)mp*UGx0HQJ)}H&*P~J&BWmACOS0{_c3oizg(L9M&5eF8$I_vu8?1{APYnCvNS^
zm|TAA%m@MGJ<vm4OEyMVGeE)9kuOhH@2Z@qfUd=?jKR>yy$EG(Men4o+$T^p$?4+t
z&jYjHG)Ll`Cti1&vuNpSu>haR%ChVsnJNv)Le!|3uCBq5u^P-Uc8Ciq|1;hOV769p
zF`dqkTX0J~MBZSqzN4M)LJB|VLStYCV*@bH4_R0XEZBh#pL8w2(+urtK!vddI0VcR
zK72x01MJw2wDtcJ0*oYRnV|o{JgpI`P@Np8Jk8Sb?JEsV4U!U3?6LCl1oK%YrR;KI
zi!F<z@Ci#R9@m&DDcm=$DK}M8nWMa7xv8m=>1siq33FoV=IIF=?yK14%#(&}azlRf
zZLs?_5XyFBKZc`l6@qIGW*0)W#0*W?p^;IYJ1zm<go6&BG+gaSRQE21Yra{i>$!cU
zg3`OVykDh#=CnH@aIOsRI)l^RKcOLAZ^hGCe#f5%<Y4p2iB<zEuNK_@dhK}0*K1K5
zx2)Yt%!raOt^MYU3$2<Ks;eG2|6ckr{`VST-@8o2P25*EyjioM=W5=-#rNN@zmN^9
z44my#y<j5IqV}X+?Xe&G`=0jxczb8h{`ZFayEEr=$cN#3W^ato(d*RUkry|4DJ%7%
zY1W#`l%hWCm~d(2T{YCd?zUJ!W@M79=i2Bu2I`*{Y`5Ukts8uwz?(DGJKl5QpzQv)
zgJE$&F|EYHcMoRCZxnvK_NLOxc3hE%N{89vBQkuQZ_d^i$#}<a7nXZtr-Y==XB#&)
zbF<Md_SgWx{h&6>9HU<GU<}58-KR}Hh5$K}$;aKz+ttkvktl|zL?Jkj<c~a)@O|R{
zBoyE)ps8kczolhjXm^Zx;5~(flSNX_Em1*yy5L~e0~uf&+8-h{c)?+P3V+XGj?~+n
z0^IzEvM=-^+G!X~nLD%xB~`{|ko-=Ye&OLs9T<e2#@PQ4Phi5(_c%<*3s{&ei0emY
zWYqcNpONRigc&*LuT*wF0S<({TNia*gAyMc3T2S-G$~n`Nm^0ofh<5Kn3Lg?hE&*`
z{CA0qkpo#W+oCSlm#mdO*C1{>@%FJipF=VyO5btm;B*GaPs~rMa2@V7trdT^=f=UJ
z$6()^t6*e!EJ}pyT98iYk~ZNY4%wHn7UEyB9v)Zh`)Pi&^n|kFj?*)59lum)X<x8^
zmZkRPCCetI-l(uY(sH_d+x7SU-+o?S6SQ!5r-j`+i$}SdiG_ps5ILUW`rsKCWiIFO
z6Y$>nLVbz4;^k_m^g^0f#DJY_)2zfw)eslkqN!S$;Yp63Yh%i}U0VBF!smNB{<vTx
z7M9l}Z}Q`({y^V=Do?R~M6+l7X;L=+b5~C7hqCfT!Euky1@S}`KQ0c7?Er_m$x7U8
zglw^O@EBH6x@x`vAR6^C3>ycSnRQqv+sKgocTzJpGr`ozY-Xa(&2Vyzm74_xyR_7-
zOqwydIoxgr$j(y{es4OOu}=<|r_RntF#HDp#LZ=r-<b4#3y``HboitpH)S4SR{<DV
zcqn0FYcphqI&$bz0G~AWz+?i)NFc0=hlmutK2WJB*~01FBDjqIK%&J|-UzEpVRc}K
z6RhYXBuLi5sSofSh&OTD#MOB)Lp46NBcbH0CT`8br<%8M3P4wN2@COBY?OMh$ie?m
zm+fmon{1iX_IFV^savNsk~(gr2K8rcI**@vbmB&t4J~ssv~N8yYOXE%ah`YeQwO_(
zTlqnr4sWZkd}+9nR9f^sIt}Qp-0@b)vRVQ}i`ckqecR`iURN{qPkm~TFjxoQp`x05
zLcq(z_~wI`Sz#rSJ}vg$(;{wU{isn@EGul9@!;;Kpg-&f6{CA154=p+L+%W5T)c3z
z*VB*NIBr)<-9ffGs$JDv;0JaHn;$ah=7ke#rI6@H;Sm!G8xT05^jMfs@YwPzGBgK9
z7?bgzRgkE$G{l|93Zbk`tyfwa60OxVhISYi0c^C>P>Mp~V;GB&!kUaNBFh{-4&Ma^
z;er64GzPK7fqK_A4dYDhC<FT_zVPVHZ6c&fd5*O9hLTf(yf!Jt9Ce97Sx?07)$o5W
zm{hQ&!K`X|ybbPZ;HHVU+0{&reA6I1Ngk+-RPWmyb3W*j?_NuhYL^aa-wxyK5@O?}
zi1sZq30~4m6#dycc3UXl*`z1oso^bJcGcUUzBei9?aki(2~V%})op06`*LUF9-@T^
z@fo|RMrodXM4G}*xLNOLQ$cD1a-~(VUr7l{G3D=)wX#kFMPKD;(vi$>_=@~F>+gd9
z6@GUm_GwGacosO2`uOQ!blOj+8LAijAcGATZnkgysKxurBer{{oS8n~%C73pjVrey
z47ML)@XRX}--Jdm7#k1B;6*Gj7#kQG3d4~6cPM-mLnuh;`799%;jpE-nVR|#hl>Gj
zw9`<E!eL(;lg5lqQYJfQ)=*v@R;g>MsQ`{6=Z9?o#97eKfldQrXqN@Y$qxU3MuFuw
zF0eF)VQv2f=>lu8wu}cX4GJtj0OvI5&|@rk%YQszY0e+^)=mHidJ1Ml7GQae2P`$M
zVOZV(PBQ2ztif7C1IvUl<uTa-Gs`Hi4y)AEG>3651$bbmv;P^3^S0xIi^47wfC(Nr
zSj4qzd~m4``%4dCx(zx^RxHqK-T2_5(0&EL13QcUT01_t<_-I68NdWPlepN}ML0&k
zr!$m!akzd0e8cR(k@P+0t(~C{qxW{OH)7zu9klh6O~LDM5tWr&JmzK;6HrXY-V}jX
zYtWWO=4GS8j?r+KxoAUc2YcBA-aA2C7TF3cRl`HUIEzZT7-16b9%zV-H3`dT*h>!Z
z775zY$$h{i))JMrJI0+7S_{}q3h=T7+S17+&d~-%ea?c$M%f6V#*mNFx%Yq?>nDhD
zQ4NPm!bT`J<}C;cAK)PxTf>L<AkdaY_N2Cg<pa#+h@mlVMWDlh+=hm*gn(B9(3V9m
zn=qR1#<a-<Be2zHSl~xnI@yPNbQg|{*eC^7><}RVvGjmt2DGJ<J9y|1kt%d76bezx
zhy@z1!0I&G`U%OrqYreb+`$TC;3%;LQdq4<TRQn0LtDSawNU0jTRJ(Fj|tZfjr1Qn
zm|htbwck&$;%DSwq@IL$87-*7=P1bxewbcam16KZV}r<E3$Zw6k(H7OXjcYU=|Ec+
zSzLgrA!24LVjEt|9Hr=r@)xbLU`uzf28p&Ta)2Py;T@{mVec|h!C;Utpmu{TU{O!t
zH9}0efSTV>xEZBth;;z8j7=S|K!&z-@^d_^4j2i7!HR=M0b6ULX22m~riM?eu0jTk
zqOPJ2fCI(KX7D$xW}&TvpfZUm+e3;F5^?Bvw2v~)&{YY${6P&FZCT{QjLjm`jF$Ci
zxrI)x3mr3R99tcMg&wqJk-5QUlinVQYP}drY9s^&)#eFt0*ZqH?O_c7TOEM47__C6
zKL8w<jnnAnsdiLjRA#U)fTpp$hvf&ffgP0?%PycMK19I;@!{SCE8|nMKvkSM2hcKq
zjJg8q0ytE9l>*cS9fXHW8plu{o@Yj_8&L8+oN-tQn|eu|2ZUG#QKtx4)J{jI9ej>4
z^=MNb6w1+-PA-|keCRPiXf#I%fkFJCNqU+$3Z;V(K9vPmPy{?C6mUu~pyrBIW1$9)
zwsdkYV+2SBU{MiASqO6x1c050Q1nGxIyq8``DkE~(1c?Q2^hr!Z5{<!P}}ec^;@)c
z5KN>Qh64kEA}%qZ=2GH;>ba<-3iVvHrIYg*L!fXH!wAYN)9Nnj7Z@W{&7}`&6}Te8
z^o*q-QKh^>P!ySK3~d(+YRj)+(FfJ`J3uj~>|#NoN-lj+!Lq-@Qxt(~3|$us3e|Dx
zgAxY>``6)PPSwSNT0;{)sN$k6oqUNgDEe~^Ar(?YVOrD0qCz!X)K%1|jN}=*iWyWC
zs+)jiViZLeiwae6>7#1+J*b${bFrvU{gytefN8(QR}`|Z0KS5qj%tcUg{rsoQ3+1}
cEvT5%a?Q=SfVc!NeJbH{MRB+@z%~x|AGn+aLI3~&

delta 804
zcmexXbcfF^z?+#xgaHKZMsWv&8Bl^7NT-zM=2ph1B$g!V<5noaz%aRxM^YA{;sKgb
za$ptmNGi}RQwFP0k-=?-3d3Xt12wo29ANiaP2O@6XgGrl5bFVr*eIw!Nk~B&;w6ZZ
zqgQPnJO)Y!F+!9S@~BKctHnJzk?|=jSl#3XeNm9nlTYe%Po5*LBn8tA@`YQg?y0Xp
z-GxBi`d}Au7Zl~^7L*k0l~j~WKFaz%g^@{w0e1)j^#Xwa!+%E*jS`-?H3CBv3P2h)
zA>peV;Em9MC4`Z5#6!XxMF*A;N76AH68<{)>`;L@3hbRb;Lt#_0&9%GwQOsYV+6+!
zh7Pm{LekLz)B%kyPPoHB(ZvT>#IUV#I}<GSkX+BffEjzpnpBuUniP@ZGQgXa4Wxn-
N2p=#pFr=}7cmNG{gd_j}