Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make several audio datasets streamable #3290

Merged
merged 47 commits into master from stream-tar-audio
Nov 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
25dca5f
Add test fixture for TAR WAV file
albertvillanova Oct 21, 2021
52cc44d
Add test iter_archive
albertvillanova Oct 21, 2021
8ff699d
Test dataset with Audio feature for TAR archive
albertvillanova Oct 21, 2021
3d20ee5
Add Audio method to decode from bytes instead of path
albertvillanova Oct 21, 2021
105ead7
Add Audio support for bytes besides path
albertvillanova Oct 21, 2021
a869469
Fix docstring
albertvillanova Oct 21, 2021
f0911cd
Stream TAR-based Audio datasets
lhoestq Oct 29, 2021
79465af
Merge remote-tracking branch 'upstream/master' into audio-tar
albertvillanova Nov 8, 2021
f224b68
Remove archived attribute from test audio with TAR archive
albertvillanova Nov 8, 2021
ebb1a1c
Remove archived attribute from Audio feature
albertvillanova Nov 8, 2021
1cc27a0
Implement Audio.encode_example
albertvillanova Nov 8, 2021
4579b76
Call Audio.encode_example from encode_nested_example
albertvillanova Nov 8, 2021
0d2a3d8
Fix docs
albertvillanova Nov 8, 2021
3d35ada
Enhance Audio.decode_example to accept a string
albertvillanova Nov 8, 2021
ec5f7b0
Fix docs
albertvillanova Nov 9, 2021
21488c0
Implement private Audio._storage_dtype to specify cached dtype
albertvillanova Nov 10, 2021
83f04cd
Change Audio._storage_dtype dynamically when encoding a string
albertvillanova Nov 10, 2021
7a3f066
Update test of Audio instantiation
albertvillanova Nov 10, 2021
ece5b97
Set ArrowWriter.schema property dynamically calculated from features
albertvillanova Nov 10, 2021
38c80cc
Update ArrowWriter.write_examples_on_file
albertvillanova Nov 10, 2021
7787985
Update ArrowWriter._build_writer
albertvillanova Nov 10, 2021
090723e
Fix code quality
albertvillanova Nov 10, 2021
7f58777
Replace _schema with schema and condition on schema in ArrowWriter
albertvillanova Nov 10, 2021
583be77
Add test for MP3 TAR audio file
albertvillanova Nov 10, 2021
8dbe0d7
Refactor Audio decode_example
albertvillanova Nov 10, 2021
c973209
Pass raw bytes to torchaudio.load
albertvillanova Nov 10, 2021
7363e9a
Revert "Pass raw bytes to torchaudio.load"
albertvillanova Nov 10, 2021
9f61ab8
Pass format to load in _decode_example_with_torchaudio
albertvillanova Nov 15, 2021
efa4c25
Fix filename extension in test
albertvillanova Nov 15, 2021
659fb78
Fix Audio tests CI
albertvillanova Nov 16, 2021
2fc997a
Fix Audio tests CI
albertvillanova Nov 16, 2021
416d1bf
Fix audio test CI by checking out PR HEAD commit instead of merge commit
albertvillanova Nov 16, 2021
1e5dc25
Merge remote-tracking branch 'upstream/master' into audio-tar
albertvillanova Nov 16, 2021
5f16240
Change default Audio storage dtype to string
albertvillanova Nov 16, 2021
488b74a
Rename Audio decode functions
albertvillanova Nov 16, 2021
0ae5d44
Refactor Audio decode_example
albertvillanova Nov 16, 2021
4679d8e
Force CI re-run
albertvillanova Nov 16, 2021
e178cc7
Refactor and rename
albertvillanova Nov 16, 2021
4c4a687
Fix docstring
albertvillanova Nov 16, 2021
eb923d2
Merge branch 'master' into stream-tar-audio
lhoestq Nov 17, 2021
adbcc25
put back the Audio feature
lhoestq Nov 17, 2021
de4d5f9
Merge branch 'audio-tar' into stream-tar-audio
lhoestq Nov 17, 2021
25f1806
fix openslr
lhoestq Nov 17, 2021
1b441dd
Merge branch 'master' into stream-tar-audio
lhoestq Nov 17, 2021
7f67477
fix common_voice
Nov 19, 2021
45ed8cd
update infos
Nov 19, 2021
63d0d47
fix dummy data
lhoestq Nov 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 49 additions & 36 deletions datasets/common_voice/common_voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
""" Common Voice Dataset"""


import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition

Expand Down Expand Up @@ -613,6 +611,7 @@ def __init__(self, name, sub_version, **kwargs):

class CommonVoice(datasets.GeneratorBasedBuilder):

DEFAULT_WRITER_BATCH_SIZE = 1000
BUILDER_CONFIGS = [
CommonVoiceConfig(
name=lang_id,
Expand Down Expand Up @@ -658,78 +657,92 @@ def _info(self):

def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    The TAR archive for the configured language is downloaded WITHOUT
    extraction (``dl_manager.download`` instead of
    ``download_and_extract``) so the dataset also works in streaming
    mode. Every split receives:

    - ``files``: a fresh iterator over the archive members, as
      ``(path, file-object)`` pairs from ``dl_manager.iter_archive``;
    - ``filepath``: the archive-relative path of the split's metadata
      TSV file;
    - ``path_to_clips``: the archive-relative directory that contains
      the mp3 clips.

    Paths inside the archive always use "/" as separator, hence
    ``"/".join(...)`` rather than ``os.path.join``.
    """
    archive = dl_manager.download(_DATA_URL.format(self.config.name))
    path_to_data = "/".join(["cv-corpus-6.1-2020-12-11", self.config.name])
    path_to_clips = "/".join([path_to_data, "clips"])

    # (split name, metadata TSV file name) for each generated split.
    splits = [
        (datasets.Split.TRAIN, "train.tsv"),
        (datasets.Split.TEST, "test.tsv"),
        (datasets.Split.VALIDATION, "dev.tsv"),
        ("other", "other.tsv"),
        ("invalidated", "invalidated.tsv"),
    ]
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={
                # Each split gets its own archive iterator: an iterator
                # can only be consumed once.
                "files": dl_manager.iter_archive(archive),
                "filepath": "/".join([path_to_data, tsv_filename]),
                "path_to_clips": path_to_clips,
            },
        )
        for split_name, tsv_filename in splits
    ]

def _generate_examples(self, files, filepath, path_to_clips):
    """Yields examples by iterating over the (possibly streamed) TAR archive.

    Args:
        files: iterator of ``(path, file-object)`` pairs over the archive
            members, as produced by ``dl_manager.iter_archive``.
        filepath: archive-relative path of this split's metadata TSV file.
        path_to_clips: archive-relative directory containing the mp3 clips.

    Yields:
        ``(key, example)`` pairs where ``key`` is the archive-relative
        clip path and ``example`` maps each feature name to its value,
        with ``"audio"`` set to ``{"path": ..., "bytes": ...}``.

    The metadata TSV is expected to appear in the archive BEFORE the
    clips it describes; clips encountered earlier trigger an assertion.
    """
    data_fields = list(self._info().features.keys())

    # "audio" is not a column of the TSV files; it is built from the clip bytes.
    data_fields.remove("audio")
    path_idx = data_fields.index("path")

    all_field_values = {}
    metadata_found = False
    for path, f in files:
        if path == filepath:
            # Metadata TSV: index every row by the full clip path so the
            # matching audio member can be looked up in O(1) later.
            metadata_found = True
            lines = f.readlines()
            headline = lines[0].decode("utf-8")

            column_names = headline.strip().split("\t")
            assert (
                column_names == data_fields
            ), f"The file should have {data_fields} as column names, but has {column_names}"
            for line in lines[1:]:
                field_values = line.decode("utf-8").strip().split("\t")
                # set full path for mp3 audio file
                audio_path = "/".join([path_to_clips, field_values[path_idx]])
                all_field_values[audio_path] = field_values
        elif path.startswith(path_to_clips):
            assert metadata_found, "Found audio clips before the metadata TSV file."
            if not all_field_values:
                # All metadata rows have been consumed; no further clips
                # in the archive can belong to this split.
                break
            if path in all_field_values:
                field_values = all_field_values[path]

                # if data is incomplete, fill with empty values
                if len(field_values) < len(data_fields):
                    field_values += (len(data_fields) - len(field_values)) * ["''"]

                result = {key: value for key, value in zip(data_fields, field_values)}

                # set audio feature
                result["audio"] = {"path": path, "bytes": f.read()}

                yield path, result
2 changes: 1 addition & 1 deletion datasets/common_voice/dataset_infos.json

Large diffs are not rendered by default.

Binary file modified datasets/common_voice/dummy/ab/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ar/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file removed datasets/common_voice/dummy/br/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ca/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/cnh/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/cs/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/cv/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/cy/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/de/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/dv/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/el/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/en/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/eo/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/es/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/et/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/eu/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/fa/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/fi/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/fr/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed datasets/common_voice/dummy/hi/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/hsb/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/hu/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ia/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/id/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/it/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ja/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ka/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/kab/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ky/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/lg/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/lt/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/lv/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/mn/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file removed datasets/common_voice/dummy/nl/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/or/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file removed datasets/common_voice/dummy/pl/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/pt/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ro/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ru/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/rw/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/sah/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/sl/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file removed datasets/common_voice/dummy/ta/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/th/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/tr/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/tt/6.1.0/dummy_data.zip
Binary file not shown.
Binary file removed datasets/common_voice/dummy/uk/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file removed datasets/common_voice/dummy/vot/6.1.0/dummy_data.zip
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion datasets/librispeech_asr/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 11823891, 
"num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 43049490, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 894510, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 868614, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 56636505, "size_in_bytes": 30178014159}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 59561081, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 907644, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 934838, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, 
"http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 61403563, "size_in_bytes": 31297968940}}
{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": 
"", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 6619683041, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 23898214592, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 359572231, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 367705423, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 31245175287, "size_in_bytes": 61366552941}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 31810256902, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 337283304, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 352396474, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": 
{"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 32499936680, "size_in_bytes": 63736502057}}
Loading