Inject templates for ASR datasets #2565

Merged (17 commits) on Jul 5, 2021
16 changes: 12 additions & 4 deletions datasets/arabic_speech_corpus/README.md
@@ -1,4 +1,5 @@
---
pretty_name: Arabic Speech Corpus
annotations_creators:
- expert-generated
language_creators:
@@ -9,15 +10,15 @@ licenses:
 - cc-by-4.0
 multilinguality:
 - monolingual
+paperswithcode_id: arabic-speech-corpus
 size_categories:
 - 1K<n<10K
 source_datasets:
 - original
 task_categories:
-- other
+- automatic-speech-recognition
 task_ids:
-- other-other-automatic speech recognition
-paperswithcode_id: arabic-speech-corpus
+- speech-recognition
 ---

# Dataset Card for Arabic Speech Corpus
@@ -152,7 +153,14 @@ CC BY 4.0

### Citation Information

[Needs More Information]
```
@phdthesis{halabi2016modern,
title={Modern standard Arabic phonetics for speech synthesis},
author={Halabi, Nawar},
year={2016},
school={University of Southampton}
}
```

### Contributions

2 changes: 2 additions & 0 deletions datasets/arabic_speech_corpus/arabic_speech_corpus.py
@@ -20,6 +20,7 @@
import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition


_CITATION = """\
@@ -91,6 +92,7 @@ def _info(self):
supervised_keys=("file", "text"),
homepage=_URL,
citation=_CITATION,
task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
)

def _split_generators(self, dl_manager):
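
For context, the template injected above is what lets downstream code request a standardized view of the dataset without hard-coding its column names. Below is a minimal sketch of the consumer side, assuming the installed `datasets` version exposes `Dataset.prepare_for_task` (as it did around the time of this PR); the printed column names are illustrative, taken from this dataset's `dataset_infos.json`.

```python
from datasets import load_dataset

# Note: the source archive is roughly 1.2 GB, so the first call takes a while.
ds = load_dataset("arabic_speech_corpus", split="train")
print(ds.column_names)  # ['file', 'text', 'phonetic', 'orthographic'] per dataset_infos.json

# The injected AutomaticSpeechRecognition template records which columns hold
# the audio file path ("file") and the transcription ("text"), so a generic
# ASR pipeline can ask for them by task name rather than by dataset-specific
# column names.
asr_ds = ds.prepare_for_task("automatic-speech-recognition")
print(asr_ds.column_names)  # expected: the template's canonical column names
```
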
2 changes: 1 addition & 1 deletion datasets/arabic_speech_corpus/dataset_infos.json
@@ -1 +1 @@
{"clean": {"description": "This Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton.\nThe corpus was recorded in south Levantine Arabic\n(Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@phdthesis{halabi2016modern,\n title={Modern standard Arabic phonetics for speech synthesis},\n author={Halabi, Nawar},\n year={2016},\n school={University of Southampton}\n}\n", "homepage": "http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "phonetic": {"dtype": "string", "id": null, "_type": "Value"}, "orthographic": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "builder_name": "arabic_speech_corpus", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1002365, "num_examples": 1813, "dataset_name": "arabic_speech_corpus"}, "test": {"name": "test", "num_bytes": 65784, "num_examples": 100, "dataset_name": "arabic_speech_corpus"}}, "download_checksums": {"http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip": {"num_bytes": 1192302846, "checksum": "1df85219370fb1ebe8bfc46aa886265586411d04e7c1caa5a5b9847b3ad5f9de"}}, "download_size": 1192302846, "post_processing_size": null, "dataset_size": 1068149, "size_in_bytes": 1193370995}}
{"clean": {"description": "This Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton.\nThe corpus was recorded in south Levantine Arabic\n(Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@phdthesis{halabi2016modern,\n title={Modern standard Arabic phonetics for speech synthesis},\n author={Halabi, Nawar},\n year={2016},\n school={University of Southampton}\n}\n", "homepage": "http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "phonetic": {"dtype": "string", "id": null, "_type": "Value"}, "orthographic": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "arabic_speech_corpus", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1002365, "num_examples": 1813, "dataset_name": "arabic_speech_corpus"}, "test": {"name": "test", "num_bytes": 65784, "num_examples": 100, "dataset_name": "arabic_speech_corpus"}}, "download_checksums": {"http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip": {"num_bytes": 1192302846, "checksum": "1df85219370fb1ebe8bfc46aa886265586411d04e7c1caa5a5b9847b3ad5f9de"}}, "download_size": 1192302846, "post_processing_size": null, "dataset_size": 1068149, "size_in_bytes": 1193370995}}
11 changes: 8 additions & 3 deletions datasets/common_voice/README.md
@@ -1,4 +1,5 @@
---
pretty_name: Common Voice
annotations_creators:
- crowdsourced
language_creators:
@@ -190,11 +191,11 @@ size_categories:
 zh-TW:
 - 10K<n<100K
 source_datasets:
-- extended|other-common-voice
+- extended|common_voice
 task_categories:
-- other
+- automatic-speech-recognition
 task_ids:
-- other-other-automatic-speech-recognition
+- speech-recognition
 paperswithcode_id: common-voice
 ---

@@ -358,3 +359,7 @@ The dataset consists of people who have donated their voice online.
year = 2020
}
```

### Contributions

Thanks to [@BirgerMoell](https://github.com/BirgerMoell) for adding this dataset.
4 changes: 4 additions & 0 deletions datasets/common_voice/common_voice.py
@@ -18,6 +18,7 @@
import os

import datasets
from datasets.tasks import AutomaticSpeechRecognition


_DATA_URL = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/{}.tar.gz"
@@ -649,6 +650,9 @@ def _info(self):
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
task_templates=[
AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
],
)

def _split_generators(self, dl_manager):
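
Common Voice names the same two things differently ("path" and "sentence" rather than "file" and "text"), which is exactly what the template parameters above normalize. Under the same assumptions as the earlier sketch, the consumer code does not change; only the config name (a language code, "tr" here, chosen arbitrarily) is dataset-specific.

```python
from datasets import load_dataset

# Common Voice configs are language codes; "tr" (Turkish) is just an example.
cv = load_dataset("common_voice", "tr", split="train")

# The audio path lives in "path" and the transcription in "sentence", but the
# task template maps both onto the same standardized ASR schema, so the same
# generic training code works for Common Voice, LibriSpeech, and the Arabic
# Speech Corpus alike.
cv_asr = cv.prepare_for_task("automatic-speech-recognition")
print(cv_asr.column_names)
```
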
2 changes: 1 addition & 1 deletion datasets/common_voice/dataset_infos.json

Large diffs are not rendered by default.

18 changes: 14 additions & 4 deletions datasets/librispeech_asr/README.md
@@ -1,4 +1,5 @@
---
pretty_name: LibriSpeech
annotations_creators:
- expert-generated
language_creators:
@@ -10,15 +11,15 @@ licenses:
 - cc-by-4.0
 multilinguality:
 - monolingual
+paperswithcode_id: librispeech-1
 size_categories:
 - 100K<n<1M
 source_datasets:
 - original
 task_categories:
-- other
+- automatic-speech-recognition
 task_ids:
-- other-other-automatic speech recognition
-paperswithcode_id: librispeech-1
+- speech-recognition
 ---

# Dataset Card for librispeech_asr
@@ -181,7 +182,16 @@ CC BY 4.0

### Citation Information

[Needs More Information]
```
@inproceedings{panayotov2015librispeech,
title={Librispeech: an ASR corpus based on public domain audio books},
author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
pages={5206--5210},
year={2015},
organization={IEEE}
}
```

### Contributions

2 changes: 1 addition & 1 deletion datasets/librispeech_asr/dataset_infos.json
@@ -1 +1 @@
{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 11823891, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 43049490, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 894510, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 868614, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 56636505, "size_in_bytes": 30178014159}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 59561081, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 907644, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 934838, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 61403563, "size_in_bytes": 31297968940}}
{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 11823891, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 43049490, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 894510, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 868614, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 56636505, "size_in_bytes": 30178014159}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. 
The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "speech", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 59561081, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 907644, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 934838, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 61403563, "size_in_bytes": 31297968940}}
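
The description string embedded in the JSON above already contains a small decoding recipe; here it is pulled out as a self-contained sketch (assuming `soundfile` is installed and the dataset has been prepared locally), since the audio is kept as `.flac` file paths rather than decoded arrays.

```python
import soundfile as sf
from datasets import load_dataset

# Audio is stored as .flac paths to keep the prepared dataset small on disk.
ds = load_dataset("librispeech_asr", "clean", split="validation")

def map_to_array(batch):
    # Decode the flac file referenced by the "file" column into a float32 waveform.
    speech_array, _ = sf.read(batch["file"])
    batch["speech"] = speech_array
    return batch

ds = ds.map(map_to_array, remove_columns=["file"])
```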