Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Multidoc2dial Dataset #3205

Merged
merged 11 commits into from
Nov 24, 2021
465 changes: 465 additions & 0 deletions datasets/multidoc2dial/README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions datasets/multidoc2dial/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"dialogue_domain": {"description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents. \n", "citation": "@inproceedings{feng2021multidoc2dial,\n title={MultiDoc2Dial: Modeling Dialogues Grounded in Multiple Documents},\n author={Feng, Song and Patel, Siva Sankalp and Wan, Hui and Joshi, Sachindra},\n booktitle={EMNLP},\n year={2021}\n}\n", "homepage": "https://doc2dial.github.io/multidoc2dial/", "license": "", "features": {"dial_id": {"dtype": "string", "id": null, "_type": "Value"}, "domain": {"dtype": "string", "id": null, "_type": "Value"}, "turns": [{"turn_id": {"dtype": "int32", "id": null, "_type": "Value"}, "role": {"dtype": "string", "id": null, "_type": "Value"}, "da": {"dtype": "string", "id": null, "_type": "Value"}, "references": [{"id_sp": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}, "doc_id": {"dtype": "string", "id": null, "_type": "Value"}}], "utterance": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "multi_doc2dial", "config_name": "dialogue_domain", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 11700598, "num_examples": 3474, "dataset_name": "multi_doc2dial"}, "validation": {"name": "validation", "num_bytes": 2210378, "num_examples": 661, "dataset_name": "multi_doc2dial"}}, "download_checksums": {"https://doc2dial.github.io/multidoc2dial/file/multidoc2dial.zip": {"num_bytes": 6451144, "checksum": "a8051237dd3be50d81c06aca82ed5171716922e35f44bfa5b9c024f090903419"}}, "download_size": 6451144, "post_processing_size": null, "dataset_size": 13910976, "size_in_bytes": 20362120}, "document_domain": {"description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents. \n", "citation": "@inproceedings{feng2021multidoc2dial,\n title={MultiDoc2Dial: Modeling Dialogues Grounded in Multiple Documents},\n author={Feng, Song and Patel, Siva Sankalp and Wan, Hui and Joshi, Sachindra},\n booktitle={EMNLP},\n year={2021}\n}\n", "homepage": "https://doc2dial.github.io/multidoc2dial/", "license": "", "features": {"domain": {"dtype": "string", "id": null, "_type": "Value"}, "doc_id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "doc_text": {"dtype": "string", "id": null, "_type": "Value"}, "spans": [{"id_sp": {"dtype": "string", "id": null, "_type": "Value"}, "tag": {"dtype": "string", "id": null, "_type": "Value"}, "start_sp": {"dtype": "int32", "id": null, "_type": "Value"}, "end_sp": {"dtype": "int32", "id": null, "_type": "Value"}, "text_sp": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "parent_titles": {"feature": {"id_sp": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "id_sec": {"dtype": "string", "id": null, "_type": "Value"}, "start_sec": {"dtype": "int32", "id": null, "_type": "Value"}, "text_sec": {"dtype": "string", "id": null, "_type": "Value"}, "end_sec": {"dtype": "int32", "id": null, "_type": "Value"}}], "doc_html_ts": {"dtype": "string", "id": null, "_type": "Value"}, "doc_html_raw": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "multi_doc2dial", "config_name": "document_domain", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 29378955, "num_examples": 488, "dataset_name": "multi_doc2dial"}}, "download_checksums": {"https://doc2dial.github.io/multidoc2dial/file/multidoc2dial.zip": {"num_bytes": 6451144, "checksum": "a8051237dd3be50d81c06aca82ed5171716922e35f44bfa5b9c024f090903419"}}, "download_size": 6451144, "post_processing_size": null, "dataset_size": 29378955, "size_in_bytes": 35830099}, "multidoc2dial": {"description": "MultiDoc2Dial is a new task and dataset on modeling goal-oriented dialogues grounded in multiple documents. Most previous works treat document-grounded dialogue modeling as a machine reading comprehension task based on a single given document or passage. We aim to address more realistic scenarios where a goal-oriented information-seeking conversation involves multiple topics, and hence is grounded on different documents. \n", "citation": "@inproceedings{feng2021multidoc2dial,\n title={MultiDoc2Dial: Modeling Dialogues Grounded in Multiple Documents},\n author={Feng, Song and Patel, Siva Sankalp and Wan, Hui and Joshi, Sachindra},\n booktitle={EMNLP},\n year={2021}\n}\n", "homepage": "https://doc2dial.github.io/multidoc2dial/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "da": {"dtype": "string", "id": null, "_type": "Value"}, "answers": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "answer_start": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "utterance": {"dtype": "string", "id": null, "_type": "Value"}, "domain": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "multi_doc2dial", "config_name": "multidoc2dial", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 24331976, "num_examples": 4201, "dataset_name": "multi_doc2dial"}, "train": {"name": "train", "num_bytes": 126589982, "num_examples": 21451, "dataset_name": "multi_doc2dial"}, "test": {"name": "test", "num_bytes": 33032, "num_examples": 5, "dataset_name": "multi_doc2dial"}}, "download_checksums": {"https://doc2dial.github.io/multidoc2dial/file/multidoc2dial.zip": {"num_bytes": 6451144, "checksum": "a8051237dd3be50d81c06aca82ed5171716922e35f44bfa5b9c024f090903419"}}, "download_size": 6451144, "post_processing_size": null, "dataset_size": 150954990, "size_in_bytes": 157406134}}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading