use old url for conll2003

huggingface · Jan 19, 2022 · 1f946a2 · 1f946a2 · github-actions · Jan 19, 2022
1 parent b360737
commit 1f946a2
Show file tree

Hide file tree

Showing 2 changed files with 2 additions and 2 deletions.
diff --git a/datasets/conll2003/conll2003.py b/datasets/conll2003/conll2003.py
@@ -50,7 +50,7 @@
 For more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419
 """
 
-_URL = "https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/"
+_URL = "https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/"
 _TRAINING_FILE = "train.txt"
 _DEV_FILE = "valid.txt"
 _TEST_FILE = "test.txt"

diff --git a/datasets/conll2003/dataset_infos.json b/datasets/conll2003/dataset_infos.json
@@ -1 +1 @@
-{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n    title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n    author = \"Tjong Kim Sang, Erik F.  and\n      De Meulder, Fien\",\n    booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n    year = \"2003\",\n    url = \"https://www.aclweb.org/anthology/W03-0419\",\n    pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931393, "num_examples": 14041, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739247, "num_examples": 3250, "dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582078, "num_examples": 3453, "dataset_name": "conll2003"}}, "download_checksums": {"https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt": {"num_bytes": 3283418, "checksum": "c99b26852dabf57ca9d30a0e892b84544cc8962003151e14a71077c55dc66db5"}, "https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt": {"num_bytes": 827441, "checksum": "f1f6469322876887de1d04acd43c59b02f59d5b02acf42c027132fa1bf349cb2"}, "https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt": {"num_bytes": 748093, "checksum": "82e0c72d262f86ad3e78b15c5d980bbf87cb205aa4bf6d2d97643f463f8d7ff7"}}, "download_size": 4858952, "post_processing_size": null, "dataset_size": 10252718, "size_in_bytes": 15111670}}
+{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n    title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n    author = \"Tjong Kim Sang, Erik F.  and\n      De Meulder, Fien\",\n    booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n    year = \"2003\",\n    url = \"https://www.aclweb.org/anthology/W03-0419\",\n    pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931393, "num_examples": 14041, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739247, "num_examples": 3250, "dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582078, "num_examples": 3453, "dataset_name": "conll2003"}}, "download_checksums": {"https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt": {"num_bytes": 3283418, "checksum": "c99b26852dabf57ca9d30a0e892b84544cc8962003151e14a71077c55dc66db5"}, "https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/valid.txt": {"num_bytes": 827441, "checksum": "f1f6469322876887de1d04acd43c59b02f59d5b02acf42c027132fa1bf349cb2"}, "https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/test.txt": {"num_bytes": 748093, "checksum": "82e0c72d262f86ad3e78b15c5d980bbf87cb205aa4bf6d2d97643f463f8d7ff7"}}, "download_size": 4858952, "post_processing_size": null, "dataset_size": 10252718, "size_in_bytes": 15111670}}