Skip to content

Commit

Permalink
use old url for conll2003
Browse files: browse the repository at this point in the history
  • Loading branch information
lhoestq committed Jan 19, 2022
1 parent b360737 commit 1f946a2
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion datasets/conll2003/conll2003.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
For more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419
"""

-_URL = "https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/"
+_URL = "https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/"
_TRAINING_FILE = "train.txt"
_DEV_FILE = "valid.txt"
_TEST_FILE = "test.txt"
Expand Down
2 changes: 1 addition & 1 deletion datasets/conll2003/dataset_infos.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n author = \"Tjong Kim Sang, Erik F. 
and\n De Meulder, Fien\",\n booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n year = \"2003\",\n url = \"https://www.aclweb.org/anthology/W03-0419\",\n pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931393, "num_examples": 14041, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739247, "num_examples": 3250, "dataset_name": "conll2003"}, 
"test": {"name": "test", "num_bytes": 1582078, "num_examples": 3453, "dataset_name": "conll2003"}}, "download_checksums": {"https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt": {"num_bytes": 3283418, "checksum": "c99b26852dabf57ca9d30a0e892b84544cc8962003151e14a71077c55dc66db5"}, "https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt": {"num_bytes": 827441, "checksum": "f1f6469322876887de1d04acd43c59b02f59d5b02acf42c027132fa1bf349cb2"}, "https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt": {"num_bytes": 748093, "checksum": "82e0c72d262f86ad3e78b15c5d980bbf87cb205aa4bf6d2d97643f463f8d7ff7"}}, "download_size": 4858952, "post_processing_size": null, "dataset_size": 10252718, "size_in_bytes": 15111670}}
{"conll2003": {"description": "The shared task of CoNLL-2003 concerns language-independent named entity recognition. We will concentrate on\nfour types of named entities: persons, locations, organizations and names of miscellaneous entities that do\nnot belong to the previous three groups.\n\nThe CoNLL-2003 shared task data files contain four columns separated by a single space. Each word has been put on\na separate line and there is an empty line after each sentence. The first item on each line is a word, the second\na part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags\nand the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE. Only\nif two phrases of the same type immediately follow each other, the first word of the second phrase will have tag\nB-TYPE to show that it starts a new phrase. A word with tag O is not part of a phrase. Note the dataset uses IOB2\ntagging scheme, whereas the original dataset uses IOB1.\n\nFor more details see https://www.clips.uantwerpen.be/conll2003/ner/ and https://www.aclweb.org/anthology/W03-0419\n", "citation": "@inproceedings{tjong-kim-sang-de-meulder-2003-introduction,\n title = \"Introduction to the {C}o{NLL}-2003 Shared Task: Language-Independent Named Entity Recognition\",\n author = \"Tjong Kim Sang, Erik F. 
and\n De Meulder, Fien\",\n booktitle = \"Proceedings of the Seventh Conference on Natural Language Learning at {HLT}-{NAACL} 2003\",\n year = \"2003\",\n url = \"https://www.aclweb.org/anthology/W03-0419\",\n pages = \"142--147\",\n}\n", "homepage": "https://www.aclweb.org/anthology/W03-0419/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pos_tags": {"feature": {"num_classes": 47, "names": ["\"", "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "chunk_tags": {"feature": {"num_classes": 23, "names": ["O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "ner_tags": {"feature": {"num_classes": 9, "names": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "conll2003", "config_name": "conll2003", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6931393, "num_examples": 14041, "dataset_name": "conll2003"}, "validation": {"name": "validation", "num_bytes": 1739247, "num_examples": 3250, 
"dataset_name": "conll2003"}, "test": {"name": "test", "num_bytes": 1582078, "num_examples": 3453, "dataset_name": "conll2003"}}, "download_checksums": {"https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/train.txt": {"num_bytes": 3283418, "checksum": "c99b26852dabf57ca9d30a0e892b84544cc8962003151e14a71077c55dc66db5"}, "https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/valid.txt": {"num_bytes": 827441, "checksum": "f1f6469322876887de1d04acd43c59b02f59d5b02acf42c027132fa1bf349cb2"}, "https://raw.githubusercontent.com/davidsbatista/NER-datasets/dcb6c7439a7de43abc2448bad5b1d81a47f26c0d/CONLL2003/test.txt": {"num_bytes": 748093, "checksum": "82e0c72d262f86ad3e78b15c5d980bbf87cb205aa4bf6d2d97643f463f8d7ff7"}}, "download_size": 4858952, "post_processing_size": null, "dataset_size": 10252718, "size_in_bytes": 15111670}}

1 comment on commit 1f946a2

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.012773 / 0.011353 (0.001420) 0.004740 / 0.011008 (-0.006269) 0.040841 / 0.038508 (0.002333) 0.039872 / 0.023109 (0.016763) 0.415988 / 0.275898 (0.140090) 0.455962 / 0.323480 (0.132482) 0.009595 / 0.007986 (0.001609) 0.006329 / 0.004328 (0.002000) 0.011180 / 0.004250 (0.006930) 0.046790 / 0.037052 (0.009738) 0.400819 / 0.258489 (0.142330) 0.461075 / 0.293841 (0.167234) 0.048628 / 0.128546 (-0.079919) 0.014312 / 0.075646 (-0.061334) 0.325669 / 0.419271 (-0.093603) 0.064153 / 0.043533 (0.020620) 0.392701 / 0.255139 (0.137562) 0.448923 / 0.283200 (0.165724) 0.092442 / 0.141683 (-0.049240) 2.178602 / 1.452155 (0.726448) 2.230570 / 1.492716 (0.737853)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.389335 / 0.018006 (0.371329) 0.599638 / 0.000490 (0.599149) 0.029702 / 0.000200 (0.029502) 0.000361 / 0.000054 (0.000307)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.045308 / 0.037411 (0.007897) 0.028108 / 0.014526 (0.013582) 0.039074 / 0.176557 (-0.137483) 0.079134 / 0.737135 (-0.658001) 0.036496 / 0.296338 (-0.259843)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.655867 / 0.215209 (0.440658) 6.562538 / 2.077655 (4.484883) 2.492336 / 1.504120 (0.988216) 2.110166 / 1.541195 (0.568971) 2.159969 / 1.468490 (0.691479) 0.769896 / 4.584777 (-3.814881) 6.892635 / 3.745712 (3.146923) 3.203343 / 5.269862 (-2.066518) 1.569793 / 4.565676 (-2.995883) 0.088093 / 0.424275 (-0.336182) 0.014415 / 0.007607 (0.006808) 0.846519 / 0.226044 (0.620475) 8.257048 / 2.268929 (5.988119) 3.356645 / 55.444624 (-52.087980) 2.518962 / 6.876477 (-4.357514) 2.551192 / 2.142072 (0.409120) 0.961667 / 4.805227 (-3.843560) 0.195737 / 6.500664 (-6.304928) 0.079556 / 0.075469 (0.004086)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 2.070293 / 1.841788 (0.228505) 15.450512 / 8.074308 (7.376204) 43.869445 / 10.191392 (33.678053) 1.058489 / 0.680424 (0.378065) 0.720413 / 0.534201 (0.186212) 0.719004 / 0.579283 (0.139721) 0.836877 / 0.434364 (0.402513) 0.463364 / 0.540337 (-0.076974) 0.485377 / 1.386936 (-0.901559)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.009782 / 0.011353 (-0.001571) 0.005312 / 0.011008 (-0.005697) 0.037127 / 0.038508 (-0.001381) 0.038308 / 0.023109 (0.015199) 0.395259 / 0.275898 (0.119361) 0.426703 / 0.323480 (0.103223) 0.007245 / 0.007986 (-0.000741) 0.006019 / 0.004328 (0.001690) 0.008077 / 0.004250 (0.003826) 0.043623 / 0.037052 (0.006571) 0.382801 / 0.258489 (0.124312) 0.413340 / 0.293841 (0.119499) 0.049109 / 0.128546 (-0.079437) 0.014264 / 0.075646 (-0.061382) 0.322160 / 0.419271 (-0.097112) 0.063561 / 0.043533 (0.020028) 0.395292 / 0.255139 (0.140153) 0.417135 / 0.283200 (0.133936) 0.090110 / 0.141683 (-0.051573) 2.190593 / 1.452155 (0.738439) 2.311176 / 1.492716 (0.818460)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.327813 / 0.018006 (0.309807) 0.659345 / 0.000490 (0.658855) 0.014058 / 0.000200 (0.013858) 0.000171 / 0.000054 (0.000117)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.038357 / 0.037411 (0.000946) 0.026465 / 0.014526 (0.011939) 0.038124 / 0.176557 (-0.138432) 0.094194 / 0.737135 (-0.642942) 0.037422 / 0.296338 (-0.258917)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.625071 / 0.215209 (0.409861) 6.274524 / 2.077655 (4.196870) 2.493929 / 1.504120 (0.989809) 2.056075 / 1.541195 (0.514880) 2.085981 / 1.468490 (0.617491) 0.768014 / 4.584777 (-3.816763) 7.052988 / 3.745712 (3.307276) 5.409434 / 5.269862 (0.139573) 1.560631 / 4.565676 (-3.005045) 0.101187 / 0.424275 (-0.323088) 0.018317 / 0.007607 (0.010710) 0.828119 / 0.226044 (0.602074) 8.026035 / 2.268929 (5.757106) 3.248080 / 55.444624 (-52.196544) 2.520038 / 6.876477 (-4.356439) 2.591103 / 2.142072 (0.449031) 0.935241 / 4.805227 (-3.869986) 0.196031 / 6.500664 (-6.304633) 0.078379 / 0.075469 (0.002910)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 2.143303 / 1.841788 (0.301515) 16.688227 / 8.074308 (8.613918) 45.669999 / 10.191392 (35.478607) 1.091218 / 0.680424 (0.410794) 0.786096 / 0.534201 (0.251895) 0.667789 / 0.579283 (0.088506) 0.799342 / 0.434364 (0.364978) 0.425615 / 0.540337 (-0.114722) 0.433635 / 1.386936 (-0.953301)

CML watermark

Please sign in to comment.