diff --git a/datasets/ronec/dataset_infos.json b/datasets/ronec/dataset_infos.json new file mode 100644 index 00000000000..acd5120e767 --- /dev/null +++ b/datasets/ronec/dataset_infos.json @@ -0,0 +1 @@ +{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition\n", "citation": "@article{dumitrescu2019introducing,\n title={Introducing RONEC--the Romanian Named Entity Corpus},\n author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n journal={arXiv preprint arXiv:1909.01247},\n year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}} \ No newline at end of file diff --git a/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip b/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip new file mode 100644 index 00000000000..6b66ef4cafd Binary files /dev/null and b/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip differ diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py new file mode 100644 index 00000000000..ad3845b706c --- /dev/null +++ b/datasets/ronec/ronec.py @@ -0,0 +1,212 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Introduction in RONEC: Named Entity Corpus for ROmanian language""" + +from __future__ import absolute_import, division, print_function + +import logging +import os + +import datasets + + +# Find for instance the citation on arxiv or on the dataset repo/website +_CITATION = """\ +@article{dumitrescu2019introducing, + title={Introducing RONEC--the Romanian Named Entity Corpus}, + author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius}, + journal={arXiv preprint arXiv:1909.01247}, + year={2019} +} +""" + +# You can copy an official description +_DESCRIPTION = """\ +The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence, +belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition +""" + +_HOMEPAGE = "https://github.com/dumitrescustefan/ronec" + +_LICENSE = "MIT License" + +_FILE_FORMAT = "ronec.conllup" + +# The HuggingFace dataset library don't host the datasets but only point to the original files +# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) +_URL = "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/" +_TRAINING_FILE = "train.conllu" +_TEST_FILE = "test.conllu" +_DEV_FILE = "dev.conllu" + + +class RONECConfig(datasets.BuilderConfig): + """BuilderConfig for RONEC dataset""" + + def __init__(self, **kwargs): + super(RONECConfig, self).__init__(**kwargs) + + +class RONEC(datasets.GeneratorBasedBuilder): + """RONEC dataset""" + + VERSION = datasets.Version("1.0.0") + DEFAULT_CONFIG_NAME = "ronec" + + BUILDER_CONFIGS = [ + RONECConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset"), + ] + + def _info(self): + + features = datasets.Features( + { + "id": datasets.Value("string"), + "sentence": datasets.Sequence(datasets.Value("string")), + "start": datasets.Sequence(datasets.Value("int32")), + "end": datasets.Sequence(datasets.Value("int32")), + "ronec_class": datasets.Sequence( + datasets.features.ClassLabel( + names=[ + "O", + "DATETIME", + "EVENT", + "FACILITY", + "GPE", + "LANGUAGE", + "LOC", + "MONEY", + "NAT_REL_POL", + "NUMERIC_VALUE", + "ORDINAL", + "ORGANIZATION", + "PERIOD", + "PERSON", + "PRODUCT", + "QUANTITY", + "WORK_OF_ART", + ] + ) + ), + } + ) + + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, + # specify them here. They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + urls_to_download = { + "train": os.path.join(_URL, _TRAINING_FILE), + "dev": os.path.join(_URL, _DEV_FILE), + "test": os.path.join(_URL, _TEST_FILE), + } + + downloaded_files = dl_manager.download(urls_to_download) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": downloaded_files["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": downloaded_files["test"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": downloaded_files["dev"]}, + ), + ] + + def _generate_examples(self, filepath): + """ Yields examples. """ + + logging.info("⏳ Generating examples from = %s", filepath) + with open(filepath, encoding="utf-8") as f: + guid = 0 + sentence = [] + ronec_class = [] + start = [] + end = [] + sent = "" + has_started = False + for line in f: + if "#" in line or line == "\n": + if sent: + sentence.append(sent) + yield guid, { + "id": str(guid), + "sentence": sentence, + "start": start, + "end": end, + "ronec_class": ronec_class, + } + guid += 1 + sentence = [] + start = [] + end = [] + ronec_class = [] + sent = "" + else: + # ronec words are tab separated + line = line.replace("\n", "") + splits = line.split("\t") + if splits[9] == "SpaceAfter=No": + sent += splits[1] + else: + sent += splits[1] + " " + + if splits[10].startswith("O") and not has_started: + continue + elif splits[10].startswith("B-"): + begin = len(sent) - len(splits[1]) + last = len(sent) + label = splits[10][2:] + has_started = True + elif splits[10].startswith("I-"): + last = len(sent) + elif splits[10].startswith("O") and begin: + ronec_class.append(label) + start.append(begin) + end.append(last) + has_started = False + + # last example + yield guid, { + "id": str(guid), + "sentence": sentence, + "start": start, + "end": end, + "ronec_class": ronec_class, + } diff --git a/datasets/rosent/dataset_infos.json b/datasets/rosent/dataset_infos.json new file mode 100644 index 00000000000..7b38c2a88b3 --- /dev/null +++ b/datasets/rosent/dataset_infos.json @@ -0,0 +1 @@ +{"rosent": {"description": "This new dataset is a Romanian sentiment analysis dataset.\n", "citation": "", "homepage": "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "label": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "rosent_dataset", "config_name": "rosent", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8289100, "num_examples": 17941, "dataset_name": "rosent_dataset"}, "test": {"name": "test", "num_bytes": 6793523, "num_examples": 11005, "dataset_name": "rosent_dataset"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/train.csv": {"num_bytes": 8048544, "checksum": "5b5f36aba3895e75832d1f084459f23ebeec0418d55ab1fbaa015d154879ed0f"}, "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/test.csv": {"num_bytes": 6651513, "checksum": "2491ca36849e7055f1497575fa691f91671d64b7365f43a3be84ce552b6b65bd"}}, "download_size": 14700057, "post_processing_size": null, "dataset_size": 15082623, "size_in_bytes": 29782680}} \ No newline at end of file diff --git a/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip b/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip new file mode 100644 index 00000000000..20acab4f7a6 Binary files /dev/null and b/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip differ diff --git a/datasets/rosent/rosent.py b/datasets/rosent/rosent.py new file mode 100644 index 00000000000..8042a33342a --- /dev/null +++ b/datasets/rosent/rosent.py @@ -0,0 +1,126 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Introduction in a Romanian sentiment dataset.""" + +from __future__ import absolute_import, division, print_function + +import csv +import logging + +import datasets + + +# Find for instance the citation on arxiv or on the dataset repo/website +_CITATION = "" + +_DESCRIPTION = """\ +This new dataset is a Romanian sentiment analysis dataset. +""" + +_HOMEPAGE = "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets" + +_LICENSE = "" + +_URL = ( + "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/" +) +_TRAINING_FILE = "train.csv" +_TEST_FILE = "test.csv" + + +class ROSENTConfig(datasets.BuilderConfig): + """BuilderConfig for ROSENT dataset""" + + def __init__(self, **kwargs): + super(ROSENTConfig, self).__init__(**kwargs) + + +class ROSENTDataset(datasets.GeneratorBasedBuilder): + """Romanian sentiment dataset.""" + + VERSION = datasets.Version("1.0.0") + DEFAULT_CONFIG_NAME = "rosent" + + BUILDER_CONFIGS = [ + ROSENTConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="Romanian sentiment dataset."), + ] + + def _info(self): + + features = datasets.Features( + { + "id": datasets.Value("string"), + "sentence": datasets.Sequence(datasets.Value("string")), + "label": datasets.Sequence(datasets.Value("int32")), + } + ) + + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, + # specify them here. They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + urls_to_download_train = _URL + _TRAINING_FILE + urls_to_download_test = _URL + _TEST_FILE + + train_path = dl_manager.download(urls_to_download_train) + test_path = dl_manager.download(urls_to_download_test) + print("FISIERE LUATE", train_path, test_path) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": train_path}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": test_path}, + ), + ] + + def _generate_examples(self, filepath): + """ Yields examples. """ + + logging.info("⏳ Generating examples from = %s", filepath) + with open(filepath, encoding="utf-8") as f: + # data = pd.read_csv(filepath) + data = csv.reader(f, delimiter=",", quotechar='"') + + next(data, None) + for row_id, row in enumerate(data): + print("ROW", row) + id, txt, lbl = row + yield "{}_{}".format(row_id, id), { + "id": id, + "sentence": [txt], + "label": [lbl], + }