From 057f0eef16844289b7959ac340154ae2f9aeff58 Mon Sep 17 00:00:00 2001 From: iliemihai Date: Sat, 12 Dec 2020 00:12:42 +0200 Subject: [PATCH 1/6] Added RONEC dataset. --- datasets/ronec/dataset_infos.json | 1 + datasets/ronec/ronec.py | 238 ++++++++++++++++++++++++++++++ 2 files changed, 239 insertions(+) create mode 100644 datasets/ronec/dataset_infos.json create mode 100644 datasets/ronec/ronec.py diff --git a/datasets/ronec/dataset_infos.json b/datasets/ronec/dataset_infos.json new file mode 100644 index 00000000000..a97100fcc46 --- /dev/null +++ b/datasets/ronec/dataset_infos.json @@ -0,0 +1 @@ +{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition \n", "citation": "@article{dumitrescu2019introducing,\n title={Introducing RONEC--the Romanian Named Entity Corpus},\n author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n journal={arXiv preprint arXiv:1909.01247},\n year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}} \ No newline at end of file diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py new file mode 100644 index 00000000000..ef70b715cbb --- /dev/null +++ b/datasets/ronec/ronec.py @@ -0,0 +1,238 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Introduction in RONEC: Named Entity Corpus for ROmanian language""" + +from __future__ import absolute_import, division, print_function + +import csv +import json +import os +import logging +import datasets + + +# Find for instance the citation on arxiv or on the dataset repo/website +_CITATION = """\ +@article{dumitrescu2019introducing, + title={Introducing RONEC--the Romanian Named Entity Corpus}, + author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius}, + journal={arXiv preprint arXiv:1909.01247}, + year={2019} +} +""" + +# You can copy an official description +_DESCRIPTION = """\ +The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence, +belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition +""" + +_HOMEPAGE = "https://github.com/dumitrescustefan/ronec" + +_LICENSE = "MIT License" + +_FILE_FORMAT = "ronec.conllup" + +# The HuggingFace dataset library don't host the datasets but only point to the original files +# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) +_URL = ( + "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/" +) +_TRAINING_FILE = "train.conllu" +_TEST_FILE = "test.conllu" +_DEV_FILE = "dev.conllu" + + +class RONECConfig(datasets.BuilderConfig): + """BuilderConfig for RONEC dataset""" + + def __init__(self, **kwargs): + super(RONECConfig, self).__init__(**kwargs) + + +# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case +class RONEC(datasets.GeneratorBasedBuilder): + """RONEC dataset""" + + VERSION = datasets.Version("1.0.0") + DEFAULT_CONFIG_NAME = "ronec" + # This is an example of a dataset with multiple configurations. + # If you don't want/need to define several sub-sets in your dataset, + # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. + + # If you need to make complex sub-parts in the datasets with configurable options + # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig + # BUILDER_CONFIG_CLASS = MyBuilderConfig + + # You will be able to load one or the other configurations in the following list with + # data = datasets.load_dataset('my_dataset', 'first_domain') + # data = datasets.load_dataset('my_dataset', 'second_domain') + BUILDER_CONFIGS = [ + RONECConfig( + name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset" + ), + ] + + def _info(self): + # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset + + features = datasets.Features( + { + "id": datasets.Value("string"), + "sentence": datasets.Sequence(datasets.Value("string")), + "start": datasets.Sequence(datasets.Value("int32")), + "end": datasets.Sequence(datasets.Value("int32")), + "ronec_class": datasets.Sequence( + datasets.features.ClassLabel( + names=[ + "O", + "DATETIME", + "EVENT", + "FACILITY", + "GPE", + "LANGUAGE", + "LOC", + "MONEY", + "NAT_REL_POL", + "NUMERIC_VALUE", + "ORDINAL", + "ORGANIZATION", + "PERIOD", + "PERSON", + "PRODUCT", + "QUANTITY", + "WORK_OF_ART", + ] + ) + ), + } + ) + + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, + # specify them here. They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs + # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. + # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive + urls_to_download = { + "train": os.path.join(_URL, _TRAINING_FILE), + "dev": os.path.join(_URL, _DEV_FILE), + "test": os.path.join(_URL, _TEST_FILE), + } + + # Download .zip file + downloaded_files = dl_manager.download(urls_to_download) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": downloaded_files["train"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": downloaded_files["test"]}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": downloaded_files["dev"]}, + ), + ] + + def _generate_examples(self, filepath): + """ Yields examples. """ + # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method. + # It is in charge of opening the given file and yielding (key, example) tuples from the dataset + # The key is not important, it's more here for legacy reason (legacy from tfds) + logging.info("⏳ Generating examples from = %s", filepath) + with open(filepath, encoding="utf-8") as f: + guid = 0 + sentence = [] + ronec_class = [] + start = [] + end = [] + sent = "" + has_started = False + for line in f: + if "#" in line or line == "\n": + if sent: + sentence.append(sent) + yield guid, { + "id": str(guid), + "sentence": sentence, + "start": start, + "end": end, + "ronec_class": ronec_class, + } + guid += 1 + sentence = [] + start = [] + end = [] + ronec_class = [] + sent = "" + else: + # ronec words are tab separated + line = line.replace("\n", "") + splits = line.split("\t") + if splits[9] == "SpaceAfter=No": + sent += splits[1] + else: + sent += splits[1] + " " + + if splits[10].startswith("O") and not has_started: + continue + elif splits[10].startswith("B-"): + begin = len(sent) - len(splits[1]) + last = len(sent) + label = splits[10][2:] + has_started = True + elif splits[10].startswith("I-"): + last = len(sent) + elif splits[10].startswith("O") and begin: + # print("AICIA4",sent, ronec_class) + ronec_class.append(label) + start.append(begin) + end.append(last) + has_started = False + + # last example + yield guid, { + "id": str(guid), + "sentence": sentence, + "start": start, + "end": end, + "ronec_class": ronec_class, + } From 800b2d86638a4d4047863a1aed90daa6a6a7aa44 Mon Sep 17 00:00:00 2001 From: iliemihai Date: Sat, 12 Dec 2020 01:42:07 +0200 Subject: [PATCH 2/6] Added dummy data for RONEC --- .../ronec/dummy/ronec/1.0.0/dummy_data.zip | Bin 0 -> 486 bytes datasets/ronec/ronec.py | 33 +++--------------- 2 files changed, 5 insertions(+), 28 deletions(-) create mode 100644 datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip diff --git a/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip b/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..6b66ef4cafde89ca0fa45ef438651fec066e9303 GIT binary patch literal 486 zcmWIWW@Zs#0D(51oFQnJB8Zl+-dLO^i$;%(y)RajpQvTSpKJ#cME) z*gXf)$iT3qQ4>`ol2-$~5oTcXD#DC-G&7Jqj%)}vk0T73hH3~R7y`Um*+9xzfRL4u Jf#Eb*4*(PBR>A-P literal 0 HcmV?d00001 diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py index ef70b715cbb..85187df99f5 100644 --- a/datasets/ronec/ronec.py +++ b/datasets/ronec/ronec.py @@ -18,8 +18,9 @@ import csv import json -import os import logging +import os + import datasets @@ -47,9 +48,7 @@ # The HuggingFace dataset library don't host the datasets but only point to the original files # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) -_URL = ( - "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/" -) +_URL = "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/" _TRAINING_FILE = "train.conllu" _TEST_FILE = "test.conllu" _DEV_FILE = "dev.conllu" @@ -62,31 +61,17 @@ def __init__(self, **kwargs): super(RONECConfig, self).__init__(**kwargs) -# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case class RONEC(datasets.GeneratorBasedBuilder): """RONEC dataset""" VERSION = datasets.Version("1.0.0") DEFAULT_CONFIG_NAME = "ronec" - # This is an example of a dataset with multiple configurations. - # If you don't want/need to define several sub-sets in your dataset, - # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. - # If you need to make complex sub-parts in the datasets with configurable options - # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig - # BUILDER_CONFIG_CLASS = MyBuilderConfig - - # You will be able to load one or the other configurations in the following list with - # data = datasets.load_dataset('my_dataset', 'first_domain') - # data = datasets.load_dataset('my_dataset', 'second_domain') BUILDER_CONFIGS = [ - RONECConfig( - name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset" - ), + RONECConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset"), ] def _info(self): - # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset features = datasets.Features( { @@ -139,12 +124,7 @@ def _info(self): def _split_generators(self, dl_manager): """Returns SplitGenerators.""" - # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration - # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name - # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs - # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. - # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive urls_to_download = { "train": os.path.join(_URL, _TRAINING_FILE), "dev": os.path.join(_URL, _DEV_FILE), @@ -174,9 +154,7 @@ def _split_generators(self, dl_manager): def _generate_examples(self, filepath): """ Yields examples. """ - # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method. - # It is in charge of opening the given file and yielding (key, example) tuples from the dataset - # The key is not important, it's more here for legacy reason (legacy from tfds) + logging.info("⏳ Generating examples from = %s", filepath) with open(filepath, encoding="utf-8") as f: guid = 0 @@ -222,7 +200,6 @@ def _generate_examples(self, filepath): elif splits[10].startswith("I-"): last = len(sent) elif splits[10].startswith("O") and begin: - # print("AICIA4",sent, ronec_class) ronec_class.append(label) start.append(begin) end.append(last) From 45e2c201e5d1fd08fb30a73626e09d5c9656434c Mon Sep 17 00:00:00 2001 From: iliemihai Date: Sat, 12 Dec 2020 12:36:02 +0200 Subject: [PATCH 3/6] Resolved coding style RONEC --- datasets/ronec/ronec.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py index 85187df99f5..ad3845b706c 100644 --- a/datasets/ronec/ronec.py +++ b/datasets/ronec/ronec.py @@ -16,8 +16,6 @@ from __future__ import absolute_import, division, print_function -import csv -import json import logging import os @@ -37,7 +35,7 @@ # You can copy an official description _DESCRIPTION = """\ The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence, -belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition +belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition """ _HOMEPAGE = "https://github.com/dumitrescustefan/ronec" @@ -131,7 +129,6 @@ def _split_generators(self, dl_manager): "test": os.path.join(_URL, _TEST_FILE), } - # Download .zip file downloaded_files = dl_manager.download(urls_to_download) return [ From 60cf0631ad7bf28e2d68281b2833b347ce9dfed1 Mon Sep 17 00:00:00 2001 From: iliemihai Date: Sat, 12 Dec 2020 13:35:59 +0200 Subject: [PATCH 4/6] Resolved errors RONEC --- datasets/ronec/dataset_infos.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/ronec/dataset_infos.json b/datasets/ronec/dataset_infos.json index a97100fcc46..acd5120e767 100644 --- a/datasets/ronec/dataset_infos.json +++ b/datasets/ronec/dataset_infos.json @@ -1 +1 @@ -{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition \n", "citation": "@article{dumitrescu2019introducing,\n title={Introducing RONEC--the Romanian Named Entity Corpus},\n author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n journal={arXiv preprint arXiv:1909.01247},\n year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}} \ No newline at end of file +{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition\n", "citation": "@article{dumitrescu2019introducing,\n title={Introducing RONEC--the Romanian Named Entity Corpus},\n author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n journal={arXiv preprint arXiv:1909.01247},\n year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}} \ No newline at end of file From 2723e2bc550ff5bb0ea71272ba2c9b84e7378f3e Mon Sep 17 00:00:00 2001 From: iliemihai Date: Sun, 13 Dec 2020 03:52:16 +0200 Subject: [PATCH 5/6] Added Romanian sentiment dataset --- datasets/rosent/dataset_infos.json | 1 + .../rosent/1.0.0/dummy_data/dummy_data.zip | Bin 0 -> 312 bytes datasets/rosent/rosent.py | 129 ++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 datasets/rosent/dataset_infos.json create mode 100644 datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip create mode 100644 datasets/rosent/rosent.py diff --git a/datasets/rosent/dataset_infos.json b/datasets/rosent/dataset_infos.json new file mode 100644 index 00000000000..7b38c2a88b3 --- /dev/null +++ b/datasets/rosent/dataset_infos.json @@ -0,0 +1 @@ +{"rosent": {"description": "This new dataset is a Romanian sentiment analysis dataset.\n", "citation": "", "homepage": "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "label": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "rosent_dataset", "config_name": "rosent", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8289100, "num_examples": 17941, "dataset_name": "rosent_dataset"}, "test": {"name": "test", "num_bytes": 6793523, "num_examples": 11005, "dataset_name": "rosent_dataset"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/train.csv": {"num_bytes": 8048544, "checksum": "5b5f36aba3895e75832d1f084459f23ebeec0418d55ab1fbaa015d154879ed0f"}, "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/test.csv": {"num_bytes": 6651513, "checksum": "2491ca36849e7055f1497575fa691f91671d64b7365f43a3be84ce552b6b65bd"}}, "download_size": 14700057, "post_processing_size": null, "dataset_size": 15082623, "size_in_bytes": 29782680}} \ No newline at end of file diff --git a/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip b/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..20acab4f7a6f5a141535b0d7dd8b37449ff0ad95 GIT binary patch literal 312 zcmWIWW@h1H00DuHoC#Y zUIK{#xDmo=MsR?PNG&dbn;;6*4Z;{EFfz$8<8p=s+*L~&K};k^u|gb$$2d$UAsgom aG7j!QplMhg$I1q>mI(;w0_j!|hXDY0D>gU) literal 0 HcmV?d00001 diff --git a/datasets/rosent/rosent.py b/datasets/rosent/rosent.py new file mode 100644 index 00000000000..a357b5b9492 --- /dev/null +++ b/datasets/rosent/rosent.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Introduction in a Romanian sentiment dataset.""" + +from __future__ import absolute_import, division, print_function + +import csv +import logging +import os + +import pandas as pd + +import datasets + + +# Find for instance the citation on arxiv or on the dataset repo/website +_CITATION = "" + +_DESCRIPTION = """\ +This new dataset is a Romanian sentiment analysis dataset. +""" + +_HOMEPAGE = "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets" + +_LICENSE = "" + +_URL = ( + "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/" +) +_TRAINING_FILE = "train.csv" +_TEST_FILE = "test.csv" + + +class ROSENTConfig(datasets.BuilderConfig): + """BuilderConfig for ROSENT dataset""" + + def __init__(self, **kwargs): + super(ROSENTConfig, self).__init__(**kwargs) + + +class ROSENTDataset(datasets.GeneratorBasedBuilder): + """Romanian sentiment dataset.""" + + VERSION = datasets.Version("1.0.0") + DEFAULT_CONFIG_NAME = "rosent" + + BUILDER_CONFIGS = [ + ROSENTConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="Romanian sentiment dataset."), + ] + + def _info(self): + + features = datasets.Features( + { + "id": datasets.Value("string"), + "sentence": datasets.Sequence(datasets.Value("string")), + "label": datasets.Sequence(datasets.Value("int32")), + } + ) + + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, + # specify them here. They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + urls_to_download_train = _URL + _TRAINING_FILE + urls_to_download_test = _URL + _TEST_FILE + + train_path = dl_manager.download(urls_to_download_train) + test_path = dl_manager.download(urls_to_download_test) + print("FISIERE LUATE", train_path, test_path) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": train_path}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={"filepath": test_path}, + ), + ] + + def _generate_examples(self, filepath): + """ Yields examples. """ + + logging.info("⏳ Generating examples from = %s", filepath) + with open(filepath, encoding="utf-8") as f: + # data = pd.read_csv(filepath) + data = csv.reader(f, delimiter=",", quotechar='"') + + next(data, None) + for row_id, row in enumerate(data): + print("ROW", row) + id, txt, lbl = row + yield "{}_{}".format(row_id, id), { + "id": id, + "sentence": [txt], + "label": [lbl], + } From 429f274ad6d1af3e7e53ee0df68fec9b17579c47 Mon Sep 17 00:00:00 2001 From: iliemihai Date: Sun, 13 Dec 2020 03:54:49 +0200 Subject: [PATCH 6/6] Removed unused libraries --- datasets/rosent/rosent.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/datasets/rosent/rosent.py b/datasets/rosent/rosent.py index a357b5b9492..8042a33342a 100644 --- a/datasets/rosent/rosent.py +++ b/datasets/rosent/rosent.py @@ -18,9 +18,6 @@ import csv import logging -import os - -import pandas as pd import datasets