From 057f0eef16844289b7959ac340154ae2f9aeff58 Mon Sep 17 00:00:00 2001
From: iliemihai <ily_sin_mike@yahoo.com>
Date: Sat, 12 Dec 2020 00:12:42 +0200
Subject: [PATCH 1/6] Added RONEC dataset.

---
 datasets/ronec/dataset_infos.json |   1 +
 datasets/ronec/ronec.py           | 238 ++++++++++++++++++++++++++++++
 2 files changed, 239 insertions(+)
 create mode 100644 datasets/ronec/dataset_infos.json
 create mode 100644 datasets/ronec/ronec.py

diff --git a/datasets/ronec/dataset_infos.json b/datasets/ronec/dataset_infos.json
new file mode 100644
index 00000000000..a97100fcc46
--- /dev/null
+++ b/datasets/ronec/dataset_infos.json
@@ -0,0 +1 @@
+{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition \n", "citation": "@article{dumitrescu2019introducing,\n  title={Introducing RONEC--the Romanian Named Entity Corpus},\n  author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n  journal={arXiv preprint arXiv:1909.01247},\n  year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": 
{"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}}
\ No newline at end of file
diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py
new file mode 100644
index 00000000000..ef70b715cbb
--- /dev/null
+++ b/datasets/ronec/ronec.py
@@ -0,0 +1,238 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Introduction in RONEC: Named Entity Corpus for ROmanian language"""
+
+from __future__ import absolute_import, division, print_function
+
+import csv
+import json
+import os
+import logging
+import datasets
+
+
+# Find for instance the citation on arxiv or on the dataset repo/website
+_CITATION = """\
+@article{dumitrescu2019introducing,
+  title={Introducing RONEC--the Romanian Named Entity Corpus},
+  author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},
+  journal={arXiv preprint arXiv:1909.01247},
+  year={2019}
+}
+"""
+
+# You can copy an official description
+_DESCRIPTION = """\
+The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,
+belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition 
+"""
+
+_HOMEPAGE = "https://github.com/dumitrescustefan/ronec"
+
+_LICENSE = "MIT License"
+
+_FILE_FORMAT = "ronec.conllup"
+
+# The HuggingFace dataset library don't host the datasets but only point to the original files
+# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
+_URL = (
+    "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/"
+)
+_TRAINING_FILE = "train.conllu"
+_TEST_FILE = "test.conllu"
+_DEV_FILE = "dev.conllu"
+
+
+class RONECConfig(datasets.BuilderConfig):
+    """BuilderConfig for the RONEC dataset; it adds no options of its own."""
+
+    def __init__(self, **kwargs):  # every keyword argument is forwarded to datasets.BuilderConfig
+        super(RONECConfig, self).__init__(**kwargs)
+
+
+# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
+class RONEC(datasets.GeneratorBasedBuilder):
+    """RONEC dataset"""
+
+    VERSION = datasets.Version("1.0.0")
+    DEFAULT_CONFIG_NAME = "ronec"
+    # This is an example of a dataset with multiple configurations.
+    # If you don't want/need to define several sub-sets in your dataset,
+    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.
+
+    # If you need to make complex sub-parts in the datasets with configurable options
+    # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig
+    # BUILDER_CONFIG_CLASS = MyBuilderConfig
+
+    # You will be able to load one or the other configurations in the following list with
+    # data = datasets.load_dataset('my_dataset', 'first_domain')
+    # data = datasets.load_dataset('my_dataset', 'second_domain')
+    BUILDER_CONFIGS = [
+        RONECConfig(
+            name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset"
+        ),
+    ]
+
+    def _info(self):
+        # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
+
+        features = datasets.Features(
+            {
+                "id": datasets.Value("string"),  # running example index, stored as a string
+                "sentence": datasets.Sequence(datasets.Value("string")),  # list holding the rebuilt sentence text
+                "start": datasets.Sequence(datasets.Value("int32")),  # one character offset per recorded entity
+                "end": datasets.Sequence(datasets.Value("int32")),  # one character offset per recorded entity
+                "ronec_class": datasets.Sequence(  # one label per recorded entity
+                    datasets.features.ClassLabel(
+                        names=[
+                            "O",
+                            "DATETIME",
+                            "EVENT",
+                            "FACILITY",
+                            "GPE",
+                            "LANGUAGE",
+                            "LOC",
+                            "MONEY",
+                            "NAT_REL_POL",
+                            "NUMERIC_VALUE",
+                            "ORDINAL",
+                            "ORGANIZATION",
+                            "PERIOD",
+                            "PERSON",
+                            "PRODUCT",
+                            "QUANTITY",
+                            "WORK_OF_ART",
+                        ]
+                    )
+                ),
+            }
+        )
+
+        return datasets.DatasetInfo(
+            # This is the description that will appear on the datasets page.
+            description=_DESCRIPTION,
+            # This defines the different columns of the dataset and their types
+            features=features,  # built above; this builder defines a single configuration
+            # If there's a common (input, target) tuple from the features,
+            # specify them here. They'll be used if as_supervised=True in
+            # builder.as_dataset.
+            supervised_keys=None,
+            # Homepage of the dataset for documentation
+            homepage=_HOMEPAGE,
+            # License for the dataset if available
+            license=_LICENSE,
+            # Citation for the dataset
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
+        # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
+
+        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
+        # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
+        # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
+        urls_to_download = {
+            "train": os.path.join(_URL, _TRAINING_FILE),
+            "dev": os.path.join(_URL, _DEV_FILE),
+            "test": os.path.join(_URL, _TEST_FILE),
+        }
+
+        # Download .zip file
+        downloaded_files = dl_manager.download(urls_to_download)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # gen_kwargs are passed to _generate_examples as keyword arguments
+                gen_kwargs={"filepath": downloaded_files["train"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                # gen_kwargs are passed to _generate_examples as keyword arguments
+                gen_kwargs={"filepath": downloaded_files["test"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                # the validation split reads the file downloaded under the "dev" key
+                gen_kwargs={"filepath": downloaded_files["dev"]},
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        """ Yields examples. """
+        # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
+        # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
+        # The key is not important, it's more here for legacy reason (legacy from tfds)
+        logging.info("⏳ Generating examples from = %s", filepath)
+        with open(filepath, encoding="utf-8") as f:
+            guid = 0
+            sentence = []  # receives the rebuilt sentence string of the current example
+            ronec_class = []  # labels of the entities recorded for the current example
+            start = []  # start offsets of those entities, in characters of `sent`
+            end = []  # end offsets of those entities, in characters of `sent`
+            sent = ""  # text of the current sentence, rebuilt token by token
+            has_started = False  # True while an entity is open; NOTE(review): not reset with the other buffers at a boundary
+            for line in f:
+                if "#" in line or line == "\n":  # NOTE(review): "#" anywhere in a row matches, not only a leading "#" — confirm
+                    if sent:
+                        sentence.append(sent)
+                        yield guid, {
+                            "id": str(guid),
+                            "sentence": sentence,
+                            "start": start,
+                            "end": end,
+                            "ronec_class": ronec_class,
+                        }
+                        guid += 1
+                        sentence = []
+                        start = []
+                        end = []
+                        ronec_class = []
+                        sent = ""
+                else:
+                    # Token rows are tab separated; fields 1, 9 and 10 (0-based) are read as word form, SpaceAfter flag and tag.
+                    line = line.replace("\n", "")
+                    splits = line.split("\t")
+                    if splits[9] == "SpaceAfter=No":
+                        sent += splits[1]
+                    else:
+                        sent += splits[1] + " "
+                    # NOTE(review): an entity is appended only by the last "O" branch below; one still open at a sentence boundary is not flushed there — confirm.
+                    if splits[10].startswith("O") and not has_started:
+                        continue
+                    elif splits[10].startswith("B-"):
+                        begin = len(sent) - len(splits[1])  # NOTE(review): `sent` may already end with the added space, making this 1 past the token start — confirm
+                        last = len(sent)  # counts the added space too, when there is one
+                        label = splits[10][2:]  # tag with its two-character "B-" prefix removed
+                        has_started = True
+                    elif splits[10].startswith("I-"):
+                        last = len(sent)
+                    elif splits[10].startswith("O") and begin:
+                        # print("AICIA4",sent, ronec_class)
+                        ronec_class.append(label)
+                        start.append(begin)
+                        end.append(last)
+                        has_started = False  # NOTE(review): skipped when begin == 0, since the branch tests `begin` for truthiness
+
+            # Flush the buffers at end of file. NOTE(review): `sent` is not appended here and emptiness is not checked — confirm.
+            yield guid, {
+                "id": str(guid),
+                "sentence": sentence,
+                "start": start,
+                "end": end,
+                "ronec_class": ronec_class,
+            }

From 800b2d86638a4d4047863a1aed90daa6a6a7aa44 Mon Sep 17 00:00:00 2001
From: iliemihai <ily_sin_mike@yahoo.com>
Date: Sat, 12 Dec 2020 01:42:07 +0200
Subject: [PATCH 2/6] Added dummy data for RONEC

---
 .../ronec/dummy/ronec/1.0.0/dummy_data.zip    | Bin 0 -> 486 bytes
 datasets/ronec/ronec.py                       |  33 +++---------------
 2 files changed, 5 insertions(+), 28 deletions(-)
 create mode 100644 datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip

diff --git a/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip b/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..6b66ef4cafde89ca0fa45ef438651fec066e9303
GIT binary patch
literal 486
zcmWIWW@Zs#0D(51o<J}IN^k?|l+xVX%J`JTl0^Lgs6q~gW~f3Ys6sJRg(a!QC3?yE
zc{w?y%nS%!Fympm#L;vWC1&Pf(N>FQnJB8Zl+-dLO^i$;%(y)RajpQvTSpKJ#cME)
z*gXf)$iT3qQ4>`ol2-$~5oTcXD#DC-G&7Jqj%)}vk0T73hH3~R7y`Um*+9xzfRL4u
Jf#Eb*4*(PBR>A-P

literal 0
HcmV?d00001

diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py
index ef70b715cbb..85187df99f5 100644
--- a/datasets/ronec/ronec.py
+++ b/datasets/ronec/ronec.py
@@ -18,8 +18,9 @@
 
 import csv
 import json
-import os
 import logging
+import os
+
 import datasets
 
 
@@ -47,9 +48,7 @@
 
 # The HuggingFace dataset library don't host the datasets but only point to the original files
 # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
-_URL = (
-    "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/"
-)
+_URL = "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/"
 _TRAINING_FILE = "train.conllu"
 _TEST_FILE = "test.conllu"
 _DEV_FILE = "dev.conllu"
@@ -62,31 +61,17 @@ def __init__(self, **kwargs):
         super(RONECConfig, self).__init__(**kwargs)
 
 
-# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
 class RONEC(datasets.GeneratorBasedBuilder):
     """RONEC dataset"""
 
     VERSION = datasets.Version("1.0.0")
     DEFAULT_CONFIG_NAME = "ronec"
-    # This is an example of a dataset with multiple configurations.
-    # If you don't want/need to define several sub-sets in your dataset,
-    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.
 
-    # If you need to make complex sub-parts in the datasets with configurable options
-    # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig
-    # BUILDER_CONFIG_CLASS = MyBuilderConfig
-
-    # You will be able to load one or the other configurations in the following list with
-    # data = datasets.load_dataset('my_dataset', 'first_domain')
-    # data = datasets.load_dataset('my_dataset', 'second_domain')
     BUILDER_CONFIGS = [
-        RONECConfig(
-            name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset"
-        ),
+        RONECConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset"),
     ]
 
     def _info(self):
-        # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
 
         features = datasets.Features(
             {
@@ -139,12 +124,7 @@ def _info(self):
 
     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
-        # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
 
-        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
-        # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
-        # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
         urls_to_download = {
             "train": os.path.join(_URL, _TRAINING_FILE),
             "dev": os.path.join(_URL, _DEV_FILE),
@@ -174,9 +154,7 @@ def _split_generators(self, dl_manager):
 
     def _generate_examples(self, filepath):
         """ Yields examples. """
-        # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method.
-        # It is in charge of opening the given file and yielding (key, example) tuples from the dataset
-        # The key is not important, it's more here for legacy reason (legacy from tfds)
+
         logging.info("⏳ Generating examples from = %s", filepath)
         with open(filepath, encoding="utf-8") as f:
             guid = 0
@@ -222,7 +200,6 @@ def _generate_examples(self, filepath):
                     elif splits[10].startswith("I-"):
                         last = len(sent)
                     elif splits[10].startswith("O") and begin:
-                        # print("AICIA4",sent, ronec_class)
                         ronec_class.append(label)
                         start.append(begin)
                         end.append(last)

From 45e2c201e5d1fd08fb30a73626e09d5c9656434c Mon Sep 17 00:00:00 2001
From: iliemihai <ily_sin_mike@yahoo.com>
Date: Sat, 12 Dec 2020 12:36:02 +0200
Subject: [PATCH 3/6] Resolved coding style RONEC

---
 datasets/ronec/ronec.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py
index 85187df99f5..ad3845b706c 100644
--- a/datasets/ronec/ronec.py
+++ b/datasets/ronec/ronec.py
@@ -16,8 +16,6 @@
 
 from __future__ import absolute_import, division, print_function
 
-import csv
-import json
 import logging
 import os
 
@@ -37,7 +35,7 @@
 # You can copy an official description
 _DESCRIPTION = """\
 The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,
-belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition 
+belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition
 """
 
 _HOMEPAGE = "https://github.com/dumitrescustefan/ronec"
@@ -131,7 +129,6 @@ def _split_generators(self, dl_manager):
             "test": os.path.join(_URL, _TEST_FILE),
         }
 
-        # Download .zip file
         downloaded_files = dl_manager.download(urls_to_download)
 
         return [

From 60cf0631ad7bf28e2d68281b2833b347ce9dfed1 Mon Sep 17 00:00:00 2001
From: iliemihai <ily_sin_mike@yahoo.com>
Date: Sat, 12 Dec 2020 13:35:59 +0200
Subject: [PATCH 4/6] Resolved errors RONEC

---
 datasets/ronec/dataset_infos.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/ronec/dataset_infos.json b/datasets/ronec/dataset_infos.json
index a97100fcc46..acd5120e767 100644
--- a/datasets/ronec/dataset_infos.json
+++ b/datasets/ronec/dataset_infos.json
@@ -1 +1 @@
-{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition \n", "citation": "@article{dumitrescu2019introducing,\n  title={Introducing RONEC--the Romanian Named Entity Corpus},\n  author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n  journal={arXiv preprint arXiv:1909.01247},\n  year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": 
{"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}}
\ No newline at end of file
+{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition\n", "citation": "@article{dumitrescu2019introducing,\n  title={Introducing RONEC--the Romanian Named Entity Corpus},\n  author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n  journal={arXiv preprint arXiv:1909.01247},\n  year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": 
{"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}}
\ No newline at end of file

From 2723e2bc550ff5bb0ea71272ba2c9b84e7378f3e Mon Sep 17 00:00:00 2001
From: iliemihai <ily_sin_mike@yahoo.com>
Date: Sun, 13 Dec 2020 03:52:16 +0200
Subject: [PATCH 5/6] Added Romanian sentiment dataset

---
 datasets/rosent/dataset_infos.json            |   1 +
 .../rosent/1.0.0/dummy_data/dummy_data.zip    | Bin 0 -> 312 bytes
 datasets/rosent/rosent.py                     | 129 ++++++++++++++++++
 3 files changed, 130 insertions(+)
 create mode 100644 datasets/rosent/dataset_infos.json
 create mode 100644 datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip
 create mode 100644 datasets/rosent/rosent.py

diff --git a/datasets/rosent/dataset_infos.json b/datasets/rosent/dataset_infos.json
new file mode 100644
index 00000000000..7b38c2a88b3
--- /dev/null
+++ b/datasets/rosent/dataset_infos.json
@@ -0,0 +1 @@
+{"rosent": {"description": "This new dataset is a Romanian sentiment analysis dataset.\n", "citation": "", "homepage": "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "label": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "rosent_dataset", "config_name": "rosent", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8289100, "num_examples": 17941, "dataset_name": "rosent_dataset"}, "test": {"name": "test", "num_bytes": 6793523, "num_examples": 11005, "dataset_name": "rosent_dataset"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/train.csv": {"num_bytes": 8048544, "checksum": "5b5f36aba3895e75832d1f084459f23ebeec0418d55ab1fbaa015d154879ed0f"}, "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/test.csv": {"num_bytes": 6651513, "checksum": "2491ca36849e7055f1497575fa691f91671d64b7365f43a3be84ce552b6b65bd"}}, "download_size": 14700057, "post_processing_size": null, "dataset_size": 15082623, "size_in_bytes": 29782680}}
\ No newline at end of file
diff --git a/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip b/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..20acab4f7a6f5a141535b0d7dd8b37449ff0ad95
GIT binary patch
literal 312
zcmWIWW@h1H00DuHo<J}IN^mmBFq9M}X6ET77ng;Ga56CS-@XzL!lf1542&!<m>C#Y
zUIK{#xDmo=MsR?PNG&dbn;;6*4Z;{EFfz$8<8p=s+*L~&K};k^u|gb$$2d$UAsgom
aG7j!QplMhg$I1q>mI(;w0_j!|hXDY0D>gU)

literal 0
HcmV?d00001

diff --git a/datasets/rosent/rosent.py b/datasets/rosent/rosent.py
new file mode 100644
index 00000000000..a357b5b9492
--- /dev/null
+++ b/datasets/rosent/rosent.py
@@ -0,0 +1,129 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Introduction in a Romanian sentiment dataset."""
+
+from __future__ import absolute_import, division, print_function
+
+import csv
+import logging
+import os
+
+import pandas as pd
+
+import datasets
+
+
+# Find for instance the citation on arxiv or on the dataset repo/website
+_CITATION = ""
+
+_DESCRIPTION = """\
+This new dataset is a Romanian sentiment analysis dataset.
+"""
+
+_HOMEPAGE = "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets"
+
+_LICENSE = ""
+
+_URL = (
+    "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/"
+)
+_TRAINING_FILE = "train.csv"
+_TEST_FILE = "test.csv"
+
+
+class ROSENTConfig(datasets.BuilderConfig):
+    """Builder configuration for the ROSENT dataset; it defines no options of its own."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+class ROSENTDataset(datasets.GeneratorBasedBuilder):
+    """Romanian sentiment dataset."""
+
+    VERSION = datasets.Version("1.0.0")
+    DEFAULT_CONFIG_NAME = "rosent"
+
+    BUILDER_CONFIGS = [
+        ROSENTConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="Romanian sentiment dataset."),
+    ]
+
+    def _info(self):
+        """Return the DatasetInfo for this builder: column types plus the module-level metadata."""
+
+        # "sentence" and "label" are sequences because _generate_examples emits
+        # each of them as a one-element list.
+        columns = {
+            "id": datasets.Value("string"),
+            "sentence": datasets.Sequence(datasets.Value("string")),
+            "label": datasets.Sequence(datasets.Value("int32")),
+        }
+
+        return datasets.DatasetInfo(
+            # Text shown on the datasets page.
+            description=_DESCRIPTION,
+            # Column names and types; this builder defines a single configuration.
+            features=datasets.Features(columns),
+            # No (input, target) pair is declared for as_supervised=True in
+            # builder.as_dataset.
+            supervised_keys=None,
+            # Homepage of the dataset for documentation.
+            homepage=_HOMEPAGE,
+            # License of the dataset (an empty string in this script).
+            license=_LICENSE,
+            # Citation for the dataset (an empty string in this script).
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Download the train and test CSV files and build one SplitGenerator per split.
+
+        `dl_manager.download` is called once per URL; each returned value is handed to
+        `_generate_examples` as its `filepath` argument.
+        """
+
+        train_path = dl_manager.download(_URL + _TRAINING_FILE)
+        test_path = dl_manager.download(_URL + _TEST_FILE)
+        # Logged rather than printed to stdout, consistent with _generate_examples.
+        logging.info("Downloaded files: train = %s, test = %s", train_path, test_path)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": train_path},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": test_path},
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yield one ``(key, example)`` pair per data row of the CSV file at `filepath`.
+
+        The first row is skipped; every other row must unpack into exactly three
+        fields (id, text, label), otherwise a ValueError is raised.
+        """
+        logging.info("⏳ Generating examples from = %s", filepath)
+        with open(filepath, encoding="utf-8") as f:
+            data = csv.reader(f, delimiter=",", quotechar='"')
+            next(data, None)  # skip the first row (presumably the column header)
+            for row_id, row in enumerate(data):
+                example_id, txt, lbl = row  # not named `id`, to avoid shadowing the builtin
+                yield "{}_{}".format(row_id, example_id), {
+                    "id": example_id,
+                    "sentence": [txt],
+                    "label": [lbl],
+                }

From 429f274ad6d1af3e7e53ee0df68fec9b17579c47 Mon Sep 17 00:00:00 2001
From: iliemihai <ily_sin_mike@yahoo.com>
Date: Sun, 13 Dec 2020 03:54:49 +0200
Subject: [PATCH 6/6] Removed unused libraries

---
 datasets/rosent/rosent.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/datasets/rosent/rosent.py b/datasets/rosent/rosent.py
index a357b5b9492..8042a33342a 100644
--- a/datasets/rosent/rosent.py
+++ b/datasets/rosent/rosent.py
@@ -18,9 +18,6 @@
 
 import csv
 import logging
-import os
-
-import pandas as pd
 
 import datasets