huggingface · iliemihai · Dec 11, 2020 · Dec 11, 2020 · Dec 12, 2020 · Dec 12, 2020
diff --git a/datasets/ronec/dataset_infos.json b/datasets/ronec/dataset_infos.json
@@ -0,0 +1 @@
+{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition\n", "citation": "@article{dumitrescu2019introducing,\n  title={Introducing RONEC--the Romanian Named Entity Corpus},\n  author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n  journal={arXiv preprint arXiv:1909.01247},\n  year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": 
{"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}}
diff --git a/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip b/datasets/ronec/dummy/ronec/1.0.0/dummy_data.zip
diff --git a/datasets/ronec/ronec.py b/datasets/ronec/ronec.py
@@ -0,0 +1,212 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Introduction in RONEC: Named Entity Corpus for ROmanian language"""
+
+from __future__ import absolute_import, division, print_function
+
+import logging
+import os
+
+import datasets
+
+
+# Find for instance the citation on arxiv or on the dataset repo/website
+_CITATION = """\
+@article{dumitrescu2019introducing,
+  title={Introducing RONEC--the Romanian Named Entity Corpus},
+  author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},
+  journal={arXiv preprint arXiv:1909.01247},
+  year={2019}
+}
+"""
+
+# You can copy an official description
+_DESCRIPTION = """\
+The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,
+belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition
+"""
+
+_HOMEPAGE = "https://github.com/dumitrescustefan/ronec"
+
+_LICENSE = "MIT License"
+
+_FILE_FORMAT = "ronec.conllup"
+
+# The HuggingFace datasets library doesn't host the datasets; it only points to the original files.
+# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
+_URL = "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/"
+_TRAINING_FILE = "train.conllu"
+_TEST_FILE = "test.conllu"
+_DEV_FILE = "dev.conllu"
+
+
+class RONECConfig(datasets.BuilderConfig):
+    """Configuration for the RONEC builder; adds nothing on top of ``datasets.BuilderConfig``."""
+
+    def __init__(self, **kwargs):
+        # Every keyword argument is forwarded untouched to the base configuration.
+        super().__init__(**kwargs)
+
+
+class RONEC(datasets.GeneratorBasedBuilder):
+    """Dataset builder for RONEC, the Romanian Named Entity Corpus.
+
+    Every example carries one sentence, rebuilt from its tokens, plus three
+    parallel lists describing the named entities found in it: ``start`` and
+    ``end`` character offsets into the sentence text and the entity class.
+    """
+
+    VERSION = datasets.Version("1.0.0")
+    DEFAULT_CONFIG_NAME = "ronec"
+
+    BUILDER_CONFIGS = [
+        RONECConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset"),
+    ]
+
+    def _info(self):
+        """Return the ``DatasetInfo``: feature schema, description, homepage, license and citation."""
+        features = datasets.Features(
+            {
+                "id": datasets.Value("string"),
+                # One-element list holding the sentence text.
+                "sentence": datasets.Sequence(datasets.Value("string")),
+                # Parallel lists with one entry per entity: the half-open
+                # character span [start, end) in the sentence and its class.
+                "start": datasets.Sequence(datasets.Value("int32")),
+                "end": datasets.Sequence(datasets.Value("int32")),
+                "ronec_class": datasets.Sequence(
+                    datasets.features.ClassLabel(
+                        names=[
+                            "O",
+                            "DATETIME",
+                            "EVENT",
+                            "FACILITY",
+                            "GPE",
+                            "LANGUAGE",
+                            "LOC",
+                            "MONEY",
+                            "NAT_REL_POL",
+                            "NUMERIC_VALUE",
+                            "ORDINAL",
+                            "ORGANIZATION",
+                            "PERIOD",
+                            "PERSON",
+                            "PRODUCT",
+                            "QUANTITY",
+                            "WORK_OF_ART",
+                        ]
+                    )
+                ),
+            }
+        )
+
+        return datasets.DatasetInfo(
+            # This is the description that will appear on the datasets page.
+            description=_DESCRIPTION,
+            # Column names and types; this builder has a single configuration.
+            features=features,
+            # No (input, target) pair is exposed for as_supervised=True.
+            supervised_keys=None,
+            # Homepage of the dataset for documentation
+            homepage=_HOMEPAGE,
+            # License for the dataset if available
+            license=_LICENSE,
+            # Citation for the dataset
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Download the train/dev/test files and return one ``SplitGenerator`` per split."""
+
+        # URLs are built by plain concatenation (``_URL`` ends with "/").
+        # os.path.join uses the platform separator, so on Windows it would
+        # insert a backslash and produce an invalid URL.
+        urls_to_download = {
+            "train": _URL + _TRAINING_FILE,
+            "dev": _URL + _DEV_FILE,
+            "test": _URL + _TEST_FILE,
+        }
+
+        downloaded_files = dl_manager.download(urls_to_download)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"filepath": downloaded_files["train"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"filepath": downloaded_files["test"]},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"filepath": downloaded_files["dev"]},
+            ),
+        ]
+
+    @staticmethod
+    def _record_entity(entity, starts, ends, labels):
+        """Append the open ``[start, end, label]`` entity (if any) to the per-sentence lists."""
+        if entity is not None:
+            starts.append(entity[0])
+            ends.append(entity[1])
+            labels.append(entity[2])
+
+    @staticmethod
+    def _build_example(guid, text, starts, ends, labels):
+        """Return the ``(key, example)`` pair for one finished sentence."""
+        return guid, {
+            "id": str(guid),
+            "sentence": [text],
+            "start": starts,
+            "end": ends,
+            "ronec_class": labels,
+        }
+
+    def _generate_examples(self, filepath):
+        """Yield ``(key, example)`` pairs parsed from one tab-separated RONEC file.
+
+        Token rows are read as tab-separated columns: column 1 is the word
+        form, column 9 is compared against ``SpaceAfter=No`` and column 10
+        holds the ``O`` / ``B-<CLASS>`` / ``I-<CLASS>`` tag. A blank line or a
+        line starting with ``#`` ends the current sentence.
+
+        Behaviour fixed with respect to the previous implementation:
+
+        * only lines that *start* with ``#`` are comments; a token row that
+          merely contains ``#`` no longer splits the sentence;
+        * an entity still open at the end of a sentence is recorded, and no
+          entity state leaks into the next sentence;
+        * an entity starting at character offset 0 is no longer dropped;
+        * a ``B-`` tag directly following another entity closes the previous
+          entity instead of overwriting it;
+        * ``start`` is the offset of the entity's first character and ``end``
+          the offset just past its last character (trailing space excluded);
+        * the last sentence is flushed with its text, and no empty example is
+          emitted when the file ends with a blank line.
+
+        NOTE(review): these fixes change the number of yielded examples, so
+        the split statistics in dataset_infos.json must be regenerated.
+
+        Args:
+            filepath: local path of the downloaded split file.
+
+        Yields:
+            ``(guid, example)`` where ``guid`` is a running integer and
+            ``example`` matches the features declared in ``_info``.
+        """
+
+        logging.info("⏳ Generating examples from = %s", filepath)
+
+        guid = 0
+        text = ""  # sentence text rebuilt from the tokens read so far
+        starts, ends, labels = [], [], []
+        entity = None  # [start, end, label] of the entity currently being read
+
+        with open(filepath, encoding="utf-8") as f:
+            for raw_line in f:
+                line = raw_line.rstrip("\r\n")
+
+                if not line.strip() or line.startswith("#"):
+                    # Sentence boundary: flush whatever has been collected.
+                    if text:
+                        self._record_entity(entity, starts, ends, labels)
+                        entity = None
+                        yield self._build_example(guid, text, starts, ends, labels)
+                        guid += 1
+                        text = ""
+                        starts, ends, labels = [], [], []
+                    continue
+
+                # ronec words are tab separated
+                columns = line.split("\t")
+                token, misc, tag = columns[1], columns[9], columns[10]
+
+                # Measure the span before and after adding the token so the
+                # separating space is never counted as part of an entity.
+                token_start = len(text)
+                text += token
+                token_end = len(text)
+                if misc != "SpaceAfter=No":
+                    text += " "
+
+                if tag.startswith("I-"):
+                    # Continuation: extend the open entity; a stray I- tag
+                    # without a preceding B- tag is ignored.
+                    if entity is not None:
+                        entity[1] = token_end
+                    continue
+
+                # Any other tag ends the entity that was open, if there was one.
+                self._record_entity(entity, starts, ends, labels)
+                entity = None
+                if tag.startswith("B-"):
+                    entity = [token_start, token_end, tag[2:]]
+
+        # Last sentence when the file does not end with a blank line.
+        if text:
+            self._record_entity(entity, starts, ends, labels)
+            yield self._build_example(guid, text, starts, ends, labels)
diff --git a/datasets/rosent/dataset_infos.json b/datasets/rosent/dataset_infos.json
@@ -0,0 +1 @@
+{"rosent": {"description": "This new dataset is a Romanian sentiment analysis dataset.\n", "citation": "", "homepage": "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "label": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "rosent_dataset", "config_name": "rosent", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8289100, "num_examples": 17941, "dataset_name": "rosent_dataset"}, "test": {"name": "test", "num_bytes": 6793523, "num_examples": 11005, "dataset_name": "rosent_dataset"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/train.csv": {"num_bytes": 8048544, "checksum": "5b5f36aba3895e75832d1f084459f23ebeec0418d55ab1fbaa015d154879ed0f"}, "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/test.csv": {"num_bytes": 6651513, "checksum": "2491ca36849e7055f1497575fa691f91671d64b7365f43a3be84ce552b6b65bd"}}, "download_size": 14700057, "post_processing_size": null, "dataset_size": 15082623, "size_in_bytes": 29782680}}
diff --git a/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip b/datasets/rosent/dummy/rosent/1.0.0/dummy_data/dummy_data.zip
diff --git a/datasets/rosent/rosent.py b/datasets/rosent/rosent.py
@@ -0,0 +1,126 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Introduction in a Romanian sentiment dataset."""
+
+from __future__ import absolute_import, division, print_function
+
+import csv
+import logging
+
+import datasets
+
+
+# Find for instance the citation on arxiv or on the dataset repo/website
+_CITATION = ""
+
+_DESCRIPTION = """\
+This new dataset is a Romanian sentiment analysis dataset.
+"""
+
+_HOMEPAGE = "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets"
+
+_LICENSE = ""
+
+_URL = (
+    "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/"
+)
+_TRAINING_FILE = "train.csv"
+_TEST_FILE = "test.csv"
+
+
+class ROSENTConfig(datasets.BuilderConfig):
+    """Configuration for the ROSENT builder; adds nothing on top of ``datasets.BuilderConfig``."""
+
+    def __init__(self, **kwargs):
+        # Every keyword argument is forwarded untouched to the base configuration.
+        super().__init__(**kwargs)
+
+
+class ROSENTDataset(datasets.GeneratorBasedBuilder):
+    """Builder for the Romanian sentiment dataset.
+
+    Downloads ``train.csv`` and ``test.csv`` from ``_URL`` and yields one
+    example per CSV row after the first row.
+
+    NOTE(review): the text of this class interleaves several revisions (added
+    and removed variants of the same statements appear back to back); confirm
+    which variant is final before relying on the comments below.
+    """
+
+    VERSION = datasets.Version("1.0.0")
+    DEFAULT_CONFIG_NAME = "rosent"
+
+    BUILDER_CONFIGS = [
+        ROSENTConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="Romanian sentiment dataset."),
+    ]
+
+    def _info(self):
+        """Return the ``DatasetInfo``: feature schema, description, homepage, license and citation."""
+
+        # NOTE(review): "sentence" and "label" are declared as sequences here, which matches the
+        # `[txt]` / `[lbl]` variant of _generate_examples; the other variant yields plain `txt` /
+        # `lbl` values - confirm the schema and the generator agree in the final revision.
+        features = datasets.Features(
+            {
+                "id": datasets.Value("string"),
+                "sentence": datasets.Sequence(datasets.Value("string")),
+                "label": datasets.Sequence(datasets.Value("int32")),
+            }
+        )
+
+        return datasets.DatasetInfo(
+            # This is the description that will appear on the datasets page.
+            description=_DESCRIPTION,
+            # This defines the different columns of the dataset and their types
+            features=features,  # built above; BUILDER_CONFIGS declares a single configuration
+            # If there's a common (input, target) tuple from the features,
+            # specify them here. They'll be used if as_supervised=True in
+            # builder.as_dataset.
+            supervised_keys=None,
-            supervised_keys=None,
+            supervised_keys=("sentence", "label"),
-            supervised_keys=None,
+            supervised_keys=("sentence", "label"),
+            # Homepage of the dataset for documentation
+            homepage=_HOMEPAGE,
+            # License for the dataset if available
+            license=_LICENSE,
+            # Citation for the dataset
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Download the train and test CSV files and return one SplitGenerator per split."""
+
+        urls_to_download_train = _URL + _TRAINING_FILE
+        urls_to_download_test = _URL + _TEST_FILE
+
+        train_path = dl_manager.download(urls_to_download_train)
+        test_path = dl_manager.download(urls_to_download_test)
-        urls_to_download_train = _URL + _TRAINING_FILE
-        urls_to_download_test = _URL + _TEST_FILE
-
-        train_path = dl_manager.download(urls_to_download_train)
-        test_path = dl_manager.download(urls_to_download_test)
+        urls = {
+            "train": _URL + _TRAINING_FILE,
+            "test": _URL + _TEST_FILE,
+        }
+        paths = dl_manager.download(urls)
-        urls_to_download_train = _URL + _TRAINING_FILE
-        urls_to_download_test = _URL + _TEST_FILE
-
-        train_path = dl_manager.download(urls_to_download_train)
-        test_path = dl_manager.download(urls_to_download_test)
+        urls = {
+            "train": _URL + _TRAINING_FILE,
+            "test": _URL + _TEST_FILE,
+        }
+        paths = dl_manager.download(urls)
+        print("FISIERE LUATE", train_path, test_path)
-        print("FISIERE LUATE", train_path, test_path)
-        print("FISIERE LUATE", train_path, test_path)
+
+        # NOTE(review): the `urls` / `paths` variant above removes `train_path` and `test_path`,
+        # yet the gen_kwargs below still read them; with that variant they presumably need to
+        # become paths["train"] / paths["test"] - confirm against the final revision.
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"filepath": train_path},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"filepath": test_path},
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yield ``(key, example)`` pairs read from the comma-separated file at ``filepath``.
+
+        The first row is discarded (presumably a header - confirm), and every remaining row must
+        have exactly three columns, unpacked as id, text and label; any other width raises
+        ``ValueError``. The key is "<row index>_<id>".
+        """
+
+        logging.info("⏳ Generating examples from = %s", filepath)
+        with open(filepath, encoding="utf-8") as f:
+            # data = pd.read_csv(filepath)
-            # data = pd.read_csv(filepath)
-            # data = pd.read_csv(filepath)
+            data = csv.reader(f, delimiter=",", quotechar='"')
+
+            # Drop the first row; the None default avoids StopIteration on an empty file.
+            next(data, None)
+            for row_id, row in enumerate(data):
+                print("ROW", row)
-                print("ROW", row)
-                print("ROW", row)
+                # NOTE(review): `id` shadows the Python builtin inside this loop.
+                id, txt, lbl = row
+                yield "{}_{}".format(row_id, id), {
+                    "id": id,
+                    "sentence": [txt],
+                    "label": [lbl],
-                    "sentence": [txt],
-                    "label": [lbl],
+                    "sentence": txt,
+                    "label": lbl,
-                    "sentence": [txt],
-                    "label": [lbl],
+                    "sentence": txt,
+                    "label": lbl,
+                }