-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Ro sent #1529
Ro sent #1529
Changes from all commits
057f0ee
800b2d8
45e2c20
60cf063
2723e2b
429f274
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"ronec": {"description": "The RONEC (Named Entity Corpus for the Romanian language) dataset contains over 26000 entities in ~5000 annotated sentence,\nbelonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition\n", "citation": "@article{dumitrescu2019introducing,\n title={Introducing RONEC--the Romanian Named Entity Corpus},\n author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},\n journal={arXiv preprint arXiv:1909.01247},\n year={2019}\n}\n", "homepage": "https://github.com/dumitrescustefan/ronec", "license": "MIT License", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "start": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "end": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "ronec_class": {"feature": {"num_classes": 17, "names": ["O", "DATETIME", "EVENT", "FACILITY", "GPE", "LANGUAGE", "LOC", "MONEY", "NAT_REL_POL", "NUMERIC_VALUE", "ORDINAL", "ORGANIZATION", "PERIOD", "PERSON", "PRODUCT", "QUANTITY", "WORK_OF_ART"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "ronec", "config_name": "ronec", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1273965, "num_examples": 4174, "dataset_name": "ronec"}, "test": {"name": "test", "num_bytes": 160118, "num_examples": 523, "dataset_name": "ronec"}, "validation": {"name": "validation", "num_bytes": 163194, "num_examples": 523, "dataset_name": "ronec"}}, "download_checksums": 
{"https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/train.conllu": {"num_bytes": 11583235, "checksum": "686b230c30866afa4b9c4b2b511062c2562e84af4d9c71035a92712cc90aba21"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/dev.conllu": {"num_bytes": 1478451, "checksum": "b99f1b7d9fef24635ee66b925e039663ef7fbe5c5a7851ffdb3efa0d9395ad41"}, "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/test.conllu": {"num_bytes": 1464643, "checksum": "7b1f71cfdbd5fc71e44051e1028369f5245afbfd7b17730b31b785bbff3f9c56"}}, "download_size": 14526329, "post_processing_size": null, "dataset_size": 1597277, "size_in_bytes": 16123606}} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
# coding=utf-8 | ||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
"""Introduction in RONEC: Named Entity Corpus for ROmanian language""" | ||
|
||
from __future__ import absolute_import, division, print_function | ||
|
||
import logging | ||
import os | ||
|
||
import datasets | ||
|
||
|
||
# BibTeX entry for the paper that introduces RONEC.
_CITATION = """\
@article{dumitrescu2019introducing,
  title={Introducing RONEC--the Romanian Named Entity Corpus},
  author={Dumitrescu, Stefan Daniel and Avram, Andrei-Marius},
  journal={arXiv preprint arXiv:1909.01247},
  year={2019}
}
"""

# Description passed to ``datasets.DatasetInfo``; it is shown on the datasets page.
_DESCRIPTION = """\
The RONEC (Named Entity Corpus for the Romanian language) dataset  contains over 26000 entities in ~5000 annotated sentence,
belonging to 16 distinct classes. It represents the first initiative in the Romanian language space specifically targeted for named entity recognition
"""

_HOMEPAGE = "https://github.com/dumitrescustefan/ronec"

_LICENSE = "MIT License"

# NOTE(review): not referenced anywhere in the visible script - confirm it is still needed.
_FILE_FORMAT = "ronec.conllup"

# The HuggingFace datasets library doesn't host the data; it only points to the original files.
# `_URL` is the base location, and the three file names below are appended to it in `_split_generators`.
_URL = "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/ronec/conllup/raw/"
_TRAINING_FILE = "train.conllu"
_TEST_FILE = "test.conllu"
_DEV_FILE = "dev.conllu"
|
||
|
||
class RONECConfig(datasets.BuilderConfig):
    """Builder configuration for the RONEC dataset.

    Adds no options of its own; it only forwards what it receives to the base class.
    """

    def __init__(self, **kwargs):
        # Every keyword argument is handed unchanged to ``datasets.BuilderConfig``.
        super().__init__(**kwargs)
|
||
|
||
class RONEC(datasets.GeneratorBasedBuilder):
    """Dataset builder for RONEC, the Romanian Named Entity Corpus.

    Downloads the train/dev/test files and yields one example per sentence:
    the reconstructed sentence text plus, for every annotated entity, its
    character span inside that text and its class.
    """

    VERSION = datasets.Version("1.0.0")
    DEFAULT_CONFIG_NAME = "ronec"

    BUILDER_CONFIGS = [
        RONECConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="RONEC dataset"),
    ]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing columns and metadata."""
        features = datasets.Features(
            {
                "id": datasets.Value("string"),
                # One-element list holding the sentence text rebuilt from its tokens.
                "sentence": datasets.Sequence(datasets.Value("string")),
                # Parallel lists, one entry per entity found in the sentence.
                "start": datasets.Sequence(datasets.Value("int32")),
                "end": datasets.Sequence(datasets.Value("int32")),
                "ronec_class": datasets.Sequence(
                    datasets.features.ClassLabel(
                        names=[
                            "O",
                            "DATETIME",
                            "EVENT",
                            "FACILITY",
                            "GPE",
                            "LANGUAGE",
                            "LOC",
                            "MONEY",
                            "NAT_REL_POL",
                            "NUMERIC_VALUE",
                            "ORDINAL",
                            "ORGANIZATION",
                            "PERIOD",
                            "PERSON",
                            "PRODUCT",
                            "QUANTITY",
                            "WORK_OF_ART",
                        ]
                    )
                ),
            }
        )

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # Columns of the dataset and their types.
            features=features,
            # No single (input, target) pair applies here, so as_supervised is not supported.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the three split files and return their SplitGenerators."""
        # URLs are built by plain concatenation (``_URL`` already ends with "/");
        # ``os.path.join`` is meant for filesystem paths, not URLs.
        urls_to_download = {
            "train": _URL + _TRAINING_FILE,
            "dev": _URL + _DEV_FILE,
            "test": _URL + _TEST_FILE,
        }

        downloaded_files = dl_manager.download(urls_to_download)

        # The ``gen_kwargs`` of each split are passed to ``_generate_examples``.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": downloaded_files["test"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": downloaded_files["dev"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs, one per sentence found in ``filepath``.

        Lines starting with "#" and blank lines separate sentences. Every other
        line is a tab-separated token row from which the word form (column 1),
        the "SpaceAfter=No" marker (column 9) and the entity tag (column 10,
        "O", "B-<CLASS>" or "I-<CLASS>") are read.

        NOTE(review): compared with the previous implementation this no longer
        splits a sentence on a token containing "#", no longer shifts spans by
        the trailing space, no longer drops entities at offset 0 or at the end
        of a sentence, and no longer emits a trailing empty example. The split
        sizes recorded in dataset_infos.json must be regenerated.
        """
        logging.info("⏳ Generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            tokens = []  # (form, space_after, tag) rows of the sentence being read
            for line in f:
                if line.startswith("#") or not line.strip():
                    # Sentence boundary: emit whatever has been collected so far.
                    if tokens:
                        yield guid, self._build_example(guid, tokens)
                        guid += 1
                        tokens = []
                    continue
                # ronec words are tab separated
                fields = line.rstrip("\n").split("\t")
                tokens.append((fields[1], fields[9] != "SpaceAfter=No", fields[10]))

            # The file may end without a closing blank line.
            if tokens:
                yield guid, self._build_example(guid, tokens)

    @staticmethod
    def _build_example(guid, tokens):
        """Turn the ``(form, space_after, tag)`` rows of one sentence into an example.

        ``start``/``end`` are character offsets into the rebuilt sentence text,
        with ``end`` exclusive, so ``text[start:end]`` is the entity surface form.
        """
        pieces = []
        offset = 0
        spans = []  # [label, start, end] per entity, in order of appearance
        inside_entity = False
        for form, space_after, tag in tokens:
            # Measure the token span before any separating space is added.
            token_start = offset
            token_end = token_start + len(form)
            pieces.append(form + " " if space_after else form)
            offset = token_end + (1 if space_after else 0)

            if tag.startswith("B-"):
                spans.append([tag[2:], token_start, token_end])
                inside_entity = True
            elif tag.startswith("I-") and inside_entity:
                # Continuation: stretch the open entity up to this token.
                spans[-1][2] = token_end
            else:
                inside_entity = False

        return {
            "id": str(guid),
            "sentence": ["".join(pieces)],
            "start": [span[1] for span in spans],
            "end": [span[2] for span in spans],
            "ronec_class": [span[0] for span in spans],
        }
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"rosent": {"description": "This new dataset is a Romanian sentiment analysis dataset.\n", "citation": "", "homepage": "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "sentence": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "label": {"feature": {"dtype": "int32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "rosent_dataset", "config_name": "rosent", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8289100, "num_examples": 17941, "dataset_name": "rosent_dataset"}, "test": {"name": "test", "num_bytes": 6793523, "num_examples": 11005, "dataset_name": "rosent_dataset"}}, "download_checksums": {"https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/train.csv": {"num_bytes": 8048544, "checksum": "5b5f36aba3895e75832d1f084459f23ebeec0418d55ab1fbaa015d154879ed0f"}, "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/test.csv": {"num_bytes": 6651513, "checksum": "2491ca36849e7055f1497575fa691f91671d64b7365f43a3be84ce552b6b65bd"}}, "download_size": 14700057, "post_processing_size": null, "dataset_size": 15082623, "size_in_bytes": 29782680}} |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,126 @@ | ||||||||||||||||||||||
# coding=utf-8 | ||||||||||||||||||||||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. | ||||||||||||||||||||||
# | ||||||||||||||||||||||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||||||||||||||||||||||
# you may not use this file except in compliance with the License. | ||||||||||||||||||||||
# You may obtain a copy of the License at | ||||||||||||||||||||||
# | ||||||||||||||||||||||
# http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||||||||||||
# | ||||||||||||||||||||||
# Unless required by applicable law or agreed to in writing, software | ||||||||||||||||||||||
# distributed under the License is distributed on an "AS IS" BASIS, | ||||||||||||||||||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||||||||||||||||||
# See the License for the specific language governing permissions and | ||||||||||||||||||||||
# limitations under the License. | ||||||||||||||||||||||
"""Introduction in a Romanian sentiment dataset.""" | ||||||||||||||||||||||
|
||||||||||||||||||||||
from __future__ import absolute_import, division, print_function | ||||||||||||||||||||||
|
||||||||||||||||||||||
import csv | ||||||||||||||||||||||
import logging | ||||||||||||||||||||||
|
||||||||||||||||||||||
import datasets | ||||||||||||||||||||||
|
||||||||||||||||||||||
|
||||||||||||||||||||||
# No citation is provided for this dataset.
_CITATION = ""

# Description passed to ``datasets.DatasetInfo``; it is shown on the datasets page.
_DESCRIPTION = """\
This new dataset is a Romanian sentiment analysis dataset.
"""

_HOMEPAGE = "https://github.com/katakonst/sentiment-analysis-tensorflow/tree/master/datasets"

# No license is recorded for this dataset.
_LICENSE = ""

# Base location of the CSV files; the two file names below are appended to it in `_split_generators`.
_URL = (
    "https://raw.githubusercontent.com/dumitrescustefan/Romanian-Transformers/examples/examples/sentiment_analysis/ro/"
)
_TRAINING_FILE = "train.csv"
_TEST_FILE = "test.csv"
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class ROSENTConfig(datasets.BuilderConfig):
    """Builder configuration for the ROSENT dataset.

    Adds no options of its own; it only forwards what it receives to the base class.
    """

    def __init__(self, **kwargs):
        # Every keyword argument is handed unchanged to ``datasets.BuilderConfig``.
        super().__init__(**kwargs)
|
||||||||||||||||||||||
|
||||||||||||||||||||||
class ROSENTDataset(datasets.GeneratorBasedBuilder):
    """Romanian sentiment dataset.

    Downloads the train and test CSV files and yields one example per row.
    """

    VERSION = datasets.Version("1.0.0")
    DEFAULT_CONFIG_NAME = "rosent"

    BUILDER_CONFIGS = [
        ROSENTConfig(name=DEFAULT_CONFIG_NAME, version=VERSION, description="Romanian sentiment dataset."),
    ]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing columns and metadata."""
        # NOTE(review): reviewers pointed out that each id has a single sentence
        # and label, so these need not be Sequences, and that "label" could be a
        # ClassLabel with meaningful names. The schema is left unchanged here
        # because it is what the accompanying dataset_infos.json records; switch
        # both together (and drop the list wrapping in `_generate_examples`).
        features = datasets.Features(
            {
                "id": datasets.Value("string"),
                "sentence": datasets.Sequence(datasets.Value("string")),
                "label": datasets.Sequence(datasets.Value("int32")),
            }
        )

        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # Columns of the dataset and their types.
            features=features,
            # NOTE(review): a reviewer suggested declaring supervised keys here.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available
            license=_LICENSE,
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the train and test files and return their SplitGenerators."""
        # A single call with a dict lets the download manager handle both files
        # together and returns a dict with the same keys.
        urls_to_download = {
            "train": _URL + _TRAINING_FILE,
            "test": _URL + _TEST_FILE,
        }
        downloaded_files = dl_manager.download(urls_to_download)

        # The ``gen_kwargs`` of each split are passed to ``_generate_examples``.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded_files["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": downloaded_files["test"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs, one per data row of the CSV at ``filepath``.

        Each row is expected to hold exactly three fields: id, text and label;
        any other length raises ``ValueError`` on unpacking.
        """
        logging.info("⏳ Generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            reader = csv.reader(f, delimiter=",", quotechar='"')

            # Skip the header row.
            # NOTE(review): a reviewer suggested csv.DictReader, which handles the header itself.
            next(reader, None)
            for row_id, (sample_id, text, label) in enumerate(reader):
                # The row index is part of the key, so keys stay unique even if ids repeat.
                yield "{}_{}".format(row_id, sample_id), {
                    "id": sample_id,
                    "sentence": [text],
                    "label": [label],
                }
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like this could be a `ClassLabel` feature.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was thinking about "label": datasets.Sequence(datasets.Value("int32")), as being 0 or 1 for positive or negative sentiment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ClassLabel is useful to add names to integer (often binary) values.
Here you can use, for example, `datasets.ClassLabel(names=["negative", "positive"])`, where `negative` and `positive` are meaningful labels for sentiment analysis.
In practice the values that are stored will still be 0 and 1, but we'll know their string representation as well.