From 07e2cbbd057f0b8460f7505486bee4f8ced33742 Mon Sep 17 00:00:00 2001 From: vasily Date: Mon, 18 Jul 2022 16:45:37 +0300 Subject: [PATCH 01/57] Add mrpc binary head config --- .../classifiers/glue/glue_mrpc_binary.json | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json diff --git a/deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json b/deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json new file mode 100644 index 0000000000..e676860f98 --- /dev/null +++ b/deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json @@ -0,0 +1,81 @@ +{ + "dataset_reader": { + "class_name": "huggingface_dataset_reader", + "path": "glue", + "name": "mrpc", + "train": "train", + "valid": "validation", + "test": "test" + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": ["sentence1", "sentence2"], + "label": "label", + "use_label_name": false, + "seed": 42 + }, + "chainer": { + "in": ["sentence1", "sentence2"], + "in_y": ["y_ids"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 100, + "in": ["sentence1", "sentence2"], + "out": ["bert_features"] + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": 2, + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "is_binary": "{BINARY_CLASSIFICATION}", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "is_binary": "{BINARY_CLASSIFICATION}", + "confidence_threshold": 0.5 + } + ], + "out": ["y_pred_ids"] + }, + "train": { + "batch_size": 10, + "metrics": [ + "f1", + "accuracy" + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODEL_PATH": "/tmp/glue_mrpc_binary", + "BASE_MODEL": "distilbert-base-uncased", + "BINARY_CLASSIFICATION": true + } + } +} From 43f8a9a914eb8130b76bcf149475bcd0b19e27d4 Mon Sep 17 00:00:00 2001 From: vasily Date: Mon, 18 Jul 2022 16:45:55 +0300 Subject: [PATCH 02/57] Fix binary head --- .../models/torch_bert/torch_transformers_classifier.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/deeppavlov/models/torch_bert/torch_transformers_classifier.py b/deeppavlov/models/torch_bert/torch_transformers_classifier.py index 064908d7f5..ac1faa4455 100644 --- a/deeppavlov/models/torch_bert/torch_transformers_classifier.py +++ b/deeppavlov/models/torch_bert/torch_transformers_classifier.py @@ -270,7 +270,8 @@ def __init__(self, pretrained_bert, config): self.pretrained_bert = pretrained_bert self.config = config - self.model = AutoModel.from_pretrained(self.pretrained_bert, self.config) +# self.model = AutoModel.from_pretrained(self.pretrained_bert, self.config) + self.model = AutoModel.from_pretrained(self.pretrained_bert) self.classifier = BinaryClassificationHead(config) self.classifier.init_weights() @@ -291,8 +292,8 @@ def 
forward(self, outputs = self.model(input_ids, attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, + #token_type_ids=token_type_ids, + #position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, @@ -323,7 +324,8 @@ def __init__(self, config): self.config = config self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) + #self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) + self.dropout = torch.nn.Dropout(0.1) self.out_proj = torch.nn.Linear(config.hidden_size, 1) def init_weights(self): From 612c053b127fb73aa9616130fd1de0594904e625 Mon Sep 17 00:00:00 2001 From: nastyachizhikova Date: Tue, 23 Aug 2022 16:13:20 +0300 Subject: [PATCH 03/57] add few-shot infer support --- deeppavlov/core/common/registry.json | 2 + .../dataset_iterators/few_shot_iterator.py | 80 ++++++++++++++ .../classifiers/proba2labels_few_shot.py | 103 ++++++++++++++++++ few_shot_infer.json | 90 +++++++++++++++ 4 files changed, 275 insertions(+) create mode 100644 deeppavlov/dataset_iterators/few_shot_iterator.py create mode 100644 deeppavlov/models/classifiers/proba2labels_few_shot.py create mode 100644 few_shot_infer.json diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 42f0df484e..b344c36da8 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -15,6 +15,7 @@ "entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser", "entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker", "faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader", + "few_shot_iterator": "deeppavlov.dataset_iterators.few_shot_iterator:FewShotIterator", "fasttext": "deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder", "fit_trainer": "deeppavlov.core.trainers.fit_trainer:FitTrainer", "hashing_tfidf_vectorizer": "deeppavlov.models.vectorizers.hashing_tfidf_vectorizer:HashingTfIdfVectorizer", @@ -40,6 +41,7 @@ "paraphraser_reader": "deeppavlov.dataset_readers.paraphraser_reader:ParaphraserReader", "pop_ranker": "deeppavlov.models.doc_retrieval.pop_ranker:PopRanker", "proba2labels": "deeppavlov.models.classifiers.proba2labels:Proba2Labels", + "proba2labels_few_shot": "deeppavlov.models.classifiers.proba2labels_few_shot:Proba2LabelsFewShot", "query_generator": "deeppavlov.models.kbqa.query_generator:QueryGenerator", "question_sign_checker": "deeppavlov.models.entity_extraction.entity_detection_parser:question_sign_checker", "re_classifier": "deeppavlov.models.relation_extraction.relation_extraction_bert:REBertModel", diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py new file mode 100644 index 0000000000..a01700c872 --- /dev/null +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -0,0 +1,80 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+from typing import Dict, Any, List, Tuple, Generator, Optional
+
+import numpy as np
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
+
+
+@register('few_shot_iterator')
+class FewShotIterator(DataLearningIterator):
+    """Dataset iterator for few-shot intent classification.
+
+    For every example of the requested data split, ``gen_batches`` yields a single
+    batch that pairs the example with each training (support) example. Every batch
+    element is a ``(train_text, test_text, train_label)`` triplet with the test
+    example's label as the target, so a pairwise (NLI-style) binary classifier can
+    score the example against the whole support set. Multi-label annotations are
+    reduced to their first label; ``batch_size`` and ``shuffle`` are ignored.
+
+    Args:
+        data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and lists of
+            ``(text, label)`` pairs as values
+
+    Attributes:
+        data: dict with the ``'train'``, ``'valid'`` and ``'test'`` splits
+    """
+
+    def __init__(self, data, *args, **kwargs) -> None:
+
+        self.train = data.get('train', [])
+        self.valid = data.get('valid', [])
+        self.test = data.get('test', [])
+
+        self.data = {
+            'train': self.train,
+            'valid': self.valid,
+            'test': self.test,
+        }
+
+
+    def gen_batches(self, batch_size: int, data_type: str, shuffle: bool):
+        train_examples = self.data['train']
+        test_examples = self.data[data_type]
+
+        for test_example, test_cat in test_examples:
+            batch = []
+            for train_example, train_cat in train_examples:
+
+                if isinstance(train_cat, list) or isinstance(train_cat, tuple):
+                    train_cat = train_cat[0]
+
+                if isinstance(test_cat, list) or isinstance(test_cat, tuple):
+                    test_cat = test_cat[0]
+
+                batch.append(((train_example, test_example, train_cat), test_cat))
+
+            if batch:
+                yield tuple(zip(*batch))
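To make the pairing concrete, here is a toy walk-through (a sketch, not part of the patch; the import path assumes the module location this patch adds):

```python
# Toy illustration of FewShotIterator.gen_batches: one batch per test example,
# pairing it with every train example (labels unwrapped from one-element lists).
from deeppavlov.dataset_iterators.few_shot_iterator import FewShotIterator

data = {
    'train': [("hi there", ["greet"]), ("see you", ["goodbye"])],
    'valid': [],
    'test': [("hello", ["greet"])],
}
iterator = FewShotIterator(data)
for x, y in iterator.gen_batches(batch_size=1, data_type='test', shuffle=False):
    print(x)  # (('hi there', 'hello', 'greet'), ('see you', 'hello', 'goodbye'))
    print(y)  # ('greet', 'greet')
```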
diff --git a/deeppavlov/models/classifiers/proba2labels_few_shot.py b/deeppavlov/models/classifiers/proba2labels_few_shot.py
new file mode 100644
index 0000000000..6cd1c13b70
--- /dev/null
+++ b/deeppavlov/models/classifiers/proba2labels_few_shot.py
@@ -0,0 +1,103 @@
+# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from logging import getLogger
+from typing import List, Union
+
+import numpy as np
+import torch
+
+from deeppavlov.core.common.errors import ConfigError
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.models.component import Component
+
+log = getLogger(__name__)
+
+
+@register('proba2labels_few_shot')
+class Proba2LabelsFewShot(Component):
+    """
+    Converts pairwise similarity probabilities into class labels for few-shot inference.
+
+    Probabilities are pooled per class over all support examples of that class
+    (``'mean'`` or ``'max'`` pooling); the class with the highest pooled probability
+    is returned if it exceeds ``confidence_threshold``, otherwise ``'oos'``
+    (out of scope) is predicted.
+
+    Args:
+        max_proba: whether to choose the label with maximal probability (accepted
+            for interface compatibility, currently unused)
+        confidence_threshold: minimal pooled probability required to predict an
+            in-scope class
+        top_n: how many top labels with the highest probabilities to return
+            (currently unused)
+        is_binary: whether the upstream classifier uses a binary (sigmoid) head
+        pooling: how to pool probabilities over the support examples of one class,
+            ``'mean'`` or ``'max'``
+    """
+
+    def __init__(self,
+                 max_proba: bool = None,
+                 confidence_threshold: float = None,
+                 top_n: int = None,
+                 is_binary: bool = False,
+                 pooling: str = 'mean',
+                 **kwargs) -> None:
+        """ Initialize class with given parameters"""
+
+        self.confidence_threshold = confidence_threshold
+        self.is_binary = is_binary
+        self.pooling = pooling
+
+    def __call__(self,
+                 data: Union[np.ndarray,
+                             List[List[float]],
+                             List[List[int]]],
+                 train_cat: List[str],
+                 test_cat: List[str],
+                 *args,
+                 **kwargs):
+        """
+        Pool pairwise probabilities per class and convert them to a label
+
+        Args:
+            data: vector of pairwise similarity probabilities
+            train_cat: labels of the support examples
+            test_cat: gold labels of the test example
+
+        Returns:
+            predicted label and the gold label, each wrapped in a list
+        """
+        probas_by_class = []
+
+        unique_cats = list(sorted(set(train_cat)))
+        train_cat = np.array(train_cat)
+
+        for cat in unique_cats:
+            ind_mask = np.where(train_cat == cat)
+
+            if self.pooling == 'mean':
+                class_proba = np.mean(data[ind_mask])
+            elif self.pooling == 'max':
+                class_proba = np.max(data[ind_mask])
+
+            probas_by_class.append(class_proba)
+
+        if self.confidence_threshold:
+            max_conf = np.max(probas_by_class)
+
+            if max_conf > self.confidence_threshold:
+                pred_id = np.argmax(probas_by_class)
+                y_pred = unique_cats[pred_id]
+            else:
+                y_pred = 'oos'
+
+        return [[y_pred], [test_cat[0]]]
\ No newline at end of file
diff --git a/few_shot_infer.json b/few_shot_infer.json
new file mode 100644
index 0000000000..a4154d752c
--- /dev/null
+++ b/few_shot_infer.json
@@ -0,0 +1,90 @@
+{
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "class_sep": ",",
+    "x": "text",
+    "y": "category",
+    "data_path": "./clinc_5 copy/",
+    "train": "train.csv",
+    "valid": "valid.csv",
+    "test": "test.csv"
+  },
+  "dataset_iterator": {
+    "class_name": "few_shot_iterator"
+  },
+  "chainer": {
+    "in": ["hypothesis", "premise", "train_cat"],
+    "in_y": ["test_cat"],
+    "pipe": [
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "vocab_file": "{BASE_MODEL}",
+        "do_lower_case": false,
+        "max_seq_length": 128,
+        "in": ["hypothesis", "premise"],
+        "out": ["bert_features"]
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "n_classes": 2,
+        "return_probas": true,
+        "pretrained_bert": "{BASE_MODEL}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "is_binary": "{BINARY_CLASSIFICATION}",
+        "optimizer": "AdamW",
+        "optimizer_parameters": {
+          "lr": 2e-05
+        },
+        "learning_rate_drop_patience": 3,
"learning_rate_drop_div": 1.5, + "in": ["bert_features"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas", "train_cat", "test_cat"], + "out": ["y_pred_cat", "test_cat"], + "class_name": "proba2labels_few_shot", + "is_binary": "{BINARY_CLASSIFICATION}", + "confidence_threshold": 0.8 + } + ], + "out": ["y_pred_cat"] + }, + "train": { + "batch_size": 64, + "metrics": [ + { + "name": "accuracy", + "inputs": [ + "test_cat", + "y_pred_cat" + ], + "exclude_oos": true + }, + { + "name": "oos_scores", + "inputs": [ + "test_cat", + "y_pred_cat"] + } + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODEL_PATH": "./glue_nli_binary/", + "BASE_MODEL": "roberta-base", + "BINARY_CLASSIFICATION": true + } + } +} From 9b49988944501831f558a4bb5e3b2c29b12757e7 Mon Sep 17 00:00:00 2001 From: nastyachizhikova Date: Tue, 23 Aug 2022 16:22:05 +0300 Subject: [PATCH 04/57] add few-shot metrics --- deeppavlov/core/common/metrics_registry.json | 1 + deeppavlov/metrics/accuracy.py | 11 ++++++++++- deeppavlov/metrics/roc_auc_score.py | 17 +++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json index c1f1a6c7a0..49b88dc61d 100644 --- a/deeppavlov/core/common/metrics_registry.json +++ b/deeppavlov/core/common/metrics_registry.json @@ -19,6 +19,7 @@ "multitask_token_accuracy": "deeppavlov.metrics.accuracy:multitask_token_accuracy", "ner_f1": "deeppavlov.metrics.fmeasure:ner_f1", "ner_token_f1": "deeppavlov.metrics.fmeasure:ner_token_f1", + "oos_scores": "deeppavlov.metrics.roc_auc_score:oos_scores", "pearson_correlation": "deeppavlov.metrics.correlation:pearson_correlation", "per_item_bleu": "deeppavlov.metrics.bleu:per_item_bleu", "per_item_dialog_accuracy": "deeppavlov.metrics.accuracy:per_item_dialog_accuracy", diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py index 560d92ee71..5d08bed5a7 100644 --- a/deeppavlov/metrics/accuracy.py +++ b/deeppavlov/metrics/accuracy.py @@ -22,7 +22,7 @@ @register_metric('accuracy') -def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float: +def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray], exclude_oos: bool = False) -> float: """ Calculate accuracy in terms of absolute coincidence @@ -33,6 +33,15 @@ def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> flo Returns: fraction of absolutely coincidental samples """ + if exclude_oos: + y_true = np.array(y_true) + y_predicted = np.array(y_predicted) + + ind_mask = np.where(y_true == 'oos') + + y_true = np.delete(y_true, ind_mask, 0) + y_predicted = np.delete(y_predicted, ind_mask, 0) + examples_len = len(y_true) # if y1 and y2 are both arrays, == can be erroneously interpreted as element-wise equality diff --git a/deeppavlov/metrics/roc_auc_score.py b/deeppavlov/metrics/roc_auc_score.py index d19c61a8b6..911f0fec0d 100644 --- a/deeppavlov/metrics/roc_auc_score.py +++ b/deeppavlov/metrics/roc_auc_score.py @@ -39,3 +39,20 @@ def roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray], np.squeeze(np.array(y_pred)), average="macro") except ValueError: return 0. 
+ +@register_metric('oos_scores') +def oos_scores(y_true: Union[List[List[float]], List[List[int]], np.ndarray], + y_pred: Union[List[List[float]], List[List[int]], np.ndarray]) -> float: + """ + Args: + y_true: true binary labels + y_pred: target scores, can either be probability estimates of the positive class + + Returns: + """ + try: + report = sklearn.metrics.classification_report(np.squeeze(np.array(y_true)), + np.squeeze(np.array(y_pred)), output_dict=True) + return report['oos'] + except ValueError: + return 0. \ No newline at end of file From 40f941362d04335ce16cd73fb172df9c7f26a311 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 21 Sep 2022 11:49:03 +0300 Subject: [PATCH 05/57] dnnc infer eval --- .../configs/classifiers/dnnc_test_2.json | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 deeppavlov/configs/classifiers/dnnc_test_2.json diff --git a/deeppavlov/configs/classifiers/dnnc_test_2.json b/deeppavlov/configs/classifiers/dnnc_test_2.json new file mode 100644 index 0000000000..0b675fb40d --- /dev/null +++ b/deeppavlov/configs/classifiers/dnnc_test_2.json @@ -0,0 +1,80 @@ +{ + "dataset_reader": { + "class_name": "basic_classification_reader", + "class_sep": ",", + "x": "text", + "y": "category", + "data_path": "/home/nchizhikova/binary/DeepPavlov/clinc_5 copy", + "train": "train.csv", + "valid": "valid.csv", + "test": "test.csv" + }, + "dataset_iterator": { + "class_name": "few_shot_iterator" + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "dnnc_input_preprocessor", + "in": ["x"], + "out": ["infer_support_texts", "x_populated", "infer_support_labels"], + "x": "text", + "y": "category", + "class_sep": ",", + "support_dataset_path": "/archive/savkin/parsed_datasets/clinc/clinc_5/train.csv" + }, + { + "config_path": "{CONFIGS_PATH}/classifiers/roberta_nli.json", + "in": ["infer_support_texts", "x_populated"], + "out": ["simmilarity_scores"] + }, + { + "class_name": "proba2labels_few_shot", + "in": ["simmilarity_scores", "x", "x_populated", "infer_support_labels", "y"], + "out": ["predicted_labels"], + "is_binary": "{BINARY_CLASSIFICATION}", + "confidence_threshold": 0.8 + } + ], + "out": ["predicted_labels"] + }, + "train": { + "batch_size": 1, + "metrics": [ + { + "name": "accuracy", + "inputs": [ + "y", + "predicted_labels" + ], + "exclude_oos": true + }, + { + "name": "oos_scores", + "inputs": [ + "y", + "predicted_labels"] + } + ], + "validation_patience": 0, + "val_every_n_epochs": 0, + "log_every_n_epochs": 0, + "show_examples": false, + "evaluation_targets": ["valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", + "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", + "BINARY_CLASSIFICATION": true + } + } + } + From ee36fe6ba8e887a95d4aa603f16320ea91ab9d2e Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 21 Sep 2022 12:06:10 +0300 Subject: [PATCH 06/57] add preprocessor --- .../configs/classifiers/roberta_nli.json | 55 +++++ deeppavlov/core/common/registry.json | 5 + .../models/preprocessors/dnnc_preprocessor.py | 203 ++++++++++++++++++ 3 files changed, 263 insertions(+) create mode 100644 deeppavlov/configs/classifiers/roberta_nli.json create mode 100644 deeppavlov/models/preprocessors/dnnc_preprocessor.py diff --git a/deeppavlov/configs/classifiers/roberta_nli.json 
b/deeppavlov/configs/classifiers/roberta_nli.json new file mode 100644 index 0000000000..7f0b7c554a --- /dev/null +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -0,0 +1,55 @@ +{ + "chainer": { + "in": ["hypothesis", "premise"], + "in_y": ["binary_label"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "in": ["hypothesis", "premise"], + "out": ["bert_features"], + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128 + }, + { + "class_name": "torch_transformers_classifier", + "in": ["bert_features"], + "in_y": ["binary_label"], + "out": ["simmilarity_scores"], + "n_classes": 2, + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "is_binary": "{BINARY_CLASSIFICATION}", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 3e-04, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-08 + }, + "learning_rate_drop_patience": 24, + "learning_rate_drop_div": 2, + "min_learning_rate": 1e-5 + } + ], + "out": ["simmilarity_scores"] + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", + "BASE_MODEL": "roberta-base", + "BINARY_CLASSIFICATION": true + } + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/few_shot_infer.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] +} \ No newline at end of file diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index b344c36da8..582db213fc 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -12,6 +12,11 @@ "dirty_comments_preprocessor": "deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor", "docred_reader": "deeppavlov.dataset_readers.docred_reader:DocREDDatasetReader", "document_chunker": "deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker", + "dnnc_input_preprocessor": "deeppavlov.models.preprocessors.dnnc_preprocessor:InputPreprocessor", + "dnnc_labels_to_binary": "deeppavlov.models.preprocessors.dnnc_preprocessor:Labels2Binary", + "dnnc_pair_maker": "deeppavlov.models.preprocessors.dnnc_preprocessor:PairMaker", + "dnnc_printer": "deeppavlov.models.preprocessors.dnnc_preprocessor:Printer", + "dnnc_support_dataset_loader": "deeppavlov.models.preprocessors.dnnc_preprocessor:SupportDatasetLoader", "entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser", "entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker", "faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader", diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py new file mode 100644 index 0000000000..00b276ee0e --- /dev/null +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -0,0 +1,203 @@ +from pathlib import Path +import pandas as pd +from logging import exception, getLogger +from typing import List +import numpy as np + +from deeppavlov.core.common.errors import ConfigError +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.component import Component + + +log = getLogger(__name__) + +@register('dnnc_input_preprocessor') +class InputPreprocessor(Component): + def __init__(self, + support_dataset_path: str, + format: str = "csv", + class_sep: str = 
",", + *args, **kwargs) -> None: + file = Path(support_dataset_path) + if file.exists(): + if format == 'csv': + keys = ('sep', 'header', 'names') + options = {k: kwargs[k] for k in keys if k in kwargs} + df = pd.read_csv(file, **options) + elif format == 'json': + keys = ('orient', 'lines') + options = {k: kwargs[k] for k in keys if k in kwargs} + df = pd.read_json(file, **options) + else: + raise Exception('Unsupported file format: {}'.format(format)) + + x = kwargs.get("x", "text") + y = kwargs.get('y', 'labels') + + multi_label = lambda labels: class_sep in str(labels) + self.support_dataset = [[row[x], str(row[y])] for _, row in df.iterrows() if not multi_label(row[y])] + else: + log.warning("Cannot find {} file".format(support_dataset_path)) + + def __call__(self, + input_texts : List[str]) -> List[List[str]]: + # TODO: перепиши на numpy + hypotesis_batch = [] + premise_batch = [] + hypotesis_labels_batch = [] + + for [premise, [hypotesis, hypotesis_labels]] in zip(input_texts * len(self.support_dataset), + np.repeat(self.support_dataset, len(input_texts), axis=0)): + premise_batch.append(premise) + hypotesis_batch.append(hypotesis) + hypotesis_labels_batch.append(hypotesis_labels) + + return hypotesis_batch, premise_batch, hypotesis_labels_batch + + + +# @register('test42') +# class Test42(Component): +# def __init__(self, *args, **kwargs) -> None: +# pass + +# def __call__(self, input): +# return input[:2] + + +# @register('dnnc_input_preprocessor') +# class PairMaker(Component): # TODO: а тут точно нужен component а не что-то ещё ? +# def __init__(self, *args, **kwargs) -> None: +# pass +# def __call__(self, input_texts, support_texts, support_labels): + +# pairs = [] +# for x, y, z in zip(input_texts, support_texts, support_labels): +# for i, j in zip(y, z): +# pairs.append([x, i, j]) +# pairs = np.array(pairs) +# print() +# print("PAIRS SHAPE = ", pairs.shape) +# pairs = pairs.T.tolist() +# transformed_input_texts = pairs[0] +# transformed_input_texts = pairs[1] +# transformed_support_labels = pairs[2] + +# print(f"PAIRS: transformed_inputs_batch = {transformed_input_texts[:3]}, len = {len(transformed_input_texts)}") +# print(f"PAIRS: transformed_support_batch = {transformed_input_texts[:3]}, len = {len(transformed_input_texts)}") +# print(f"PAIRS: transformed_support_labels = {transformed_support_labels[:3]}, len = {len(transformed_support_labels)}") +# return transformed_input_texts, transformed_input_texts, transformed_support_labels + +@register('dnnc_pair_maker') +class PairMaker(Component): # TODO: а тут точно нужен component а не что-то ещё ? 
+ def __init__(self, *args, **kwargs) -> None: + pass + def __call__(self, input_batch, support_batch): + + pairs = [] + for x, y in zip(input_batch, support_batch): + for i in y: + pairs.append([x, i]) + pairs = np.array(pairs) + print() + print("PAIRS SHAPE = ", pairs.shape) + pairs = pairs.T.tolist() + transformed_inputs_batch = pairs[0] + transformed_support_batch = pairs[1] + + print(f"PAIRS: transformed_inputs_batch = {transformed_inputs_batch[:3]}, len = {len(transformed_inputs_batch)}") + print(f"PAIRS: transformed_support_batch = {transformed_support_batch[:3]}, len = {len(transformed_support_batch)}") + return transformed_inputs_batch, transformed_support_batch + +@register('dnnc_printer') +class Printer(Component): + def __init__(self, *args, **kwargs) -> None: + pass + + def __call__(self, *args, **kwargs): + print("________________PRINTER__________") + print(**kwargs) + print("_________________________________") + +@register('dnnc_labels_to_binary') +class Labels2Binary(Component): + def __init__(self, *args, **kwargs) -> None: + pass + + def __call__(self, true_labels, support_labels): + return (true_labels == support_labels) + +@register('dnnc_support_dataset_loader') +class SupportDatasetLoader(Component): + def __init__(self, + path: str, + format: str = "csv", + class_sep: str = ",", + *args, **kwargs) -> None: + file = Path(path) + if file.exists(): + if format == 'csv': + keys = ('sep', 'header', 'names') + options = {k: kwargs[k] for k in keys if k in kwargs} + df = pd.read_csv(file, **options) + elif format == 'json': + keys = ('orient', 'lines') + options = {k: kwargs[k] for k in keys if k in kwargs} + df = pd.read_json(file, **options) + else: + raise Exception('Unsupported file format: {}'.format(format)) + + x = kwargs.get("x", "text") + y = kwargs.get('y', 'labels') + + multi_label = lambda labels: class_sep in str(labels) + support_dataset = [[row[x], str(row[y])] for _, row in df.iterrows() if not multi_label(row[y])] + support_dataset = np.array(support_dataset) + + self.infer_support_texts = support_dataset[:, 0] + self.infer_support_labels = support_dataset[:, 1] + + else: + log.warning("Cannot find {} file".format(path)) + + def __call__(self, + input_texts : List[str]) -> List[List[str]]: + input_size = len(input_texts) + + print("input_size = ", input_size) + print("self.infer_support_texts = ", self.infer_support_texts.size) + print("self.infer_support_labels = ", self.infer_support_labels.size) + return ([self.infer_support_texts] * input_size, [self.infer_support_labels] * input_size) + + +# @register('dnnc_support_dataset_loader') +# class SupportDatasetLoader(Component): +# def __init__(self, +# path: str, +# format: str = "csv", +# class_sep: str = ",", +# *args, **kwargs) -> None: +# file = Path(path) +# if file.exists(): +# if format == 'csv': +# keys = ('sep', 'header', 'names') +# options = {k: kwargs[k] for k in keys if k in kwargs} +# df = pd.read_csv(file, **options) +# elif format == 'json': +# keys = ('orient', 'lines') +# options = {k: kwargs[k] for k in keys if k in kwargs} +# df = pd.read_json(file, **options) +# else: +# raise Exception('Unsupported file format: {}'.format(format)) + +# x = kwargs.get("x", "text") +# y = kwargs.get('y', 'labels') + +# multi_label = lambda labels: class_sep in str(labels) +# self.support_dataset = [[row[x], str(row[y])] for _, row in df.iterrows() if not multi_label(row[y])] + +# else: +# log.warning("Cannot find {} file".format(path)) + +# def __call__(self, input_texts): +# return 
np.array(self.support_dataset).T.tolist() \ No newline at end of file From a976260cf5e1d85ef48c51686c4cf40f90f9ba99 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 28 Sep 2022 11:14:02 +0300 Subject: [PATCH 07/57] init dnnc --- deeppavlov/configs/classifiers/dnnc.json | 104 ++++++++++++ .../configs/classifiers/roberta_nli.json | 60 +++++-- deeppavlov/core/common/metrics_registry.json | 8 +- deeppavlov/core/common/registry.json | 6 +- .../dataset_iterators/few_shot_iterator.py | 145 +++++++++++------ deeppavlov/metrics/accuracy.py | 35 ---- deeppavlov/metrics/few_shot_metrics.py | 68 ++++++++ deeppavlov/metrics/roc_auc_score.py | 19 +-- .../models/classifiers/dnnc_proba2labels.py | 93 +++++++++++ .../models/preprocessors/dnnc_preprocessor.py | 150 +----------------- 10 files changed, 419 insertions(+), 269 deletions(-) create mode 100644 deeppavlov/configs/classifiers/dnnc.json create mode 100644 deeppavlov/metrics/few_shot_metrics.py create mode 100644 deeppavlov/models/classifiers/dnnc_proba2labels.py diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json new file mode 100644 index 0000000000..d4391d4516 --- /dev/null +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -0,0 +1,104 @@ +{ + "dataset_reader": { + "class_name": "basic_classification_reader", + "x": "text", + "y": "category", + "data_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc", + "valid": "dev.csv", + "test": "test.csv" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42, + "shuffle": true + }, + "chainer": { + "in": ["x"], + "in_y": ["y_true"], + "pipe": [ + { + "class_name": "dnnc_input_preprocessor", + "in": ["x"], + "out": ["x_support", "x_populated", "y_support"], + "x": "text", + "y": "category", + "class_sep": ",", + "support_dataset_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc/train.csv" + }, + { + "class_name": "torch_transformers_preprocessor", + "in": ["x_support", "x_populated"], + "out": ["bert_features"], + "vocab_file": "{BASE_MODEL}", + "do_lower_case": true, + "max_seq_length": 128 + }, + { + "class_name": "torch_transformers_classifier", + "main": true, + "in": ["bert_features"], + "out": ["simmilarity_scores"], + "n_classes": 2, + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "is_binary": "{BINARY_CLASSIFICATION}", + "optimizer_parameters": { + "lr": 3e-04 + } + }, + { + "class_name": "dnnc_proba2labels", + "in": ["simmilarity_scores", "x", "x_populated", "y_support"], + "out": ["y_pred"], + "multilabel": false, + "confidence_threshold": 0.8 + } + ], + "out": ["y_pred"] + }, + "train": { + "batch_size": 1, + "metrics": [ + { + "name": "sklearn_accuracy", + "inputs": ["y_true", "y_pred"], + "exclude_oos": true + }, + { + "name": "sklearn_balanced_accuracy", + "inputs": ["y_true", "y_pred"], + "exclude_oos": true + }, + { + "name": "oos_scores", + "inputs": ["y_true", "y_pred"] + }, + { + "name": "joint_accuracy_in_recall_oos", + "inputs": ["y_true", "y_pred"] + } + ], + "show_examples": false, + "evaluation_targets": ["test"], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", + "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", + "BINARY_CLASSIFICATION": true, + "BASE_MODEL": "roberta-base" + } + }, + "download": [ + { + 
"url": "http://files.deeppavlov.ai/classifiers/few_shot_infer.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] +} \ No newline at end of file diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 7f0b7c554a..38e405b880 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -1,7 +1,24 @@ { + "dataset_reader": { + "class_name": "basic_classification_reader", + "class_sep": ",", + "x": "text", + "y": "category", + "data_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc", + "train": "train.csv", + "valid": "dev.csv", + "test": "test.csv" + }, + "dataset_iterator": { + "class_name": "few_shot_iterator", + "seed": 42, + "shuffle": true, + "shot": 5, + "save_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json" + }, "chainer": { "in": ["hypothesis", "premise"], - "in_y": ["binary_label"], + "in_y": ["y_true"], "pipe": [ { "class_name": "torch_transformers_preprocessor", @@ -13,8 +30,9 @@ }, { "class_name": "torch_transformers_classifier", + "main": true, "in": ["bert_features"], - "in_y": ["binary_label"], + "in_y": ["y_true"], "out": ["simmilarity_scores"], "n_classes": 2, "return_probas": true, @@ -24,31 +42,47 @@ "is_binary": "{BINARY_CLASSIFICATION}", "optimizer": "AdamW", "optimizer_parameters": { - "lr": 3e-04, - "weight_decay": 0.01, - "betas": [0.9, 0.999], - "eps": 1e-08 - }, - "learning_rate_drop_patience": 24, - "learning_rate_drop_div": 2, - "min_learning_rate": 1e-5 + "lr": 2e-05 + } + }, + { + "class_name": "proba2labels", + "in": ["simmilarity_scores"], + "out": ["y_pred"], + "max_proba": true + } + ], + "out": ["y_pred"] + }, + "train": { + "batch_size": 900, + "epochs": 7, + "log_every_n_epochs": 1, + "val_every_n_epochs": 1, + "validate_first": false, + "metrics": [ + { + "name": "sklearn_accuracy", + "inputs": ["y_true", "y_pred"] } ], - "out": ["simmilarity_scores"] + "evaluation_targets": ["valid"], + "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", + "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli", + "DNNC_PATH": "{MODELS_PATH}/classification/dnnc", "BASE_MODEL": "roberta-base", "BINARY_CLASSIFICATION": true } }, "download": [ { - "url": "http://files.deeppavlov.ai/v1/classifiers/few_shot_infer.tar.gz", + "url": "http://files.deeppavlov.ai/classifiers/few_shot_infer.tar.gz", "subdir": "{MODEL_PATH}" } ] diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json index 49b88dc61d..84c52b944e 100644 --- a/deeppavlov/core/common/metrics_registry.json +++ b/deeppavlov/core/common/metrics_registry.json @@ -1,6 +1,5 @@ { "acc": "deeppavlov.metrics.accuracy:round_accuracy", - "accuracy": "deeppavlov.metrics.accuracy:accuracy", "average__ner_f1__f1_macro__f1": "deeppavlov.metrics.fmeasure:ner_f1__f1_macro__f1", "average__roc_auc__roc_auc__ner_f1": "deeppavlov.metrics.fmeasure:roc_auc__roc_auc__ner_f1", "bleu": "deeppavlov.metrics.bleu:bleu", @@ -10,6 +9,7 @@ "f1_macro": "deeppavlov.metrics.fmeasure:round_f1_macro", "f1_weighted": "deeppavlov.metrics.fmeasure:round_f1_weighted", "google_bleu": "deeppavlov.metrics.bleu:google_bleu", + "joint_accuracy_in_recall_oos": "deeppavlov.metrics.few_shot_metrics:joint_accuracy_in_recall_oos", "kbqa_accuracy": "deeppavlov.metrics.accuracy:kbqa_accuracy", "log_loss": 
"deeppavlov.metrics.log_loss:sk_log_loss", "matthews_correlation": "deeppavlov.metrics.correlation:matthews_correlation", @@ -19,7 +19,7 @@ "multitask_token_accuracy": "deeppavlov.metrics.accuracy:multitask_token_accuracy", "ner_f1": "deeppavlov.metrics.fmeasure:ner_f1", "ner_token_f1": "deeppavlov.metrics.fmeasure:ner_token_f1", - "oos_scores": "deeppavlov.metrics.roc_auc_score:oos_scores", + "oos_scores": "deeppavlov.metrics.few_shot_metrics:oos_scores", "pearson_correlation": "deeppavlov.metrics.correlation:pearson_correlation", "per_item_bleu": "deeppavlov.metrics.bleu:per_item_bleu", "per_item_dialog_accuracy": "deeppavlov.metrics.accuracy:per_item_dialog_accuracy", @@ -33,6 +33,10 @@ "rank_response": "deeppavlov.models.ranking.metrics:rank_response", "roc_auc": "deeppavlov.metrics.roc_auc_score:roc_auc_score", "sets_accuracy": "deeppavlov.metrics.accuracy:sets_accuracy", + "sklearn_accuracy": "deeppavlov.metrics.few_shot_metrics:accuracy", + "sklearn_balanced_accuracy": "deeppavlov.metrics.few_shot_metrics:balanced_accuracy", + "sklearn_classification_report": "deeppavlov.metrics.few_shot_metrics:report", + "sklearn_precision_recall_fscore_support": "deeppavlov.metrics.few_shot_metrics:recision_recall_fscore", "slots_accuracy": "deeppavlov.metrics.accuracy:slots_accuracy", "spearman_correlation": "deeppavlov.metrics.correlation:spearman_correlation", "squad_v1_em": "deeppavlov.metrics.squad_metrics:squad_v1_exact_match", diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 582db213fc..27fd236482 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -13,10 +13,7 @@ "docred_reader": "deeppavlov.dataset_readers.docred_reader:DocREDDatasetReader", "document_chunker": "deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker", "dnnc_input_preprocessor": "deeppavlov.models.preprocessors.dnnc_preprocessor:InputPreprocessor", - "dnnc_labels_to_binary": "deeppavlov.models.preprocessors.dnnc_preprocessor:Labels2Binary", - "dnnc_pair_maker": "deeppavlov.models.preprocessors.dnnc_preprocessor:PairMaker", - "dnnc_printer": "deeppavlov.models.preprocessors.dnnc_preprocessor:Printer", - "dnnc_support_dataset_loader": "deeppavlov.models.preprocessors.dnnc_preprocessor:SupportDatasetLoader", + "dnnc_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:Proba2Labels", "entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser", "entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker", "faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader", @@ -46,7 +43,6 @@ "paraphraser_reader": "deeppavlov.dataset_readers.paraphraser_reader:ParaphraserReader", "pop_ranker": "deeppavlov.models.doc_retrieval.pop_ranker:PopRanker", "proba2labels": "deeppavlov.models.classifiers.proba2labels:Proba2Labels", - "proba2labels_few_shot": "deeppavlov.models.classifiers.proba2labels_few_shot:Proba2LabelsFewShot", "query_generator": "deeppavlov.models.kbqa.query_generator:QueryGenerator", "question_sign_checker": "deeppavlov.models.entity_extraction.entity_detection_parser:question_sign_checker", "re_classifier": "deeppavlov.models.relation_extraction.relation_extraction_bert:REBertModel", diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index a01700c872..96f4412966 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ 
b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -14,67 +14,118 @@ import json +from random import Random from typing import Dict, Any, List, Tuple, Generator, Optional +from collections import defaultdict +from logging import getLogger import numpy as np +from overrides import overrides +from pyparsing import null_debug_action from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +import wandb -@register('few_shot_iterator') -class FewShotIterator(DataLearningIterator): - """Dataset iterator for multiparagraph-SQuAD dataset. - - reads data from jsonl files - - With ``with_answer_rate`` rate samples context with answer and with ``1 - with_answer_rate`` samples context - from the same article, but without an answer. Contexts without an answer are sampled from uniform distribution. - If ``with_answer_rate`` is None than we compute actual ratio for each data example. - - It extracts ``context``, ``question``, ``answer_text`` and ``answer_start`` position from dataset. - Example from a dataset is a tuple of ``(context, question)`` and ``(answer_text, answer_start)``. If there is - no answer in context, then ``answer_text`` is empty string and `answer_start` is equal to -1. - - Args: - data: dict with keys ``'train'``, ``'valid'`` and ``'test'`` and values - seed: random seed for data shuffling - shuffle: whether to shuffle data during batching - - Attributes: - shuffle: whether to shuffle data during batching - random: instance of ``Random`` initialized with a seed - """ - def __init__(self, data, *args, **kwargs) -> None: +log = getLogger(__name__) - self.train = data.get('train', []) - self.valid = data.get('valid', []) - self.test = data.get('test', []) +@register('few_shot_iterator') +class FewShotIterator(DataLearningIterator): + def __init__(self, + data: Dict[str, List[Tuple[Any, Any]]], + seed: int = None, + shuffle: bool = True, + shot: Optional[int] = None, + save_path: Optional[str] = None, + *args, **kwargs) -> None: + data = self._remove_multiclass_examples(data) + self.shot = shot + self.shuffle = shuffle + self.random = Random(seed) + self.train = self.preprocess(self._get_shot_examples(data.get('train', [])), *args, **kwargs) + self.valid = self.preprocess(data.get('valid', []), *args, **kwargs) + self.test = self.preprocess(data.get('test', []), *args, **kwargs) + self.split(*args, **kwargs) # TODO: delete it self.data = { 'train': self.train, 'valid': self.valid, 'test': self.test, + 'all': self.train + self.test + self.valid } - - def gen_batches(self, batch_size: int, data_type: str, shuffle: bool): - train_examples = self.data['train'] - test_examples = self.data[data_type] - - for test_example, test_cat in test_examples: - batch = [] - for train_example, train_cat in train_examples: - - if isinstance(train_cat, list) or isinstance(train_cat, tuple): - train_cat = train_cat[0] - - if isinstance(test_cat, list) or isinstance(test_cat, tuple): - test_cat = test_cat[0] - - - batch.append(((train_example, test_example, train_cat), test_cat)) - - if batch: - yield tuple(zip(*batch)) + if save_path: + with open(save_path, "w") as file: + json.dump(self.data, file, indent=4) + + + @overrides + def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]: + if len(data) == 0: + return data + + unique_labels = list(set([label for text, label in data])) + + labels_dict = {} + for label in unique_labels: + labels_dict[label] = [] + + for text, label in data: + 
labels_dict[label].append(text) + + negative_labels = {} + for i, label in enumerate(unique_labels): + negative_labels[label] = unique_labels.copy() + del negative_labels[label][i] + + nli_triplets = [] + # negative examples + for text, label in data: + for negative_label in negative_labels[label]: + for negative_example in labels_dict[negative_label]: + nli_triplets.append([[text, negative_example], 0]) + # positive examples + for text, label in data: + for positive_example in labels_dict[label]: + nli_triplets.append([[text, positive_example], 1]) + + if self.shuffle: + self.random.shuffle(nli_triplets) + return nli_triplets + + def _remove_multiclass_examples(self, data: Dict[str, List[Tuple[Any, Any]]]): + new_data = {"train": [], "valid": [], "test": []} + for key in new_data.keys(): + for text, labels_list in data[key]: + if len(labels_list) == 1: + new_data[key].append((text, labels_list[0])) + + return new_data + + + def _get_shot_examples(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: + if self.shot is None: + return data + + # shuffle data to select shot-examples + if self.shuffle: + self.random.shuffle(data) + + data_dict = {} + for _, label in data: + data_dict[label] = [] + + for text, label in data: + if len(data_dict[label]) < self.shot: + data_dict[label].append(text) + + if max(len(x) for x in data_dict.values()) < self.shot: + log.warning(f"Some labels have less than \"shot\"={self.shot} examples") + + new_data = [] + for label in data_dict.keys(): + for text in data_dict[label]: + new_data.append((text, label)) + return new_data diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py index 5d08bed5a7..1042a74e78 100644 --- a/deeppavlov/metrics/accuracy.py +++ b/deeppavlov/metrics/accuracy.py @@ -21,41 +21,6 @@ from deeppavlov.core.common.metrics_registry import register_metric -@register_metric('accuracy') -def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray], exclude_oos: bool = False) -> float: - """ - Calculate accuracy in terms of absolute coincidence - - Args: - y_true: array of true values - y_predicted: array of predicted values - - Returns: - fraction of absolutely coincidental samples - """ - if exclude_oos: - y_true = np.array(y_true) - y_predicted = np.array(y_predicted) - - ind_mask = np.where(y_true == 'oos') - - y_true = np.delete(y_true, ind_mask, 0) - y_predicted = np.delete(y_predicted, ind_mask, 0) - - examples_len = len(y_true) - # if y1 and y2 are both arrays, == can be erroneously interpreted as element-wise equality - - def _are_equal(y1, y2): - answer = (y1 == y2) - if isinstance(answer, np.ndarray): - answer = answer.all() - return answer - - equalities = [_are_equal(y1, y2) for y1, y2 in zip(y_true, y_predicted)] - correct = sum(equalities) - return correct / examples_len if examples_len else 0 - - @register_metric('multitask_accuracy') def multitask_accuracy(*args) -> float: """ diff --git a/deeppavlov/metrics/few_shot_metrics.py b/deeppavlov/metrics/few_shot_metrics.py new file mode 100644 index 0000000000..a084c7f197 --- /dev/null +++ b/deeppavlov/metrics/few_shot_metrics.py @@ -0,0 +1,68 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Union, Optional + +import numpy as np +from deeppavlov.core.common.metrics_registry import register_metric +from sklearn.metrics import accuracy_score, \ + balanced_accuracy_score, \ + precision_recall_fscore_support, \ + classification_report + +def delete_oos(y_true, y_pred): + y_true = np.array(y_true) + y_pred = np.array(y_pred) + + ind_mask = np.where(y_true == 'oos') + + y_true = np.delete(y_true, ind_mask, 0) + y_pred = np.delete(y_pred, ind_mask, 0) + return y_true, y_pred + +@register_metric('sklearn_accuracy') +def accuracy(y_true, y_pred, exclude_oos: bool = False) -> float: + if exclude_oos: + y_true, y_pred = delete_oos(y_true, y_pred) + return accuracy_score(y_true, y_pred) + + +@register_metric('sklearn_balanced_accuracy') +def balanced_accuracy(y_true, y_pred, exclude_oos: bool = False) -> float: + if exclude_oos: + y_true, y_pred = delete_oos(y_true, y_pred) + + return balanced_accuracy_score(y_true, y_pred) + +@register_metric('joint_accuracy_in_recall_oos') +def joint_accuracy_in_recall_oos(y_true, y_pred) -> float: + return accuracy(y_true, y_pred, exclude_oos=True) + oos_scores(y_true, y_pred)["recall"] + +@register_metric('sklearn_precision_recall_fscore_support') +def recision_recall_fscore(*args, **kwargs): + return precision_recall_fscore_support(*args, **kwargs) + + +@register_metric('oos_scores') +def oos_scores(y_true, y_pred): + y_true_binary = (np.array(y_true) == "oos") + y_pred_binary = (np.array(y_pred) == "oos") + scores = precision_recall_fscore_support(y_true_binary, y_pred_binary, average='binary') + return dict(zip(["precision", "recall", "fbeta_score"], scores[:3])) + + +@register_metric('sklearn_classification_report') +def report(*args, **kwargs): + return classification_report(output_dict=True, *args, **kwargs) diff --git a/deeppavlov/metrics/roc_auc_score.py b/deeppavlov/metrics/roc_auc_score.py index 911f0fec0d..a2a3ebb841 100644 --- a/deeppavlov/metrics/roc_auc_score.py +++ b/deeppavlov/metrics/roc_auc_score.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import List, Union +from typing import List, Union, Optional import numpy as np import sklearn.metrics @@ -37,22 +37,5 @@ def roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray], try: return sklearn.metrics.roc_auc_score(np.squeeze(np.array(y_true)), np.squeeze(np.array(y_pred)), average="macro") - except ValueError: - return 0. - -@register_metric('oos_scores') -def oos_scores(y_true: Union[List[List[float]], List[List[int]], np.ndarray], - y_pred: Union[List[List[float]], List[List[int]], np.ndarray]) -> float: - """ - Args: - y_true: true binary labels - y_pred: target scores, can either be probability estimates of the positive class - - Returns: - """ - try: - report = sklearn.metrics.classification_report(np.squeeze(np.array(y_true)), - np.squeeze(np.array(y_pred)), output_dict=True) - return report['oos'] except ValueError: return 0. 
\ No newline at end of file
diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py
new file mode 100644
index 0000000000..d9ec39bd37
--- /dev/null
+++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py
@@ -0,0 +1,93 @@
+# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from logging import getLogger
+from typing import List, Union
+
+import numpy as np
+import torch
+
+from deeppavlov.core.common.errors import ConfigError
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.models.component import Component
+
+log = getLogger(__name__)
+
+
+@register('dnnc_proba2labels')
+class Proba2Labels(Component):
+    """Pools pairwise DNNC similarity scores per class and predicts a label,
+    falling back to 'oos' when no class clears ``confidence_threshold``."""
+
+    def __init__(self,
+                 confidence_threshold: float = 0.8,
+                 pooling: str = 'mean',
+                 multilabel: bool = False,
+                 **kwargs) -> None:
+
+        self.confidence_threshold = confidence_threshold
+        self.pooling = pooling
+        self.multilabel = multilabel
+
+    def __call__(self,
+                 simmilarity_scores: Union[np.ndarray, List[List[float]], List[List[int]]],
+                 x: List[str],
+                 x_populated: List[str],
+                 y_support: List[str],
+                 *args,
+                 **kwargs):
+        y_pred = []
+        simmilarity_scores = np.array(simmilarity_scores)
+        x = np.array(x)
+        x_populated = np.array(x_populated)
+        y_support = np.array(y_support)
+        unique_labels = np.unique(y_support)
+
+        for example in x:
+            # all pairwise scores produced for this input example
+            example_mask = np.where(x_populated == example)
+            example_simmilarity_scores = simmilarity_scores[example_mask]
+            example_y_support = y_support[example_mask]
+
+            probability_by_label = []
+            for label in unique_labels:
+                ind_mask = np.where(example_y_support == label)
+                if self.pooling == 'mean':
+                    label_probability = np.mean(example_simmilarity_scores[ind_mask])
+                elif self.pooling == 'max':
+                    label_probability = np.max(example_simmilarity_scores[ind_mask])
+                probability_by_label.append(label_probability)
+            probability_by_label = np.array(probability_by_label)
+
+            if self.multilabel:
+                threshold_mask = np.where(probability_by_label >= self.confidence_threshold)
+                threshold_y_support = unique_labels[threshold_mask]
+                prediction = ["oos"] if threshold_y_support.size == 0 else threshold_y_support
+            else:
+                max_probability = max(probability_by_label)
+                max_probability_label = unique_labels[np.argmax(probability_by_label)]
+                prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label
+
+            y_pred.append(prediction)
+
+            # print(f"example = {example}")
+            # print(f"example_simmilarity_scores = {example_simmilarity_scores}")
+            # print(f"probability_by_label = {example_simmilarity_scores}")
+            # print(f"prediction = {prediction}")
+            # print(f"max(probability_by_label) = {max(probability_by_label)}")
+            # raise Exception
+
+        return y_pred
\ No newline at end of file
diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
index 00b276ee0e..2e2217e64c 100644
--- 
a/deeppavlov/models/preprocessors/dnnc_preprocessor.py +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -52,152 +52,4 @@ def __call__(self, hypotesis_batch.append(hypotesis) hypotesis_labels_batch.append(hypotesis_labels) - return hypotesis_batch, premise_batch, hypotesis_labels_batch - - - -# @register('test42') -# class Test42(Component): -# def __init__(self, *args, **kwargs) -> None: -# pass - -# def __call__(self, input): -# return input[:2] - - -# @register('dnnc_input_preprocessor') -# class PairMaker(Component): # TODO: а тут точно нужен component а не что-то ещё ? -# def __init__(self, *args, **kwargs) -> None: -# pass -# def __call__(self, input_texts, support_texts, support_labels): - -# pairs = [] -# for x, y, z in zip(input_texts, support_texts, support_labels): -# for i, j in zip(y, z): -# pairs.append([x, i, j]) -# pairs = np.array(pairs) -# print() -# print("PAIRS SHAPE = ", pairs.shape) -# pairs = pairs.T.tolist() -# transformed_input_texts = pairs[0] -# transformed_input_texts = pairs[1] -# transformed_support_labels = pairs[2] - -# print(f"PAIRS: transformed_inputs_batch = {transformed_input_texts[:3]}, len = {len(transformed_input_texts)}") -# print(f"PAIRS: transformed_support_batch = {transformed_input_texts[:3]}, len = {len(transformed_input_texts)}") -# print(f"PAIRS: transformed_support_labels = {transformed_support_labels[:3]}, len = {len(transformed_support_labels)}") -# return transformed_input_texts, transformed_input_texts, transformed_support_labels - -@register('dnnc_pair_maker') -class PairMaker(Component): # TODO: а тут точно нужен component а не что-то ещё ? - def __init__(self, *args, **kwargs) -> None: - pass - def __call__(self, input_batch, support_batch): - - pairs = [] - for x, y in zip(input_batch, support_batch): - for i in y: - pairs.append([x, i]) - pairs = np.array(pairs) - print() - print("PAIRS SHAPE = ", pairs.shape) - pairs = pairs.T.tolist() - transformed_inputs_batch = pairs[0] - transformed_support_batch = pairs[1] - - print(f"PAIRS: transformed_inputs_batch = {transformed_inputs_batch[:3]}, len = {len(transformed_inputs_batch)}") - print(f"PAIRS: transformed_support_batch = {transformed_support_batch[:3]}, len = {len(transformed_support_batch)}") - return transformed_inputs_batch, transformed_support_batch - -@register('dnnc_printer') -class Printer(Component): - def __init__(self, *args, **kwargs) -> None: - pass - - def __call__(self, *args, **kwargs): - print("________________PRINTER__________") - print(**kwargs) - print("_________________________________") - -@register('dnnc_labels_to_binary') -class Labels2Binary(Component): - def __init__(self, *args, **kwargs) -> None: - pass - - def __call__(self, true_labels, support_labels): - return (true_labels == support_labels) - -@register('dnnc_support_dataset_loader') -class SupportDatasetLoader(Component): - def __init__(self, - path: str, - format: str = "csv", - class_sep: str = ",", - *args, **kwargs) -> None: - file = Path(path) - if file.exists(): - if format == 'csv': - keys = ('sep', 'header', 'names') - options = {k: kwargs[k] for k in keys if k in kwargs} - df = pd.read_csv(file, **options) - elif format == 'json': - keys = ('orient', 'lines') - options = {k: kwargs[k] for k in keys if k in kwargs} - df = pd.read_json(file, **options) - else: - raise Exception('Unsupported file format: {}'.format(format)) - - x = kwargs.get("x", "text") - y = kwargs.get('y', 'labels') - - multi_label = lambda labels: class_sep in str(labels) - support_dataset = [[row[x], 
str(row[y])] for _, row in df.iterrows() if not multi_label(row[y])] - support_dataset = np.array(support_dataset) - - self.infer_support_texts = support_dataset[:, 0] - self.infer_support_labels = support_dataset[:, 1] - - else: - log.warning("Cannot find {} file".format(path)) - - def __call__(self, - input_texts : List[str]) -> List[List[str]]: - input_size = len(input_texts) - - print("input_size = ", input_size) - print("self.infer_support_texts = ", self.infer_support_texts.size) - print("self.infer_support_labels = ", self.infer_support_labels.size) - return ([self.infer_support_texts] * input_size, [self.infer_support_labels] * input_size) - - -# @register('dnnc_support_dataset_loader') -# class SupportDatasetLoader(Component): -# def __init__(self, -# path: str, -# format: str = "csv", -# class_sep: str = ",", -# *args, **kwargs) -> None: -# file = Path(path) -# if file.exists(): -# if format == 'csv': -# keys = ('sep', 'header', 'names') -# options = {k: kwargs[k] for k in keys if k in kwargs} -# df = pd.read_csv(file, **options) -# elif format == 'json': -# keys = ('orient', 'lines') -# options = {k: kwargs[k] for k in keys if k in kwargs} -# df = pd.read_json(file, **options) -# else: -# raise Exception('Unsupported file format: {}'.format(format)) - -# x = kwargs.get("x", "text") -# y = kwargs.get('y', 'labels') - -# multi_label = lambda labels: class_sep in str(labels) -# self.support_dataset = [[row[x], str(row[y])] for _, row in df.iterrows() if not multi_label(row[y])] - -# else: -# log.warning("Cannot find {} file".format(path)) - -# def __call__(self, input_texts): -# return np.array(self.support_dataset).T.tolist() \ No newline at end of file + return hypotesis_batch, premise_batch, hypotesis_labels_batch \ No newline at end of file From a4dc57bf068867012797c28c81ff296348901f6e Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 19 Oct 2022 11:30:31 +0300 Subject: [PATCH 08/57] add dnnc training --- deeppavlov/configs/classifiers/dnnc.json | 228 ++++++++++-------- .../configs/classifiers/dnnc_test_2.json | 80 ------ .../configs/classifiers/roberta_nli.json | 198 ++++++++------- deeppavlov/core/common/registry.json | 1 + .../dataset_iterators/few_shot_iterator.py | 66 ++--- .../basic_classification_reader.py | 2 +- .../models/classifiers/dnnc_proba2labels.py | 17 +- .../classifiers/proba2labels_few_shot.py | 103 -------- .../models/preprocessors/dnnc_preprocessor.py | 25 +- few_shot_infer.json | 90 ------- 10 files changed, 298 insertions(+), 512 deletions(-) delete mode 100644 deeppavlov/configs/classifiers/dnnc_test_2.json delete mode 100644 deeppavlov/models/classifiers/proba2labels_few_shot.py delete mode 100644 few_shot_infer.json diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index d4391d4516..bd9fd8773c 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -1,104 +1,134 @@ { - "dataset_reader": { - "class_name": "basic_classification_reader", - "x": "text", - "y": "category", - "data_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc", - "valid": "dev.csv", - "test": "test.csv" - }, - "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "shuffle": true - }, - "chainer": { - "in": ["x"], - "in_y": ["y_true"], - "pipe": [ - { - "class_name": "dnnc_input_preprocessor", - "in": ["x"], - "out": ["x_support", "x_populated", "y_support"], + "dataset_reader": { + "class_name": "basic_classification_reader", + "format": 
"json", + "orient": "split", "x": "text", "y": "category", - "class_sep": ",", - "support_dataset_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc/train.csv" - }, - { - "class_name": "torch_transformers_preprocessor", - "in": ["x_support", "x_populated"], - "out": ["bert_features"], - "vocab_file": "{BASE_MODEL}", - "do_lower_case": true, - "max_seq_length": 128 - }, - { - "class_name": "torch_transformers_classifier", - "main": true, - "in": ["bert_features"], - "out": ["simmilarity_scores"], - "n_classes": 2, - "return_probas": true, - "pretrained_bert": "{BASE_MODEL}", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "is_binary": "{BINARY_CLASSIFICATION}", - "optimizer_parameters": { - "lr": 3e-04 + "data_path": "/archive/savkin/parsed_datasets/clinc150/all&oos/dnnc", + "valid": "dev.json", + "test": "test.json" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42, + "shuffle": true + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y_true" + ], + "pipe": [ + { + "class_name": "dnnc_input_preprocessor", + "in": [ + "x" + ], + "out": [ + "x_support", + "x_populated", + "y_support" + ], + "x": "text", + "y": "category", + "format": "json", + "orient": "split", + "support_dataset_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json" + }, + { + "class_name": "torch_transformers_preprocessor", + "in": [ + "x_populated", + "x_support" + ], + "out": [ + "bert_features" + ], + "vocab_file": "{BASE_MODEL}", + "do_lower_case": true, + "max_seq_length": 128 + }, + { + "class_name": "torch_transformers_classifier", + "main": true, + "in": [ + "bert_features" + ], + "out": [ + "simmilarity_scores" + ], + "n_classes": 2, + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "is_binary": "{BINARY_CLASSIFICATION}", + "optimizer_parameters": { + "lr": 2e-05 + } + }, + { + "class_name": "dnnc_proba2labels", + "in": [ + "simmilarity_scores", + "x", + "x_populated", + "y_support" + ], + "out": [ + "y_pred" + ], + "multilabel": false, + "confidence_threshold": 0.2 + } + ], + "out": [ + "y_pred" + ] + }, + "train": { + "batch_size": 4, + "metrics": [ + { + "name": "sklearn_balanced_accuracy", + "inputs": [ + "y_true", + "y_pred" + ], + "exclude_oos": true + }, + { + "name": "oos_scores", + "inputs": [ + "y_true", + "y_pred" + ] + } + ], + "show_examples": false, + "evaluation_targets": [ + "valid" + ], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", + "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", + "BINARY_CLASSIFICATION": true, + "BASE_MODEL": "roberta-base" } - }, - { - "class_name": "dnnc_proba2labels", - "in": ["simmilarity_scores", "x", "x_populated", "y_support"], - "out": ["y_pred"], - "multilabel": false, - "confidence_threshold": 0.8 - } - ], - "out": ["y_pred"] - }, - "train": { - "batch_size": 1, - "metrics": [ - { - "name": "sklearn_accuracy", - "inputs": ["y_true", "y_pred"], - "exclude_oos": true - }, - { - "name": "sklearn_balanced_accuracy", - "inputs": ["y_true", "y_pred"], - "exclude_oos": true - }, - { - "name": "oos_scores", - "inputs": ["y_true", "y_pred"] - }, - { - "name": "joint_accuracy_in_recall_oos", - "inputs": ["y_true", "y_pred"] - } - ], - "show_examples": false, - "evaluation_targets": ["test"], - "class_name": 
"torch_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", - "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", - "BINARY_CLASSIFICATION": true, - "BASE_MODEL": "roberta-base" - } - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/classifiers/few_shot_infer.tar.gz", - "subdir": "{MODEL_PATH}" - } - ] + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/few_shot_infer.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] } \ No newline at end of file diff --git a/deeppavlov/configs/classifiers/dnnc_test_2.json b/deeppavlov/configs/classifiers/dnnc_test_2.json deleted file mode 100644 index 0b675fb40d..0000000000 --- a/deeppavlov/configs/classifiers/dnnc_test_2.json +++ /dev/null @@ -1,80 +0,0 @@ -{ - "dataset_reader": { - "class_name": "basic_classification_reader", - "class_sep": ",", - "x": "text", - "y": "category", - "data_path": "/home/nchizhikova/binary/DeepPavlov/clinc_5 copy", - "train": "train.csv", - "valid": "valid.csv", - "test": "test.csv" - }, - "dataset_iterator": { - "class_name": "few_shot_iterator" - }, - "chainer": { - "in": ["x"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "dnnc_input_preprocessor", - "in": ["x"], - "out": ["infer_support_texts", "x_populated", "infer_support_labels"], - "x": "text", - "y": "category", - "class_sep": ",", - "support_dataset_path": "/archive/savkin/parsed_datasets/clinc/clinc_5/train.csv" - }, - { - "config_path": "{CONFIGS_PATH}/classifiers/roberta_nli.json", - "in": ["infer_support_texts", "x_populated"], - "out": ["simmilarity_scores"] - }, - { - "class_name": "proba2labels_few_shot", - "in": ["simmilarity_scores", "x", "x_populated", "infer_support_labels", "y"], - "out": ["predicted_labels"], - "is_binary": "{BINARY_CLASSIFICATION}", - "confidence_threshold": 0.8 - } - ], - "out": ["predicted_labels"] - }, - "train": { - "batch_size": 1, - "metrics": [ - { - "name": "accuracy", - "inputs": [ - "y", - "predicted_labels" - ], - "exclude_oos": true - }, - { - "name": "oos_scores", - "inputs": [ - "y", - "predicted_labels"] - } - ], - "validation_patience": 0, - "val_every_n_epochs": 0, - "log_every_n_epochs": 0, - "show_examples": false, - "evaluation_targets": ["valid"], - "class_name": "torch_trainer", - "tensorboard_log_dir": "{MODEL_PATH}/" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", - "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", - "BINARY_CLASSIFICATION": true - } - } - } - diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 38e405b880..573ba51aaf 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -1,89 +1,115 @@ { - "dataset_reader": { - "class_name": "basic_classification_reader", - "class_sep": ",", - "x": "text", - "y": "category", - "data_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc", - "train": "train.csv", - "valid": "dev.csv", - "test": "test.csv" - }, - "dataset_iterator": { - "class_name": "few_shot_iterator", - "seed": 42, - "shuffle": true, - "shot": 5, - "save_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json" - }, - "chainer": { - "in": ["hypothesis", "premise"], - "in_y": ["y_true"], - "pipe": [ - { - 
"class_name": "torch_transformers_preprocessor", - "in": ["hypothesis", "premise"], - "out": ["bert_features"], - "vocab_file": "{BASE_MODEL}", - "do_lower_case": false, - "max_seq_length": 128 - }, - { - "class_name": "torch_transformers_classifier", - "main": true, - "in": ["bert_features"], - "in_y": ["y_true"], - "out": ["simmilarity_scores"], - "n_classes": 2, - "return_probas": true, - "pretrained_bert": "{BASE_MODEL}", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "is_binary": "{BINARY_CLASSIFICATION}", - "optimizer": "AdamW", - "optimizer_parameters": { - "lr": 2e-05 + "dataset_reader": { + "class_name": "basic_classification_reader", + "format": "json", + "orient": "split", + "x": "text", + "y": "category", + "data_path": "/archive/savkin/parsed_datasets/clinc150/all/dnnc", + "train": "train.json", + "valid": "dev.json", + "test": "test.json" + }, + "dataset_iterator": { + "class_name": "few_shot_iterator", + "seed": 0, + "shuffle": true, + "shot": 10, + "save_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json" + }, + "chainer": { + "in": [ + "hypothesis", + "premise" + ], + "in_y": [ + "y_true" + ], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "in": [ + "hypothesis", + "premise" + ], + "out": [ + "bert_features" + ], + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128 + }, + { + "class_name": "torch_transformers_classifier", + "in": [ + "bert_features" + ], + "in_y": [ + "y_true" + ], + "out": [ + "simmilarity_scores" + ], + "n_classes": 2, + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{DNNC_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "is_binary": "{BINARY_CLASSIFICATION}", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05 + } + }, + { + "class_name": "nli_proba2labels", + "in": [ + "simmilarity_scores" + ], + "out": [ + "y_pred" + ] + } + ], + "out": [ + "y_pred" + ] + }, + "train": { + "batch_size": 500, + "epochs": 7, + "log_every_n_epochs": 1, + "validate_first": false, + "validation_patience": -1, + "metrics": [ + { + "name": "sklearn_balanced_accuracy", + "inputs": [ + "y_true", + "y_pred" + ] + } + ], + "evaluation_targets": [ + "valid" + ], + "class_name": "torch_trainer" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli", + "DNNC_PATH": "{MODELS_PATH}/classification/dnnc", + "BASE_MODEL": "roberta-base", + "BINARY_CLASSIFICATION": true } - }, - { - "class_name": "proba2labels", - "in": ["simmilarity_scores"], - "out": ["y_pred"], - "max_proba": true - } - ], - "out": ["y_pred"] - }, - "train": { - "batch_size": 900, - "epochs": 7, - "log_every_n_epochs": 1, - "val_every_n_epochs": 1, - "validate_first": false, - "metrics": [ - { - "name": "sklearn_accuracy", - "inputs": ["y_true", "y_pred"] - } - ], - "evaluation_targets": ["valid"], - "class_name": "torch_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli", - "DNNC_PATH": "{MODELS_PATH}/classification/dnnc", - "BASE_MODEL": "roberta-base", - "BINARY_CLASSIFICATION": true - } - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/classifiers/few_shot_infer.tar.gz", - "subdir": "{MODEL_PATH}" - } - ] + }, + "download": [ + { + "url": 
"http://files.deeppavlov.ai/classifiers/few_shot_infer.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] } \ No newline at end of file diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 27fd236482..561eb59d22 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -34,6 +34,7 @@ "ner_chunk_model": "deeppavlov.models.entity_extraction.ner_chunker:NerChunkModel", "ner_chunker": "deeppavlov.models.entity_extraction.ner_chunker:NerChunker", "ner_vocab": "deeppavlov.models.preprocessors.ner_preprocessor:NerVocab", + "nli_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:NLIProba2Labels", "nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer", "nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer", "nn_trainer": "deeppavlov.core.trainers.nn_trainer:NNTrainer", diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index 96f4412966..d209c1d88f 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -41,14 +41,14 @@ def __init__(self, shot: Optional[int] = None, save_path: Optional[str] = None, *args, **kwargs) -> None: - data = self._remove_multiclass_examples(data) self.shot = shot self.shuffle = shuffle self.random = Random(seed) - self.train = self.preprocess(self._get_shot_examples(data.get('train', [])), *args, **kwargs) + + train_shot_examples = self._get_shot_examples(data.get('train', [])) + self.train = self.preprocess(train_shot_examples, *args, **kwargs) self.valid = self.preprocess(data.get('valid', []), *args, **kwargs) self.test = self.preprocess(data.get('test', []), *args, **kwargs) - self.split(*args, **kwargs) # TODO: delete it self.data = { 'train': self.train, 'valid': self.valid, @@ -56,53 +56,55 @@ def __init__(self, 'all': self.train + self.test + self.valid } - if save_path: - with open(save_path, "w") as file: - json.dump(self.data, file, indent=4) - - - @overrides - def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]: - if len(data) == 0: - return data + if save_path is None: + return + + with open(save_path, "w") as file: + json_dict = {"columns": ["text","category"]} + json_dict["data"] = [[text, label] for text, label in train_shot_examples] + json.dump(json_dict, file, indent=4) + + def _gather_info(self, data: List[Tuple[Any, Any]]): unique_labels = list(set([label for text, label in data])) - labels_dict = {} + label2examples = {} for label in unique_labels: - labels_dict[label] = [] - + label2examples[label] = [] for text, label in data: - labels_dict[label].append(text) + label2examples[label].append(text) - negative_labels = {} + label2negative = {} for i, label in enumerate(unique_labels): - negative_labels[label] = unique_labels.copy() - del negative_labels[label][i] + label2negative[label] = unique_labels.copy() + del label2negative[label][i] + + return label2examples, label2negative + + + @overrides + def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]: + if len(data) == 0: + return data + + label2examples, label2negative = self._gather_info(data) nli_triplets = [] # negative examples for text, label in data: - for negative_label in negative_labels[label]: - for negative_example in labels_dict[negative_label]: + for negative_label in label2negative[label]: + for negative_example in 
label2examples[negative_label]: nli_triplets.append([[text, negative_example], 0]) + # positive examples for text, label in data: - for positive_example in labels_dict[label]: - nli_triplets.append([[text, positive_example], 1]) + for positive_example in label2examples[label]: + if positive_example != text: + nli_triplets.append([[text, positive_example], 1]) if self.shuffle: self.random.shuffle(nli_triplets) return nli_triplets - - def _remove_multiclass_examples(self, data: Dict[str, List[Tuple[Any, Any]]]): - new_data = {"train": [], "valid": [], "test": []} - for key in new_data.keys(): - for text, labels_list in data[key]: - if len(labels_list) == 1: - new_data[key].append((text, labels_list[0])) - - return new_data def _get_shot_examples(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index c354d2dc11..b0a069c917 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -60,7 +60,7 @@ def read(self, data_path: str, url: str = None, data_types = ["train", "valid", "test"] train_file = kwargs.get('train', 'train.csv') - + print("PATH = ", Path(data_path, train_file)) if not Path(data_path, train_file).exists(): if url is None: raise Exception( diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index d9ec39bd37..6af0ba2f97 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -79,15 +79,14 @@ def __call__(self, prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label y_pred.append(prediction) - - # print(f"example = {example}") - # print(f"example_simmilarity_scores = {example_simmilarity_scores}") - # print(f"probability_by_label = {example_simmilarity_scores}") - # print(f"prediction = {prediction}") - # print(f"max(probability_by_label) = {max(probability_by_label)}") - # raise Exception - return y_pred - \ No newline at end of file +@register('nli_proba2labels') +class NLIProba2Labels(Component): + def __init__(self, **kwargs): + pass + + def __call__(self, simmilarity_scores, *args, **kwargs): + # print("SCORES = ", simmilarity_scores) + return (np.array(simmilarity_scores) > 0.5).astype(int) \ No newline at end of file diff --git a/deeppavlov/models/classifiers/proba2labels_few_shot.py b/deeppavlov/models/classifiers/proba2labels_few_shot.py deleted file mode 100644 index 6cd1c13b70..0000000000 --- a/deeppavlov/models/classifiers/proba2labels_few_shot.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from logging import getLogger -from typing import List, Union - -import numpy as np -import torch - -from deeppavlov.core.common.errors import ConfigError -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.component import Component - -log = getLogger(__name__) - - -@register('proba2labels_few_shot') -class Proba2LabelsFewShot(Component): - """ - Class implements probability to labels processing using the following ways: \ - choosing one or top_n indices with maximal probability or choosing any number of indices \ - which probabilities to belong with are higher than given confident threshold - - Args: - max_proba: whether to choose label with maximal probability - confidence_threshold: boundary probability value for sample to belong with the class (best use for multi-label) - top_n: how many top labels with the highest probabilities to return - - Attributes: - max_proba: whether to choose label with maximal probability - confidence_threshold: boundary probability value for sample to belong with the class (best use for multi-label) - top_n: how many top labels with the highest probabilities to return - """ - - def __init__(self, - max_proba: bool = None, - confidence_threshold: float = None, - top_n: int = None, - is_binary: bool = False, - pooling: str = 'mean', - **kwargs) -> None: - """ Initialize class with given parameters""" - - self.confidence_threshold = confidence_threshold - self.is_binary = is_binary - self.pooling = pooling - - def __call__(self, - data: Union[np.ndarray, - List[List[float]], - List[List[int]]], - train_cat: List[str], - test_cat: List[str], - *args, - **kwargs): - """ - Process probabilities to labels - - Args: - data: list of vectors with probability distribution - - Returns: - list of labels - """ - probas_by_class = [] - - unique_cats = list(sorted(set(train_cat))) - train_cat = np.array(train_cat) - - for cat in unique_cats: - ind_mask = np.where(train_cat == cat) - - if self.pooling == 'mean': - class_proba = np.mean(data[ind_mask]) - elif self.pooling == 'max': - class_proba = np.max(data[ind_mask]) - - probas_by_class.append(class_proba) - - if self.confidence_threshold: - max_conf = np.max(probas_by_class) - - if max_conf > self.confidence_threshold: - pred_id = np.argmax(probas_by_class) - y_pred = unique_cats[pred_id] - else: - y_pred = 'oos' - - return [[y_pred], [test_cat[0]]] - - - \ No newline at end of file diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py index 2e2217e64c..724ea8dc31 100644 --- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -16,7 +16,6 @@ class InputPreprocessor(Component): def __init__(self, support_dataset_path: str, format: str = "csv", - class_sep: str = ",", *args, **kwargs) -> None: file = Path(support_dataset_path) if file.exists(): @@ -34,22 +33,24 @@ def __init__(self, x = kwargs.get("x", "text") y = kwargs.get('y', 'labels') - multi_label = lambda labels: class_sep in str(labels) - self.support_dataset = [[row[x], str(row[y])] for _, row in df.iterrows() if not multi_label(row[y])] + self.support_dataset = [(row[x], str(row[y])) for _, row in df.iterrows()] else: log.warning("Cannot find {} file".format(support_dataset_path)) def __call__(self, input_texts : List[str]) -> List[List[str]]: # TODO: перепиши на numpy - hypotesis_batch = [] - premise_batch = [] - hypotesis_labels_batch = [] + if self.support_dataset: + hypotesis_batch = [] + 
premise_batch = [] + hypotesis_labels_batch = [] - for [premise, [hypotesis, hypotesis_labels]] in zip(input_texts * len(self.support_dataset), - np.repeat(self.support_dataset, len(input_texts), axis=0)): - premise_batch.append(premise) - hypotesis_batch.append(hypotesis) - hypotesis_labels_batch.append(hypotesis_labels) + for [premise, [hypotesis, hypotesis_labels]] in zip(input_texts * len(self.support_dataset), + np.repeat(self.support_dataset, len(input_texts), axis=0)): + premise_batch.append(premise) + hypotesis_batch.append(hypotesis) + hypotesis_labels_batch.append(hypotesis_labels) - return hypotesis_batch, premise_batch, hypotesis_labels_batch \ No newline at end of file + return hypotesis_batch, premise_batch, hypotesis_labels_batch + else: + log.warning("Error while reading support dataset") \ No newline at end of file diff --git a/few_shot_infer.json b/few_shot_infer.json deleted file mode 100644 index a4154d752c..0000000000 --- a/few_shot_infer.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "dataset_reader": { - "class_name": "basic_classification_reader", - "class_sep": ",", - "x": "text", - "y": "category", - "data_path": "./clinc_5 copy/", - "train": "train.csv", - "valid": "valid.csv", - "test": "test.csv" - }, - "dataset_iterator": { - "class_name": "few_shot_iterator" - }, - "chainer": { - "in": ["hypothesis", "premise", "train_cat"], - "in_y": ["test_cat"], - "pipe": [ - { - "class_name": "torch_transformers_preprocessor", - "vocab_file": "{BASE_MODEL}", - "do_lower_case": false, - "max_seq_length": 128, - "in": ["hypothesis", "premise"], - "out": ["bert_features"] - }, - { - "class_name": "torch_transformers_classifier", - "n_classes": 2, - "return_probas": true, - "pretrained_bert": "{BASE_MODEL}", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "is_binary": "{BINARY_CLASSIFICATION}", - "optimizer": "AdamW", - "optimizer_parameters": { - "lr": 2e-05 - }, - "learning_rate_drop_patience": 3, - "learning_rate_drop_div": 1.5, - "in": ["bert_features"], - "out": ["y_pred_probas"] - }, - { - "in": ["y_pred_probas", "train_cat", "test_cat"], - "out": ["y_pred_cat", "test_cat"], - "class_name": "proba2labels_few_shot", - "is_binary": "{BINARY_CLASSIFICATION}", - "confidence_threshold": 0.8 - } - ], - "out": ["y_pred_cat"] - }, - "train": { - "batch_size": 64, - "metrics": [ - { - "name": "accuracy", - "inputs": [ - "test_cat", - "y_pred_cat" - ], - "exclude_oos": true - }, - { - "name": "oos_scores", - "inputs": [ - "test_cat", - "y_pred_cat"] - } - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_epochs": 1, - "show_examples": false, - "evaluation_targets": ["valid"], - "class_name": "torch_trainer", - "tensorboard_log_dir": "{MODEL_PATH}/", - "pytest_max_batches": 2 - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODEL_PATH": "./glue_nli_binary/", - "BASE_MODEL": "roberta-base", - "BINARY_CLASSIFICATION": true - } - } -} From 45ee6e3bf70df8d47a046d1a739fb28873f7d2cc Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 2 Nov 2022 11:15:12 +0300 Subject: [PATCH 09/57] modified data processing --- deeppavlov/configs/classifiers/dnnc.json | 28 +++++++------ .../configs/classifiers/roberta_nli.json | 25 ++++++------ deeppavlov/core/common/registry.json | 1 + .../dataset_iterators/few_shot_iterator.py | 31 ++++++++------- .../models/classifiers/dnnc_proba2labels.py | 39 +++++++++++++++---- .../models/preprocessors/dnnc_preprocessor.py | 1 - 6 files changed, 78 
insertions(+), 47 deletions(-) diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index bd9fd8773c..9841d1c275 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -5,7 +5,8 @@ "orient": "split", "x": "text", "y": "category", - "data_path": "/archive/savkin/parsed_datasets/clinc150/all&oos/dnnc", + "data_path": "/archive/savkin/parsed_datasets/clinc150/all_oos/dnnc", + "train": "train.json", "valid": "dev.json", "test": "test.json" }, @@ -72,6 +73,7 @@ }, { "class_name": "dnnc_proba2labels", + "is_binary": "{BINARY_CLASSIFICATION}", "in": [ "simmilarity_scores", "x", @@ -82,7 +84,7 @@ "y_pred" ], "multilabel": false, - "confidence_threshold": 0.2 + "confidence_threshold": 0.0 } ], "out": [ @@ -90,8 +92,16 @@ ] }, "train": { - "batch_size": 4, + "batch_size": 5, "metrics": [ + { + "name": "sklearn_accuracy", + "inputs": [ + "y_true", + "y_pred" + ], + "exclude_oos": true + }, { "name": "sklearn_balanced_accuracy", "inputs": [ @@ -110,7 +120,7 @@ ], "show_examples": false, "evaluation_targets": [ - "valid" + "test" ], "class_name": "torch_trainer" }, @@ -121,14 +131,8 @@ "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", - "BINARY_CLASSIFICATION": true, + "BINARY_CLASSIFICATION": false, "BASE_MODEL": "roberta-base" } - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/v1/classifiers/few_shot_infer.tar.gz", - "subdir": "{MODEL_PATH}" - } - ] + } } \ No newline at end of file diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 573ba51aaf..7167d1b17e 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -39,6 +39,15 @@ "do_lower_case": false, "max_seq_length": 128 }, + { + "class_name": "nli_label2ids", + "in": [ + "y_true" + ], + "out": [ + "y_true" + ] + }, { "class_name": "torch_transformers_classifier", "in": [ @@ -63,6 +72,7 @@ }, { "class_name": "nli_proba2labels", + "is_binary": "{BINARY_CLASSIFICATION}", "in": [ "simmilarity_scores" ], @@ -79,11 +89,10 @@ "batch_size": 500, "epochs": 7, "log_every_n_epochs": 1, - "validate_first": false, "validation_patience": -1, "metrics": [ { - "name": "sklearn_balanced_accuracy", + "name": "sklearn_accuracy", "inputs": [ "y_true", "y_pred" @@ -100,16 +109,10 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli", + "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_finetune", "DNNC_PATH": "{MODELS_PATH}/classification/dnnc", "BASE_MODEL": "roberta-base", - "BINARY_CLASSIFICATION": true - } - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/classifiers/few_shot_infer.tar.gz", - "subdir": "{MODEL_PATH}" + "BINARY_CLASSIFICATION": false } - ] + } } \ No newline at end of file diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 561eb59d22..f053459c1c 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -34,6 +34,7 @@ "ner_chunk_model": "deeppavlov.models.entity_extraction.ner_chunker:NerChunkModel", "ner_chunker": "deeppavlov.models.entity_extraction.ner_chunker:NerChunker", "ner_vocab": "deeppavlov.models.preprocessors.ner_preprocessor:NerVocab", + "nli_label2ids": 
"deeppavlov.models.classifiers.dnnc_proba2labels:NLIPLabel2Ids", "nli_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:NLIProba2Labels", "nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer", "nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer", diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index d209c1d88f..ab52ba249c 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -15,20 +15,18 @@ import json from random import Random -from typing import Dict, Any, List, Tuple, Generator, Optional -from collections import defaultdict +from typing import Dict, Any, List, Tuple, Optional from logging import getLogger import numpy as np from overrides import overrides -from pyparsing import null_debug_action +from tqdm import tqdm from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator import wandb - log = getLogger(__name__) @register('few_shot_iterator') @@ -39,16 +37,19 @@ def __init__(self, seed: int = None, shuffle: bool = True, shot: Optional[int] = None, + shot_test: Optional[int] = None, save_path: Optional[str] = None, *args, **kwargs) -> None: self.shot = shot + self.shot_test = shot_test self.shuffle = shuffle self.random = Random(seed) - - train_shot_examples = self._get_shot_examples(data.get('train', [])) + train_shot_examples = self._get_shot_examples(data.get('train', []), self.shot) + valid_shot_examples = self._get_shot_examples(data.get('valid', []), self.shot_test) + test_shot_examples = self._get_shot_examples(data.get('test', []), self.shot_test) self.train = self.preprocess(train_shot_examples, *args, **kwargs) - self.valid = self.preprocess(data.get('valid', []), *args, **kwargs) - self.test = self.preprocess(data.get('test', []), *args, **kwargs) + self.valid = self.preprocess(valid_shot_examples, *args, **kwargs) + self.test = self.preprocess(test_shot_examples, *args, **kwargs) self.data = { 'train': self.train, 'valid': self.valid, @@ -91,13 +92,13 @@ def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple nli_triplets = [] # negative examples - for text, label in data: + for text, label in tqdm(data, desc='Negative examples generation'): for negative_label in label2negative[label]: for negative_example in label2examples[negative_label]: nli_triplets.append([[text, negative_example], 0]) # positive examples - for text, label in data: + for text, label in tqdm(data, desc='Positive examples generation'): for positive_example in label2examples[label]: if positive_example != text: nli_triplets.append([[text, positive_example], 1]) @@ -107,8 +108,8 @@ def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple return nli_triplets - def _get_shot_examples(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: - if self.shot is None: + def _get_shot_examples(self, data, shot): + if shot is None: return data # shuffle data to select shot-examples @@ -120,11 +121,11 @@ def _get_shot_examples(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any data_dict[label] = [] for text, label in data: - if len(data_dict[label]) < self.shot: + if len(data_dict[label]) < shot: data_dict[label].append(text) - if max(len(x) for x in data_dict.values()) < self.shot: - log.warning(f"Some labels have less than \"shot\"={self.shot} examples") + if max(len(x) 
for x in data_dict.values()) < shot: + log.warning(f"Some labels have less than \"shot\"={shot} examples") new_data = [] for label in data_dict.keys(): diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index 6af0ba2f97..b6af99f0bb 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -12,19 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -from doctest import Example from logging import getLogger from typing import List, Union import numpy as np -import torch -from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component log = getLogger(__name__) +def preprocess_scores(scores, is_binary, class_id: int = 1): + scores = np.array(scores) + return scores if is_binary else scores[:, class_id] + @register('dnnc_proba2labels') class Proba2Labels(Component): @@ -32,11 +33,14 @@ def __init__(self, confidence_threshold: float = 0.8, pooling: str = 'mean', multilabel: bool = False, + is_binary: bool = False, **kwargs) -> None: self.confidence_threshold = confidence_threshold self.pooling = pooling self.multilabel = multilabel + self.is_binary = is_binary + def __call__(self, simmilarity_scores: Union[np.ndarray, List[List[float]], List[List[int]]], @@ -46,13 +50,13 @@ def __call__(self, *args, **kwargs): y_pred = [] - simmilarity_scores = np.array(simmilarity_scores) + simmilarity_scores = preprocess_scores(simmilarity_scores, self.is_binary) x = np.array(x) x_populated = np.array(x_populated) y_support = np.array(y_support) unique_labels = np.unique(y_support) - for example in x: + for example in x: example_mask = np.where(x_populated == example) example_simmilarity_scores = simmilarity_scores[example_mask] example_y_support = y_support[example_mask] @@ -79,14 +83,33 @@ def __call__(self, prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label y_pred.append(prediction) + return y_pred @register('nli_proba2labels') class NLIProba2Labels(Component): + def __init__(self, + is_binary: bool = False, + **kwargs): + self.is_binary = is_binary + + def __call__(self, + simmilarity_scores, + *args, **kwargs): + simmilarity_scores = preprocess_scores(simmilarity_scores, self.is_binary) + labels = (simmilarity_scores > 0.5).astype(int) + return labels + + +@register('nli_label2ids') +class NLIPLabel2Ids(Component): def __init__(self, **kwargs): pass - def __call__(self, simmilarity_scores, *args, **kwargs): - # print("SCORES = ", simmilarity_scores) - return (np.array(simmilarity_scores) > 0.5).astype(int) \ No newline at end of file + def __call__(self, + y_true, + *args, **kwargs): + + y_ids = np.array([int(label) for label in y_true]) + return y_ids diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py index 724ea8dc31..4e26f5c5de 100644 --- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -39,7 +39,6 @@ def __init__(self, def __call__(self, input_texts : List[str]) -> List[List[str]]: - # TODO: перепиши на numpy if self.support_dataset: hypotesis_batch = [] premise_batch = [] From 4209b055d6913c6f05b7cd05931879adff6fdcc9 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 2 Nov 2022 11:55:11 +0300 Subject: [PATCH 10/57] fix imports --- 
deeppavlov/dataset_iterators/few_shot_iterator.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index ab52ba249c..384bb9e740 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -20,13 +20,10 @@ import numpy as np from overrides import overrides -from tqdm import tqdm from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator -import wandb - log = getLogger(__name__) @register('few_shot_iterator') @@ -92,13 +89,13 @@ def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple nli_triplets = [] # negative examples - for text, label in tqdm(data, desc='Negative examples generation'): + for text, label in data: for negative_label in label2negative[label]: for negative_example in label2examples[negative_label]: nli_triplets.append([[text, negative_example], 0]) # positive examples - for text, label in tqdm(data, desc='Positive examples generation'): + for text, label in data: for positive_example in label2examples[label]: if positive_example != text: nli_triplets.append([[text, positive_example], 1]) From bbc83b7595117e1949c569c7e78128e234c4c744 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Fri, 4 Nov 2022 15:10:34 +0300 Subject: [PATCH 11/57] change nli labels to strings --- deeppavlov/configs/classifiers/dnnc.json | 3 +-- .../configs/classifiers/roberta_nli.json | 13 ++++++------ .../core/common/requirements_registry.json | 13 ++++++++++++ .../dataset_iterators/few_shot_iterator.py | 7 +++++-- deeppavlov/metrics/few_shot_metrics.py | 13 ------------ .../models/classifiers/dnnc_proba2labels.py | 20 ++++++++++++++----- .../models/preprocessors/dnnc_preprocessor.py | 5 ++++- deeppavlov/requirements/sklearn.txt | 1 + 8 files changed, 46 insertions(+), 29 deletions(-) create mode 100644 deeppavlov/requirements/sklearn.txt diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index 9841d1c275..977a35de43 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -5,7 +5,7 @@ "orient": "split", "x": "text", "y": "category", - "data_path": "/archive/savkin/parsed_datasets/clinc150/all_oos/dnnc", + "data_path": "/archive/savkin/parsed_datasets/clinc150/banking_oos/dnnc", "train": "train.json", "valid": "dev.json", "test": "test.json" @@ -76,7 +76,6 @@ "is_binary": "{BINARY_CLASSIFICATION}", "in": [ "simmilarity_scores", - "x", "x_populated", "y_support" ], diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 7167d1b17e..9a9f57d17b 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -5,7 +5,7 @@ "orient": "split", "x": "text", "y": "category", - "data_path": "/archive/savkin/parsed_datasets/clinc150/all/dnnc", + "data_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc", "train": "train.json", "valid": "dev.json", "test": "test.json" @@ -45,7 +45,7 @@ "y_true" ], "out": [ - "y_true" + "y_ids" ] }, { @@ -54,7 +54,7 @@ "bert_features" ], "in_y": [ - "y_true" + "y_ids" ], "out": [ "simmilarity_scores" @@ -86,9 +86,10 @@ ] }, "train": { - "batch_size": 500, + "batch_size": 400, "epochs": 7, "log_every_n_epochs": 1, + "val_every_n_epochs": 1, "validation_patience": -1, "metrics": [ { @@ -100,7 
+101,7 @@ } ], "evaluation_targets": [ - "valid" + "train" ], "class_name": "torch_trainer" }, @@ -109,7 +110,7 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_finetune", + "MODEL_PATH": "{MODELS_PATH}/classification/empty", "DNNC_PATH": "{MODELS_PATH}/classification/dnnc", "BASE_MODEL": "roberta-base", "BINARY_CLASSIFICATION": false diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index d65eba771e..a0ea5b01cf 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ b/deeppavlov/core/common/requirements_registry.json @@ -63,6 +63,19 @@ "russian_words_vocab": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], + "sklearn_accuracy": [ + "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" + ], + "sklearn_balanced_accuracy": [ + "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" + ], + "oos_scores": [ + "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" + ], + "sklearn_precision_recall_fscore_support": [ + "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" + ], + "slovnet_syntax_parser": [ "{DEEPPAVLOV_PATH}/requirements/slovnet.txt" ], diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index 384bb9e740..febdd0f594 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -24,6 +24,9 @@ from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +ENTAILMENT = 'entailment' +NON_ENTAILMENT = 'non_entailment' + log = getLogger(__name__) @register('few_shot_iterator') @@ -92,13 +95,13 @@ def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple for text, label in data: for negative_label in label2negative[label]: for negative_example in label2examples[negative_label]: - nli_triplets.append([[text, negative_example], 0]) + nli_triplets.append([[text, negative_example], NON_ENTAILMENT]) # positive examples for text, label in data: for positive_example in label2examples[label]: if positive_example != text: - nli_triplets.append([[text, positive_example], 1]) + nli_triplets.append([[text, positive_example], ENTAILMENT]) if self.shuffle: self.random.shuffle(nli_triplets) diff --git a/deeppavlov/metrics/few_shot_metrics.py b/deeppavlov/metrics/few_shot_metrics.py index a084c7f197..11038098dd 100644 --- a/deeppavlov/metrics/few_shot_metrics.py +++ b/deeppavlov/metrics/few_shot_metrics.py @@ -46,14 +46,6 @@ def balanced_accuracy(y_true, y_pred, exclude_oos: bool = False) -> float: return balanced_accuracy_score(y_true, y_pred) -@register_metric('joint_accuracy_in_recall_oos') -def joint_accuracy_in_recall_oos(y_true, y_pred) -> float: - return accuracy(y_true, y_pred, exclude_oos=True) + oos_scores(y_true, y_pred)["recall"] - -@register_metric('sklearn_precision_recall_fscore_support') -def recision_recall_fscore(*args, **kwargs): - return precision_recall_fscore_support(*args, **kwargs) - @register_metric('oos_scores') def oos_scores(y_true, y_pred): @@ -61,8 +53,3 @@ def oos_scores(y_true, y_pred): y_pred_binary = (np.array(y_pred) == "oos") scores = precision_recall_fscore_support(y_true_binary, y_pred_binary, average='binary') return dict(zip(["precision", "recall", "fbeta_score"], scores[:3])) - - -@register_metric('sklearn_classification_report') -def report(*args, **kwargs): - return 
classification_report(output_dict=True, *args, **kwargs) diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index b6af99f0bb..10736c1659 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -20,6 +20,9 @@ from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component +ENTAILMENT = 'entailment' +NON_ENTAILMENT = 'non_entailment' + log = getLogger(__name__) def preprocess_scores(scores, is_binary, class_id: int = 1): @@ -44,19 +47,20 @@ def __init__(self, def __call__(self, simmilarity_scores: Union[np.ndarray, List[List[float]], List[List[int]]], - x: List[str], x_populated: List[str], y_support: List[str], *args, **kwargs): y_pred = [] + simmilarity_scores = preprocess_scores(simmilarity_scores, self.is_binary) - x = np.array(x) x_populated = np.array(x_populated) y_support = np.array(y_support) + + unique_texts = np.unique(x_populated) unique_labels = np.unique(y_support) - for example in x: + for example in unique_texts: example_mask = np.where(x_populated == example) example_simmilarity_scores = simmilarity_scores[example_mask] example_y_support = y_support[example_mask] @@ -98,7 +102,9 @@ def __call__(self, simmilarity_scores, *args, **kwargs): simmilarity_scores = preprocess_scores(simmilarity_scores, self.is_binary) - labels = (simmilarity_scores > 0.5).astype(int) + + labels = np.array([NON_ENTAILMENT] * len(simmilarity_scores), dtype="object") + labels[simmilarity_scores > 0.5] = ENTAILMENT return labels @@ -111,5 +117,9 @@ def __call__(self, y_true, *args, **kwargs): - y_ids = np.array([int(label) for label in y_true]) + label2id = { + NON_ENTAILMENT: 0, + ENTAILMENT: 1 + } + y_ids = np.array([label2id[label] for label in y_true]) return y_ids diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py index 4e26f5c5de..0ef614df02 100644 --- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -39,6 +39,9 @@ def __init__(self, def __call__(self, input_texts : List[str]) -> List[List[str]]: + ''' + Generates all possible ordread pairs from 'input_texts' and 'self.support_dataset' + ''' if self.support_dataset: hypotesis_batch = [] premise_batch = [] @@ -52,4 +55,4 @@ def __call__(self, return hypotesis_batch, premise_batch, hypotesis_labels_batch else: - log.warning("Error while reading support dataset") \ No newline at end of file + log.warning("Error: no support dataset") diff --git a/deeppavlov/requirements/sklearn.txt b/deeppavlov/requirements/sklearn.txt new file mode 100644 index 0000000000..ff88936c77 --- /dev/null +++ b/deeppavlov/requirements/sklearn.txt @@ -0,0 +1 @@ +scikit-learn \ No newline at end of file From 52e378f65f1fa9e28baa23e02ba22be6f1d7e0a4 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 16 Nov 2022 16:47:55 +0300 Subject: [PATCH 12/57] add documentation --- .../few_shot_intent_classification.ipynb | 513 ++++++++++++++++++ 1 file changed, 513 insertions(+) create mode 100644 docs/features/models/few_shot_intent_classification.ipynb diff --git a/docs/features/models/few_shot_intent_classification.ipynb b/docs/features/models/few_shot_intent_classification.ipynb new file mode 100644 index 0000000000..4f01d2f199 --- /dev/null +++ b/docs/features/models/few_shot_intent_classification.ipynb @@ -0,0 +1,513 @@ +{ + "cells": [ + { + "cell_type": "markdown", 
+ "metadata": {},
+ "source": [
+ "# Table of contents \n",
+ "\n",
+ "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
+ "\n",
+ "2. [Dataset](#2.-Dataset)\n",
+ "\n",
+ " 2.1 [OOS examples](#2.1-OOS-examples)\n",
+ " \n",
+ " 2.2 [Dataset format](#2.2-Dataset-format)\n",
+ "\n",
+ "3. [Model architecture](#3.-Model-architecture)\n",
+ "\n",
+ "4. [Get started with the model](#4.-Get-started-with-the-model)\n",
+ " \n",
+ " 4.1 [Installation](#4.1-Installation)\n",
+ "\n",
+ " 4.2 [Support dataset configuration](#4.2-Support-dataset-configuration)\n",
+ "\n",
+ "5. [Use the model for prediction](#5.-Use-the-model-for-prediction)\n",
+ "\n",
+ " 5.1 [Predict using Python](#5.1-Predict-using-Python)\n",
+ " \n",
+ " 5.2 [Predict using CLI](#5.2-Predict-using-CLI)\n",
+ " \n",
+ "6. [Train the model on your data](#6.-Train-the-model-on-your-data)\n",
+ " \n",
+ " 6.1. [Train your model from Python](#6.1-Train-your-model-from-Python)\n",
+ " \n",
+ " 6.2. [Train your model from CLI](#6.2-Train-your-model-from-CLI)\n",
+ " \n",
+ "7. [Evaluate](#7.-Evaluate)\n",
+ " \n",
+ " 7.1. [Evaluate from Python](#7.1-Evaluate-from-Python)\n",
+ " \n",
+ " 7.2. [Evaluate from CLI](#7.2-Evaluate-from-CLI)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 1. Introduction to the task"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "__Intent classification__ is the task of identifying a speaker's intent given an utterance, where the intent is one of N classes or \"oos\". We consider the few-shot setting, where only a few examples (5 or 10) per intent class are given as a training set.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 2. Dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We consider the following dataset:\n",
+ "\n",
+ "- [CLINC150](https://paperswithcode.com/dataset/clinc150) - covers 10 different domains with 150 intents in total and 100 examples per intent. The dataset simulates a setting where a model has to handle many different services with a wide variety of intents\n",
+ "\n",
+ "Specifically, we validate our model on the CLINC150 split from the original DNNC paper. The dataset is downloaded from the DNNC GitHub page https://github.com/salesforce/DNNC-few-shot-intent and then parsed to match the [format below](#2.2-Dataset-format)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.1 OOS examples"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "CLINC150 also provides 1000 out-of-scope (OOS) examples, which don't belong to any domain or intent"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2.2 Dataset format"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The train, dev and test sets are separate files with the following format\n",
+ "\n",
+ "```\n",
+ "{\n",
+ " \"columns\": [\n",
+ " \"text\",\n",
+ " \"category\"\n",
+ " ],\n",
+ "\n",
+ " \"data\": [\n",
+ "\n",
+ " [\n",
+ " \"text\",\n",
+ " \"intent_class\"\n",
+ " ],\n",
+ "\n",
+ " ...\n",
+ " ]\n",
+ "}\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 3. Model architecture"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The typical methodology is to embed each example into a vector space and use an off-the-shelf distance metric to perform a similarity search. However, such text embeddings do not discriminate the OOS examples well enough.\n",
+ "\n",
+ "The proposed method instead models fine-grained relations of utterance pairs via pairwise similarity:\n",
+ "\n",
+ "$h = \mathrm{BERT}([[CLS], u, [SEP], e_{j,i}, [SEP]]) \in \mathbb{R}^d$\n",
+ "\n",
+ "$S(u, e_{j,i}) = \sigma(W h + b) \in \mathbb{R}$, where $e_{j,i} \in E$ is an example from the training set, $W \in \mathbb{R}^{1 \times d}$ and $b \in \mathbb{R}$\n",
+ "\n",
+ "To mitigate data scarcity in few-shot learning, DNNC uses knowledge transfer from the NLI task. We pretrain roberta-base on a combination of three NLI datasets: SNLI, WNLI and MNLI"
+ ]
+ },
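+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To make the scoring rule concrete, below is a minimal, self-contained sketch of the pairwise scorer. It assumes a plain Hugging Face `roberta-base` encoder with a randomly initialized head, whereas the released model loads weights fine-tuned on NLI data, so the score here is illustrative only:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch of S(u, e) = sigma(W h + b); the weights below are untrained\n",
+ "import torch\n",
+ "from transformers import AutoModel, AutoTokenizer\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('roberta-base')\n",
+ "encoder = AutoModel.from_pretrained('roberta-base')\n",
+ "head = torch.nn.Linear(encoder.config.hidden_size, 1)  # W, b\n",
+ "\n",
+ "def score(utterance, example):\n",
+ "    # encode the pair as one sequence; h is the [CLS] representation\n",
+ "    features = tokenizer(utterance, example, return_tensors='pt')\n",
+ "    h = encoder(**features).last_hidden_state[:, 0]\n",
+ "    return torch.sigmoid(head(h)).item()\n",
+ "\n",
+ "score('can you find me a good reviewed hotel in japan', 'find me a hotel with good reviews')"
+ ]
+ },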
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 4. Get started with the model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.1 Installation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First make sure you have the DeepPavlov Library installed.\n",
+ "[More info about the first installation](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -q deeppavlov"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Then make sure that all the required packages for the model are installed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!python -m deeppavlov install dnnc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`dnnc` here is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
+ "\n",
+ "The configuration file defines the model and describes its hyperparameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4.2 Support dataset configuration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before making predictions or running evaluation, you need to set the path to your support dataset, which will be used for predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from deeppavlov.core.commands.utils import parse_config\n",
+ "\n",
+ "model_config = parse_config('dnnc')\n",
+ "\n",
+ "# support dataset used for predictions\n",
+ "print(model_config['chainer']['pipe'][0]['support_dataset_path'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Off-the-shelf prediction"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The base model was already pre-trained to recognize similar utterances, so you can use the off-the-shelf model for prediction and evaluation. No additional training is needed."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### OOS prediction"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Out-of-scope (OOS) examples are detected via the `confidence_threshold` parameter. The higher the threshold, the more often the model predicts the OOS class. By default it is set to 0.5. You can change it to your preference in the configuration file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from deeppavlov.core.commands.utils import parse_config\n",
+ "\n",
+ "model_config = parse_config('dnnc')\n",
+ "\n",
+ "# confidence threshold for OOS detection\n",
+ "print(model_config['chainer']['pipe'][-1]['confidence_threshold'])"
+ ]
+ },
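+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The snippet below is a minimal sketch, not the DeepPavlov implementation itself, of how this thresholding works: pairwise scores are pooled per intent, and the best-scoring intent is returned only if it clears the threshold. The scores and intent names are made up for illustration:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "def predict_label(scores, labels, confidence_threshold=0.5, pooling='mean'):\n",
+ "    # scores[i] = similarity of the input to support example i; labels[i] = its intent\n",
+ "    scores, labels = np.array(scores), np.array(labels)\n",
+ "    pool = np.mean if pooling == 'mean' else np.max\n",
+ "    unique = np.unique(labels)\n",
+ "    per_label = np.array([pool(scores[labels == label]) for label in unique])\n",
+ "    best = per_label.argmax()\n",
+ "    return unique[best] if per_label[best] >= confidence_threshold else 'oos'\n",
+ "\n",
+ "predict_label([0.9, 0.2, 0.4], ['book_hotel', 'book_hotel', 'travel_alert'])"
+ ]
+ },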
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 5. Use the model for prediction"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.1 Predict using Python"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from deeppavlov import build_model\n",
+ "\n",
+ "model = build_model(\"dnnc\", download=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model([\"can you find me a good reviewed hotel in japan\", \"if i get a visa can i travel to japan\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.2 Predict using CLI"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also get predictions in an interactive mode through CLI."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!python -m deeppavlov interact dnnc -d"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Or make predictions for samples from *stdin*."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!python -m deeppavlov predict dnnc -f <file_name>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 6. Train the model on your data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We have a separate \"roberta_nli\" config for training, which automatically transforms the dataset into a pairwise NLI format (see the sketch after this section)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To train the model on your data, you need to change the path to the dataset in the \"roberta_nli\" config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from deeppavlov.core.commands.utils import parse_config\n",
+ "\n",
+ "model_config = parse_config('roberta_nli')\n",
+ "\n",
+ "# dataset for training\n",
+ "print(model_config['dataset_reader']['data_path'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.1 Train your model from Python"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from deeppavlov import train_model\n",
+ "\n",
+ "model = train_model(\"roberta_nli\", download=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6.2 Train your model from CLI"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!python -m deeppavlov train roberta_nli"
+ ]
+ },
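+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For illustration, here is a simplified sketch of the pair generation that `few_shot_iterator` performs. The utterances and intents below are made up, and the real iterator additionally subsamples `shot` examples per intent and shuffles the result:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# same-intent pairs become 'entailment', cross-intent pairs 'non_entailment'\n",
+ "data = [('what is my balance', 'check_balance'),\n",
+ "        ('how much money do i have', 'check_balance'),\n",
+ "        ('book a table for two', 'restaurant')]\n",
+ "\n",
+ "pairs = []\n",
+ "for text, label in data:\n",
+ "    for other_text, other_label in data:\n",
+ "        if text == other_text:\n",
+ "            continue\n",
+ "        tag = 'entailment' if label == other_label else 'non_entailment'\n",
+ "        pairs.append(((text, other_text), tag))\n",
+ "\n",
+ "pairs"
+ ]
+ },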
Evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To evaluate the model on your data, you need to change the path to the dataset in \"dnnc\" config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov.core.commands.utils import parse_config\n", + "\n", + "model_config = parse_config('dnnc')\n", + "\n", + "# dataset for evaluation\n", + "print(model_config['dataset_reader']['data_path'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7.1 Evaluate from Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deeppavlov import evaluate_model\n", + "\n", + "model = evaluate_model('dnnc', download=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7.2 Evaluate from CLI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python -m deeppavlov evaluate dnnc -d" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 ('dnnc_cuda11_env': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d267cc100fe706c63aefa7fd2da1b610b862ac822a3924399ae410740f5c813e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From be99c316eaad6fbf849063d037ea39bcf38dd65d Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 5 Dec 2022 12:45:40 +0300 Subject: [PATCH 13/57] fix conversion of labels to ids and ids to labels --- deeppavlov/configs/classifiers/dnnc.json | 9 +-- .../configs/classifiers/roberta_nli.json | 64 ++++++++++++------- deeppavlov/core/common/registry.json | 2 - .../models/classifiers/dnnc_proba2labels.py | 43 +------------ 4 files changed, 48 insertions(+), 70 deletions(-) diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index 977a35de43..b2acc2de40 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -37,7 +37,7 @@ "y": "category", "format": "json", "orient": "split", - "support_dataset_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json" + "support_dataset_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json", }, { "class_name": "torch_transformers_preprocessor", @@ -76,6 +76,7 @@ "is_binary": "{BINARY_CLASSIFICATION}", "in": [ "simmilarity_scores", + "x", "x_populated", "y_support" ], @@ -83,7 +84,7 @@ "y_pred" ], "multilabel": false, - "confidence_threshold": 0.0 + "confidence_threshold": 0.5 } ], "out": [ @@ -91,7 +92,7 @@ ] }, "train": { - "batch_size": 5, + "batch_size": 1, "metrics": [ { "name": "sklearn_accuracy", @@ -130,7 +131,7 @@ "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", - "BINARY_CLASSIFICATION": false, + "BINARY_CLASSIFICATION": true, "BASE_MODEL": "roberta-base" } } diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 9a9f57d17b..527b93aa1d 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ 
b/deeppavlov/configs/classifiers/roberta_nli.json @@ -14,23 +14,23 @@ "class_name": "few_shot_iterator", "seed": 0, "shuffle": true, - "shot": 10, + "shot": 5, "save_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json" }, "chainer": { "in": [ - "hypothesis", - "premise" + "premise", + "hypothesis" ], "in_y": [ - "y_true" + "y_true_labels" ], "pipe": [ { "class_name": "torch_transformers_preprocessor", "in": [ - "hypothesis", - "premise" + "premise", + "hypothesis" ], "out": [ "bert_features" @@ -40,26 +40,32 @@ "max_seq_length": 128 }, { - "class_name": "nli_label2ids", + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": [ + "y_true_labels" + ], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", "in": [ - "y_true" + "y_true_labels" ], "out": [ - "y_ids" + "y_true_ids" ] }, { "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", "in": [ "bert_features" ], "in_y": [ - "y_ids" + "y_true_ids" ], "out": [ - "simmilarity_scores" + "y_pred_probas" ], - "n_classes": 2, "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{DNNC_PATH}/model", @@ -71,37 +77,47 @@ } }, { - "class_name": "nli_proba2labels", - "is_binary": "{BINARY_CLASSIFICATION}", "in": [ - "simmilarity_scores" + "y_pred_probas" + ], + "out": [ + "y_pred_ids" + ], + "class_name": "proba2labels", + "max_proba": true + }, + { + "ref": "classes_vocab", + "in": [ + "y_pred_ids" ], "out": [ - "y_pred" + "y_pred_labels" ] } ], "out": [ - "y_pred" + "y_pred_labels" ] }, "train": { - "batch_size": 400, + "batch_size": 370, "epochs": 7, "log_every_n_epochs": 1, - "val_every_n_epochs": 1, + "val_every_n_epochs": 7, "validation_patience": -1, + "validate_first": false, "metrics": [ { "name": "sklearn_accuracy", "inputs": [ - "y_true", - "y_pred" + "y_true_labels", + "y_pred_labels" ] } ], "evaluation_targets": [ - "train" + "valid" ], "class_name": "torch_trainer" }, @@ -110,10 +126,10 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/empty", + "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", "DNNC_PATH": "{MODELS_PATH}/classification/dnnc", "BASE_MODEL": "roberta-base", - "BINARY_CLASSIFICATION": false + "BINARY_CLASSIFICATION": true } } } \ No newline at end of file diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index f053459c1c..27fd236482 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -34,8 +34,6 @@ "ner_chunk_model": "deeppavlov.models.entity_extraction.ner_chunker:NerChunkModel", "ner_chunker": "deeppavlov.models.entity_extraction.ner_chunker:NerChunker", "ner_vocab": "deeppavlov.models.preprocessors.ner_preprocessor:NerVocab", - "nli_label2ids": "deeppavlov.models.classifiers.dnnc_proba2labels:NLIPLabel2Ids", - "nli_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:NLIProba2Labels", "nltk_moses_tokenizer": "deeppavlov.models.tokenizers.nltk_moses_tokenizer:NLTKMosesTokenizer", "nltk_tokenizer": "deeppavlov.models.tokenizers.nltk_tokenizer:NLTKTokenizer", "nn_trainer": "deeppavlov.core.trainers.nn_trainer:NNTrainer", diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index 10736c1659..b1a1748276 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -20,9 
+20,6 @@ from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component -ENTAILMENT = 'entailment' -NON_ENTAILMENT = 'non_entailment' - log = getLogger(__name__) def preprocess_scores(scores, is_binary, class_id: int = 1): @@ -47,6 +44,7 @@ def __init__(self, def __call__(self, simmilarity_scores: Union[np.ndarray, List[List[float]], List[List[int]]], + x: List[str], x_populated: List[str], y_support: List[str], *args, @@ -57,10 +55,9 @@ def __call__(self, x_populated = np.array(x_populated) y_support = np.array(y_support) - unique_texts = np.unique(x_populated) unique_labels = np.unique(y_support) - for example in unique_texts: + for example in x: example_mask = np.where(x_populated == example) example_simmilarity_scores = simmilarity_scores[example_mask] example_y_support = y_support[example_mask] @@ -88,38 +85,4 @@ def __call__(self, y_pred.append(prediction) - return y_pred - - -@register('nli_proba2labels') -class NLIProba2Labels(Component): - def __init__(self, - is_binary: bool = False, - **kwargs): - self.is_binary = is_binary - - def __call__(self, - simmilarity_scores, - *args, **kwargs): - simmilarity_scores = preprocess_scores(simmilarity_scores, self.is_binary) - - labels = np.array([NON_ENTAILMENT] * len(simmilarity_scores), dtype="object") - labels[simmilarity_scores > 0.5] = ENTAILMENT - return labels - - -@register('nli_label2ids') -class NLIPLabel2Ids(Component): - def __init__(self, **kwargs): - pass - - def __call__(self, - y_true, - *args, **kwargs): - - label2id = { - NON_ENTAILMENT: 0, - ENTAILMENT: 1 - } - y_ids = np.array([label2id[label] for label in y_true]) - return y_ids + return y_pred \ No newline at end of file From 612f08a1af7cb6f6580d574fb89faf3a0d1f2152 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 5 Dec 2022 16:17:37 +0300 Subject: [PATCH 14/57] binary head dropout fix --- .../basic_classification_reader.py | 2 +- .../torch_bert/torch_transformers_classifier.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index b0a069c917..c354d2dc11 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -60,7 +60,7 @@ def read(self, data_path: str, url: str = None, data_types = ["train", "valid", "test"] train_file = kwargs.get('train', 'train.csv') - print("PATH = ", Path(data_path, train_file)) + if not Path(data_path, train_file).exists(): if url is None: raise Exception( diff --git a/deeppavlov/models/torch_bert/torch_transformers_classifier.py b/deeppavlov/models/torch_bert/torch_transformers_classifier.py index ac1faa4455..9158af5ce5 100644 --- a/deeppavlov/models/torch_bert/torch_transformers_classifier.py +++ b/deeppavlov/models/torch_bert/torch_transformers_classifier.py @@ -68,6 +68,7 @@ def __init__(self, n_classes, bert_config_file: Optional[str] = None, is_binary: Optional[bool] = False, num_special_tokens: int = None, + binary_head_dropout: int = 0.1, **kwargs) -> None: if not optimizer_parameters: @@ -88,6 +89,7 @@ def __init__(self, n_classes, self.is_binary = is_binary self.bert_config = None self.num_special_tokens = num_special_tokens + self.binary_head_dropout = binary_head_dropout if self.multilabel and not self.one_hot_labels: raise RuntimeError('Use one-hot encoded labels for multilabel classification!') @@ -204,7 +206,7 @@ def load(self, fname=None): if 
self.pretrained_bert: log.info(f"From pretrained {self.pretrained_bert}.") config = AutoConfig.from_pretrained(self.pretrained_bert, - # num_labels=self.n_classes, + classifier_dropout=self.binary_head_dropout, output_attentions=False, output_hidden_states=False) @@ -213,9 +215,6 @@ def load(self, fname=None): self.model = AutoModelForBinaryClassification(self.pretrained_bert, config) else: self.model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_bert, config=config) - - # TODO need a better solution here and at - # deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel.load try: hidden_size = self.model.classifier.out_proj.in_features @@ -270,8 +269,7 @@ def __init__(self, pretrained_bert, config): self.pretrained_bert = pretrained_bert self.config = config -# self.model = AutoModel.from_pretrained(self.pretrained_bert, self.config) - self.model = AutoModel.from_pretrained(self.pretrained_bert) + self.model = AutoModel.from_pretrained(self.pretrained_bert, self.config) self.classifier = BinaryClassificationHead(config) self.classifier.init_weights() @@ -292,8 +290,8 @@ def forward(self, outputs = self.model(input_ids, attention_mask=attention_mask, - #token_type_ids=token_type_ids, - #position_ids=position_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, @@ -324,8 +322,7 @@ def __init__(self, config): self.config = config self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size) - #self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) - self.dropout = torch.nn.Dropout(0.1) + self.dropout = torch.nn.Dropout(config.classifier_dropout) self.out_proj = torch.nn.Linear(config.hidden_size, 1) def init_weights(self): From 50b78756ad48299523c44df8574546331b98f073 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 5 Dec 2022 16:30:30 +0300 Subject: [PATCH 15/57] fix few-shot dos --- .../few_shot_intent_classification.ipynb | 68 ++++++++----------- docs/index.rst | 1 + 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/docs/features/models/few_shot_intent_classification.ipynb b/docs/features/models/few_shot_intent_classification.ipynb index 4f01d2f199..4e31dfd1d0 100644 --- a/docs/features/models/few_shot_intent_classification.ipynb +++ b/docs/features/models/few_shot_intent_classification.ipynb @@ -9,10 +9,8 @@ "1. [Introduction to the task](#1.-Introduction-to-the-task)\n", "\n", "2. [Dataset](#2.-Datasets)\n", - "\n", - " 2.1 [OOS examples](#2.1-OOS-examples)\n", " \n", - " 2.2 [Datasets format](#2.2-Datasets-format)\n", + " 2.1 [Datasets format](#2.1-Datasets-format)\n", "\n", "3. [Model architecture](#3.-Model-architecture)\n", "\n", @@ -54,7 +52,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "__Intent classification__ is a task of identifying speaker's intent given an utterance, where intent is one of N classes or \"oos\". We consider few-shot setting, where only few examples (5 or 10) per intent class are given as a training set.\n" + "__Intent classification__ is a task of identifying speaker's intent given an utterance, where intent is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). 
We consider few-shot setting, where only few examples (5 or 10) per intent class are given as a training set.\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We consider the following dataset:\n",
-    "\n",
-    "- [CLINC150](https://paperswithcode.com/dataset/clinc150>) - has 10 different domains with 150 intents total and 100 shots per intent. This dataset simulates a setting, where model has to handle many different services with wide variety of intents\n",
+    "In our experiments we used the [CLINC150](https://paperswithcode.com/dataset/clinc150) dataset, which has 10 different domains with 15 intents each, 100 shots per intent class and 1000 OOS examples. It simulates a setting where the model has to handle many different services with a wide variety of intents.\n",
     "\n",
-    "Specifically, we validate our model on CLINC150 from original DNNC paper. Dataset is downloaded from dnnc github page https://github.com/salesforce/DNNC-few-shot-intent and then parsed to match the [format below](#22-dataset-format)"
+    "Specifically, we validate our model on CLINC150 from the original DNNC paper. We parsed it to match the format described [below](#21-dataset-format). The original dataset can be downloaded from the DNNC [github page](https://github.com/salesforce/DNNC-few-shot-intent)."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 2.1 OOS examples"
+    "## 2.1 Datasets format"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "CLINC150 also provides 1000 OOS examples, which don't belong to any domain or intent"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2.2 Dataset format"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Train, dev and test set are separate files with the following format\n",
+    "Train, dev and test sets are separate JSON files with the following format\n",
     "\n",
     "```\n",
     "{\n",
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The typical methodology is to embed each example into a vector space and use an off-the-shelf distance metric to perform a similarity search. However, the text embedding methods do not discriminate the OOS examples well enough.\n",
+    "The typical methodology of few-shot intent classification is to embed each example into a vector space and use an off-the-shelf distance metric to perform a similarity search. However, the text embedding methods do not discriminate the OOS examples well enough.\n",
     "\n",
     "\n",
-    "The proposed method instead models fine-grained relations of utterance pairs via pairwise similarity:\n",
+    "The DNNC authors suggest modeling fine-grained relations of utterance pairs via pairwise similarity:\n",
     "\n",
     "$h = BERT([[CLS], u, [SEP], e_{j,i}, [SEP]]) \\in \\mathbb{R}^d$\n",
     "\n",
     "$S(u, e_{j,i}) = \\sigma(W \\cdot h + b) \\in \\mathbb{R}$, where $e_{j,i} \\in E$ is an example from the training set, $W \\in \\mathbb{R}^{1 \\times d}$, $b \\in \\mathbb{R}$\n",
     "\n",
-    "To mitigate the data scarcity of the few-shot setting, DNNC uses knowledge transfer from the NLI task. We pretrain roberta-base on a combination of 3 NLI datasets: SNLI, WNLI, MNLI"
+    "To mitigate the data scarcity of the few-shot setting, DNNC uses knowledge transfer from the NLI task. We pretrain [roberta-base](https://huggingface.co/roberta-base) on a combination of 3 NLI datasets: SNLI, WNLI, MNLI."
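    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the pairwise formulation above concrete, here is a minimal sketch of the pairing step: every input utterance is combined with every support example, and each pair is later scored by the NLI model as $[CLS], u, [SEP], e, [SEP]$. This is an illustration only, with hypothetical names, not DeepPavlov's actual API:\n",
+    "\n",
+    "```python\n",
+    "# toy support set of (utterance, intent) examples, for illustration only\n",
+    "support_set = [('can you book me a room', 'book_hotel'),\n",
+    "               ('do i need a visa for japan', 'international_visa')]\n",
+    "\n",
+    "def make_pairs(utterance, support_set):\n",
+    "    # one (premise, hypothesis) pair per support example;\n",
+    "    # each pair gets a similarity score from the NLI model\n",
+    "    return [(utterance, example) for example, label in support_set]\n",
+    "\n",
+    "print(make_pairs('find me a hotel in japan', support_set))\n",
+    "```"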
] }, { @@ -164,7 +146,7 @@ "metadata": {}, "source": [ "First make sure you have the DeepPavlov Library installed.\n", - "[More info about the first installation](http://docs.deeppavlov.ai/en/master/intro/installation.html)" + "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)" ] }, { @@ -173,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install --q deeppavlov" + "!pip install deeppavlov" ] }, { @@ -196,7 +178,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`dnnc` here is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", + "`dnnc` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", "\n", "Configuration file defines the model and describes it's hyperparameters" ] @@ -212,7 +194,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Before making prediction or evaluation you need to set path to your support dataset, which will be used for predictions" + "Before making predictions or evaluation you need to set path to your support dataset. DNNC model compares input text to every example in support dataset to determine, which class the input example belongs to. By default, the model uses training data as support dataset. You can specify the support dataset path in the in `dnnc` config file. It has the same format as metioned [before]()" ] }, { @@ -254,7 +236,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Out-of-scope (OOS) examples are determined via confidence_threshold parameter. The higher the threshold, the more often the model predicts OOS class. By default it is set to 0.5. You can change it to your preferences in configuration file" + "Out-of-scope (OOS) examples are determined via confidence_threshold parameter with the following algorithm. Firstly model calculates an average similarity score for every intent class from support dataset. Secondly it determines the class with maximum similarity score. Finally the model predicts class with maximum similarity if it's score is higher than confidence_threshold and \"oos\" class otherwise. The higher the threshold, the more often the model predicts \"oos\" class. By default it is set to 0.5. 
You can change it to your preferences in configuration file" ] }, { @@ -298,16 +280,24 @@ "metadata": {}, "outputs": [], "source": [ - "from deeppavlov import build_model\n", + "from deeppavlov import build_model, configs\n", "\n", "model = build_model(\"dnnc\", download=True)" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['book_hotel', 'international_visa']\n" + ] + } + ], "source": [ "model([\"can you find me a good reviewed hotel in japan\", \"if i get a visa can i travel to japan\"])" ] @@ -362,14 +352,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We have a separate \"robert_nli\" config for training, which automatically transforms dataset into a pairwise format" + "We have a separate `roberta_nli` config for training, which automatically transforms dataset into a pairwise format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To train the model on your data, you need to change the path to the dataset in \"roberta_nli\" config" + "To train the model on your data, you need to change the path to the dataset in `roberta_nli` config" ] }, { @@ -431,7 +421,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To evaluate the model on your data, you need to change the path to the dataset in \"dnnc\" config" + "To evaluate the model on your data, you need to change the path to the dataset in `dnnc` config" ] }, { diff --git a/docs/index.rst b/docs/index.rst index 47be64eb43..4aed63037c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ Welcome to DeepPavlov's documentation! Context Question Answering Classification Entity Extraction + Few-shot Classification Named Entity Recognition Neural Ranking Spelling Correction From 96bc1943cb889ecbf63ef4a80e3837f5ad1d5d73 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Tue, 6 Dec 2022 15:46:10 +0300 Subject: [PATCH 16/57] add return format flag --- .../dataset_iterators/few_shot_iterator.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index febdd0f594..87cfb87e8a 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -17,6 +17,7 @@ from random import Random from typing import Dict, Any, List, Tuple, Optional from logging import getLogger +from pathlib import Path import numpy as np from overrides import overrides @@ -39,17 +40,30 @@ def __init__(self, shot: Optional[int] = None, shot_test: Optional[int] = None, save_path: Optional[str] = None, + return_nli_format: bool = False, *args, **kwargs) -> None: self.shot = shot self.shot_test = shot_test self.shuffle = shuffle self.random = Random(seed) - train_shot_examples = self._get_shot_examples(data.get('train', []), self.shot) - valid_shot_examples = self._get_shot_examples(data.get('valid', []), self.shot_test) - test_shot_examples = self._get_shot_examples(data.get('test', []), self.shot_test) - self.train = self.preprocess(train_shot_examples, *args, **kwargs) - self.valid = self.preprocess(valid_shot_examples, *args, **kwargs) - self.test = self.preprocess(test_shot_examples, *args, **kwargs) + + self.train = self.get_shot_examples(data.get('train', []), self.shot) + self.valid = self.get_shot_examples(data.get('valid', []), self.shot_test) + self.test = 
self.get_shot_examples(data.get('test', []), self.shot_test) + + if save_path is not None: + save_path = Path(save_path).expanduser() + save_path.parent.mkdir(parents=True, exist_ok=True) + with save_path.open("w") as file: + json_dict = {"columns": ["text","category"]} + json_dict["data"] = [[text, label] for text, label in self.train] + json.dump(json_dict, file, indent=4) + + if return_nli_format: + self.train = self.convert2nli(self.train) + self.valid = self.convert2nli(self.valid) + self.test = self.convert2nli(self.test) + self.data = { 'train': self.train, 'valid': self.valid, @@ -57,14 +71,6 @@ def __init__(self, 'all': self.train + self.test + self.valid } - - if save_path is None: - return - - with open(save_path, "w") as file: - json_dict = {"columns": ["text","category"]} - json_dict["data"] = [[text, label] for text, label in train_shot_examples] - json.dump(json_dict, file, indent=4) def _gather_info(self, data: List[Tuple[Any, Any]]): unique_labels = list(set([label for text, label in data])) @@ -83,8 +89,7 @@ def _gather_info(self, data: List[Tuple[Any, Any]]): return label2examples, label2negative - @overrides - def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]: + def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: if len(data) == 0: return data @@ -102,13 +107,14 @@ def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple for positive_example in label2examples[label]: if positive_example != text: nli_triplets.append([[text, positive_example], ENTAILMENT]) - + if self.shuffle: self.random.shuffle(nli_triplets) + return nli_triplets - def _get_shot_examples(self, data, shot): + def get_shot_examples(self, data, shot): if shot is None: return data From 98cb6607286c63a7eaff612c7708bbefd0b5bc2a Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Tue, 6 Dec 2022 15:49:32 +0300 Subject: [PATCH 17/57] add dataset and model downloading --- deeppavlov/configs/classifiers/dnnc.json | 25 +++++++++++++------ .../configs/classifiers/roberta_nli.json | 23 ++++++++++++----- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index b2acc2de40..0089eb5ef7 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -5,7 +5,7 @@ "orient": "split", "x": "text", "y": "category", - "data_path": "/archive/savkin/parsed_datasets/clinc150/banking_oos/dnnc", + "data_path": "{DOWNLOADS_PATH}/clinc150", "train": "train.json", "valid": "dev.json", "test": "test.json" @@ -37,7 +37,7 @@ "y": "category", "format": "json", "orient": "split", - "support_dataset_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json", + "support_dataset_path": "{SUPPORT_DATA_PATH}/parsed_dataset.json" }, { "class_name": "torch_transformers_preprocessor", @@ -84,7 +84,7 @@ "y_pred" ], "multilabel": false, - "confidence_threshold": 0.5 + "confidence_threshold": 0.0 } ], "out": [ @@ -92,7 +92,7 @@ ] }, "train": { - "batch_size": 1, + "batch_size": 10, "metrics": [ { "name": "sklearn_accuracy", @@ -128,11 +128,22 @@ "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/dnnc", + "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", + "MODELS_PATH": "{ROOT_PATH}/models/trash", + "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", "CONFIGS_PATH": 
"{DEEPPAVLOV_PATH}/configs", "BINARY_CLASSIFICATION": true, "BASE_MODEL": "roberta-base" - } + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", + "subdir": "{MODELS_PATH}" + }, + { + "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz", + "subdir": "{DOWNLOADS_PATH}/clinc150" + } + ] } } \ No newline at end of file diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 527b93aa1d..8ef9616727 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -5,7 +5,7 @@ "orient": "split", "x": "text", "y": "category", - "data_path": "/archive/savkin/parsed_datasets/clinc150/banking/dnnc", + "data_path": "{DOWNLOADS_PATH}/clinc150_no_oos", "train": "train.json", "valid": "dev.json", "test": "test.json" @@ -15,7 +15,8 @@ "seed": 0, "shuffle": true, "shot": 5, - "save_path": "/home/savkin/few_shot/DNNC/tmp/dataset/parsed_dataset.json" + "save_path": "{SUPPORT_DATA_PATH}/parsed_dataset.json", + "return_nli_format": true }, "chainer": { "in": [ @@ -68,7 +69,7 @@ ], "return_probas": true, "pretrained_bert": "{BASE_MODEL}", - "save_path": "{DNNC_PATH}/model", + "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "is_binary": "{BINARY_CLASSIFICATION}", "optimizer": "AdamW", @@ -125,11 +126,21 @@ "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", + "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", + "MODELS_PATH": "{ROOT_PATH}/models/trash", "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", - "DNNC_PATH": "{MODELS_PATH}/classification/dnnc", "BASE_MODEL": "roberta-base", "BINARY_CLASSIFICATION": true - } + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", + "subdir": "{MODELS_PATH}" + }, + { + "url": "https://files.deeppavlov.ai/datasets/clinc150_no_oos.tar.gz", + "subdir": "{DOWNLOADS_PATH}/clinc150_no_oos" + } + ] } } \ No newline at end of file From b17791be27e1eccdf414c8f45e770a9a3763a296 Mon Sep 17 00:00:00 2001 From: vasily Date: Wed, 7 Dec 2022 10:49:37 +0300 Subject: [PATCH 18/57] Fix: change paths --- deeppavlov/configs/classifiers/roberta_nli.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 8ef9616727..a6312a04e3 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -127,7 +127,7 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", - "MODELS_PATH": "{ROOT_PATH}/models/trash", + "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", "BASE_MODEL": "roberta-base", "BINARY_CLASSIFICATION": true @@ -135,7 +135,7 @@ "download": [ { "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", - "subdir": "{MODELS_PATH}" + "subdir": "{MODEL_PATH}" }, { "url": "https://files.deeppavlov.ai/datasets/clinc150_no_oos.tar.gz", @@ -143,4 +143,4 @@ } ] } -} \ No newline at end of file +} From d5c657d5cdab0da7461b75f21296cb76b7ad2735 Mon Sep 17 00:00:00 2001 From: vasily Date: Wed, 7 Dec 2022 10:49:50 +0300 Subject: [PATCH 19/57] Fix: change paths --- deeppavlov/configs/classifiers/dnnc.json | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index 0089eb5ef7..c1de10bafb 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -129,7 +129,7 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", - "MODELS_PATH": "{ROOT_PATH}/models/trash", + "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", "BINARY_CLASSIFICATION": true, @@ -138,7 +138,7 @@ "download": [ { "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", - "subdir": "{MODELS_PATH}" + "subdir": "{MODEL_PATH}" }, { "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz", @@ -146,4 +146,4 @@ } ] } -} \ No newline at end of file +} From 79ebde6befd744f41176c634dde2dd29362bc237 Mon Sep 17 00:00:00 2001 From: vasily Date: Wed, 7 Dec 2022 10:59:41 +0300 Subject: [PATCH 20/57] Fix: download paths --- deeppavlov/configs/classifiers/dnnc.json | 2 +- deeppavlov/configs/classifiers/roberta_nli.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index c1de10bafb..fa5a707f7e 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -137,7 +137,7 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", + "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz", "subdir": "{MODEL_PATH}" }, { diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index a6312a04e3..a71c689dad 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -134,7 +134,7 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", + "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz", "subdir": "{MODEL_PATH}" }, { From a27b92af3a388ebbbe6af2de2096a6e56718735c Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 7 Dec 2022 11:42:57 +0300 Subject: [PATCH 21/57] remove skdlearn requirements --- deeppavlov/core/common/requirements_registry.json | 13 ------------- deeppavlov/requirements/sklearn.txt | 1 - 2 files changed, 14 deletions(-) delete mode 100644 deeppavlov/requirements/sklearn.txt diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index a0ea5b01cf..d65eba771e 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ b/deeppavlov/core/common/requirements_registry.json @@ -63,19 +63,6 @@ "russian_words_vocab": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" ], - "sklearn_accuracy": [ - "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" - ], - "sklearn_balanced_accuracy": [ - "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" - ], - "oos_scores": [ - "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" - ], - "sklearn_precision_recall_fscore_support": [ - "{DEEPPAVLOV_PATH}/requirements/sklearn.txt" - ], - "slovnet_syntax_parser": [ "{DEEPPAVLOV_PATH}/requirements/slovnet.txt" ], diff --git a/deeppavlov/requirements/sklearn.txt b/deeppavlov/requirements/sklearn.txt deleted file mode 100644 index ff88936c77..0000000000 --- 
a/deeppavlov/requirements/sklearn.txt +++ /dev/null @@ -1 +0,0 @@ -scikit-learn \ No newline at end of file From 61db0e65c8fd417a2acb8159e125e7b19ec20710 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 7 Dec 2022 11:46:39 +0300 Subject: [PATCH 22/57] fex ix metrics --- deeppavlov/core/common/metrics_registry.json | 6 ++--- deeppavlov/metrics/accuracy.py | 26 ++++++++++++++++++++ deeppavlov/metrics/few_shot_metrics.py | 13 ++++------ deeppavlov/metrics/roc_auc_score.py | 4 +-- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json index 84c52b944e..c25a32a68a 100644 --- a/deeppavlov/core/common/metrics_registry.json +++ b/deeppavlov/core/common/metrics_registry.json @@ -33,10 +33,8 @@ "rank_response": "deeppavlov.models.ranking.metrics:rank_response", "roc_auc": "deeppavlov.metrics.roc_auc_score:roc_auc_score", "sets_accuracy": "deeppavlov.metrics.accuracy:sets_accuracy", - "sklearn_accuracy": "deeppavlov.metrics.few_shot_metrics:accuracy", - "sklearn_balanced_accuracy": "deeppavlov.metrics.few_shot_metrics:balanced_accuracy", - "sklearn_classification_report": "deeppavlov.metrics.few_shot_metrics:report", - "sklearn_precision_recall_fscore_support": "deeppavlov.metrics.few_shot_metrics:recision_recall_fscore", + "accuracy_oos": "deeppavlov.metrics.few_shot_metrics:accuracy", + "balanced_accuracy_oos": "deeppavlov.metrics.few_shot_metrics:balanced_accuracy", "slots_accuracy": "deeppavlov.metrics.accuracy:slots_accuracy", "spearman_correlation": "deeppavlov.metrics.correlation:spearman_correlation", "squad_v1_em": "deeppavlov.metrics.squad_metrics:squad_v1_exact_match", diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py index 1042a74e78..560d92ee71 100644 --- a/deeppavlov/metrics/accuracy.py +++ b/deeppavlov/metrics/accuracy.py @@ -21,6 +21,32 @@ from deeppavlov.core.common.metrics_registry import register_metric +@register_metric('accuracy') +def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float: + """ + Calculate accuracy in terms of absolute coincidence + + Args: + y_true: array of true values + y_predicted: array of predicted values + + Returns: + fraction of absolutely coincidental samples + """ + examples_len = len(y_true) + # if y1 and y2 are both arrays, == can be erroneously interpreted as element-wise equality + + def _are_equal(y1, y2): + answer = (y1 == y2) + if isinstance(answer, np.ndarray): + answer = answer.all() + return answer + + equalities = [_are_equal(y1, y2) for y1, y2 in zip(y_true, y_predicted)] + correct = sum(equalities) + return correct / examples_len if examples_len else 0 + + @register_metric('multitask_accuracy') def multitask_accuracy(*args) -> float: """ diff --git a/deeppavlov/metrics/few_shot_metrics.py b/deeppavlov/metrics/few_shot_metrics.py index 11038098dd..c72b73cc5e 100644 --- a/deeppavlov/metrics/few_shot_metrics.py +++ b/deeppavlov/metrics/few_shot_metrics.py @@ -13,33 +13,30 @@ # limitations under the License. 
-from typing import List, Union, Optional - import numpy as np from deeppavlov.core.common.metrics_registry import register_metric from sklearn.metrics import accuracy_score, \ balanced_accuracy_score, \ - precision_recall_fscore_support, \ - classification_report + precision_recall_fscore_support def delete_oos(y_true, y_pred): y_true = np.array(y_true) y_pred = np.array(y_pred) - - ind_mask = np.where(y_true == 'oos') + ind_mask = np.where(y_true == 'oos') + y_true = np.delete(y_true, ind_mask, 0) y_pred = np.delete(y_pred, ind_mask, 0) return y_true, y_pred -@register_metric('sklearn_accuracy') +@register_metric('accuracy_oos') def accuracy(y_true, y_pred, exclude_oos: bool = False) -> float: if exclude_oos: y_true, y_pred = delete_oos(y_true, y_pred) return accuracy_score(y_true, y_pred) -@register_metric('sklearn_balanced_accuracy') +@register_metric('balanced_accuracy_oos') def balanced_accuracy(y_true, y_pred, exclude_oos: bool = False) -> float: if exclude_oos: y_true, y_pred = delete_oos(y_true, y_pred) diff --git a/deeppavlov/metrics/roc_auc_score.py b/deeppavlov/metrics/roc_auc_score.py index a2a3ebb841..d19c61a8b6 100644 --- a/deeppavlov/metrics/roc_auc_score.py +++ b/deeppavlov/metrics/roc_auc_score.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import List, Union, Optional +from typing import List, Union import numpy as np import sklearn.metrics @@ -38,4 +38,4 @@ def roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray], return sklearn.metrics.roc_auc_score(np.squeeze(np.array(y_true)), np.squeeze(np.array(y_pred)), average="macro") except ValueError: - return 0. \ No newline at end of file + return 0. From 1d392b33c800ceb2e11465b47cb32a6637c13fcd Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 7 Dec 2022 11:49:42 +0300 Subject: [PATCH 23/57] fix configs format --- deeppavlov/configs/classifiers/dnnc.json | 48 ++++------- .../classifiers/glue/glue_mrpc_binary.json | 81 ------------------- .../configs/classifiers/roberta_nli.json | 66 +++++---------- 3 files changed, 34 insertions(+), 161 deletions(-) delete mode 100644 deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index 0089eb5ef7..80d434a5f8 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -16,18 +16,12 @@ "shuffle": true }, "chainer": { - "in": [ - "x" - ], - "in_y": [ - "y_true" - ], + "in": ["x"], + "in_y": ["y_true"], "pipe": [ { "class_name": "dnnc_input_preprocessor", - "in": [ - "x" - ], + "in": ["x"], "out": [ "x_support", "x_populated", @@ -45,9 +39,7 @@ "x_populated", "x_support" ], - "out": [ - "bert_features" - ], + "out": ["bert_features"], "vocab_file": "{BASE_MODEL}", "do_lower_case": true, "max_seq_length": 128 @@ -55,12 +47,8 @@ { "class_name": "torch_transformers_classifier", "main": true, - "in": [ - "bert_features" - ], - "out": [ - "simmilarity_scores" - ], + "in": ["bert_features"], + "out": ["simmilarity_scores"], "n_classes": 2, "return_probas": true, "pretrained_bert": "{BASE_MODEL}", @@ -80,22 +68,18 @@ "x_populated", "y_support" ], - "out": [ - "y_pred" - ], + "out": ["y_pred"], "multilabel": false, "confidence_threshold": 0.0 } ], - "out": [ - "y_pred" - ] + "out": ["y_pred"] }, "train": { "batch_size": 10, "metrics": [ { - "name": "sklearn_accuracy", + "name": "accuracy_oos", "inputs": [ "y_true", "y_pred" @@ -103,7 +87,7 @@ "exclude_oos": true }, { - "name": 
"sklearn_balanced_accuracy", + "name": "balanced_accuracy_oos", "inputs": [ "y_true", "y_pred" @@ -119,9 +103,7 @@ } ], "show_examples": false, - "evaluation_targets": [ - "test" - ], + "evaluation_targets": ["test"], "class_name": "torch_trainer" }, "metadata": { @@ -129,7 +111,7 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", - "MODELS_PATH": "{ROOT_PATH}/models/trash", + "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", "BINARY_CLASSIFICATION": true, @@ -137,8 +119,8 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", - "subdir": "{MODELS_PATH}" + "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz", + "subdir": "{MODEL_PATH}" }, { "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz", @@ -146,4 +128,4 @@ } ] } -} \ No newline at end of file +} diff --git a/deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json b/deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json deleted file mode 100644 index e676860f98..0000000000 --- a/deeppavlov/configs/classifiers/glue/glue_mrpc_binary.json +++ /dev/null @@ -1,81 +0,0 @@ -{ - "dataset_reader": { - "class_name": "huggingface_dataset_reader", - "path": "glue", - "name": "mrpc", - "train": "train", - "valid": "validation", - "test": "test" - }, - "dataset_iterator": { - "class_name": "huggingface_dataset_iterator", - "features": ["sentence1", "sentence2"], - "label": "label", - "use_label_name": false, - "seed": 42 - }, - "chainer": { - "in": ["sentence1", "sentence2"], - "in_y": ["y_ids"], - "pipe": [ - { - "class_name": "torch_transformers_preprocessor", - "vocab_file": "{BASE_MODEL}", - "do_lower_case": false, - "max_seq_length": 100, - "in": ["sentence1", "sentence2"], - "out": ["bert_features"] - }, - { - "class_name": "torch_transformers_classifier", - "n_classes": 2, - "return_probas": true, - "pretrained_bert": "{BASE_MODEL}", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "is_binary": "{BINARY_CLASSIFICATION}", - "optimizer": "AdamW", - "optimizer_parameters": { - "lr": 2e-05 - }, - "learning_rate_drop_patience": 3, - "learning_rate_drop_div": 2.0, - "in": ["bert_features"], - "in_y": ["y_ids"], - "out": ["y_pred_probas"] - }, - { - "in": ["y_pred_probas"], - "out": ["y_pred_ids"], - "class_name": "proba2labels", - "is_binary": "{BINARY_CLASSIFICATION}", - "confidence_threshold": 0.5 - } - ], - "out": ["y_pred_ids"] - }, - "train": { - "batch_size": 10, - "metrics": [ - "f1", - "accuracy" - ], - "validation_patience": 10, - "val_every_n_epochs": 1, - "log_every_n_epochs": 1, - "show_examples": false, - "evaluation_targets": ["train", "valid"], - "class_name": "torch_trainer", - "tensorboard_log_dir": "{MODEL_PATH}/", - "pytest_max_batches": 2 - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODEL_PATH": "/tmp/glue_mrpc_binary", - "BASE_MODEL": "distilbert-base-uncased", - "BINARY_CLASSIFICATION": true - } - } -} diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 8ef9616727..50934b694c 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -23,9 +23,7 @@ "premise", "hypothesis" ], - "in_y": [ - "y_true_labels" - ], + "in_y": 
["y_true_labels"], "pipe": [ { "class_name": "torch_transformers_preprocessor", @@ -33,9 +31,7 @@ "premise", "hypothesis" ], - "out": [ - "bert_features" - ], + "out": ["bert_features"], "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128 @@ -43,30 +39,18 @@ { "id": "classes_vocab", "class_name": "simple_vocab", - "fit_on": [ - "y_true_labels" - ], + "fit_on": ["y_true_labels"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", - "in": [ - "y_true_labels" - ], - "out": [ - "y_true_ids" - ] + "in": ["y_true_labels"], + "out": ["y_true_ids"] }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", - "in": [ - "bert_features" - ], - "in_y": [ - "y_true_ids" - ], - "out": [ - "y_pred_probas" - ], + "in": ["bert_features"], + "in_y": ["y_true_ids"], + "out": ["y_pred_probas"], "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", @@ -78,28 +62,18 @@ } }, { - "in": [ - "y_pred_probas" - ], - "out": [ - "y_pred_ids" - ], "class_name": "proba2labels", + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], "max_proba": true }, { "ref": "classes_vocab", - "in": [ - "y_pred_ids" - ], - "out": [ - "y_pred_labels" - ] + "in": ["y_pred_ids"], + "out": ["y_pred_labels"] } ], - "out": [ - "y_pred_labels" - ] + "out": ["y_pred_labels"] }, "train": { "batch_size": 370, @@ -110,16 +84,14 @@ "validate_first": false, "metrics": [ { - "name": "sklearn_accuracy", + "name": "accuracy_oos", "inputs": [ "y_true_labels", "y_pred_labels" ] } ], - "evaluation_targets": [ - "valid" - ], + "evaluation_targets": ["valid"], "class_name": "torch_trainer" }, "metadata": { @@ -127,15 +99,15 @@ "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", - "MODELS_PATH": "{ROOT_PATH}/models/trash", + "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", "BASE_MODEL": "roberta-base", "BINARY_CLASSIFICATION": true }, "download": [ { - "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_finetune.tar.gz", - "subdir": "{MODELS_PATH}" + "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz", + "subdir": "{MODEL_PATH}" }, { "url": "https://files.deeppavlov.ai/datasets/clinc150_no_oos.tar.gz", @@ -143,4 +115,4 @@ } ] } -} \ No newline at end of file +} From 39e6ad125d0c09564e7bb3c74973642ac8be0c30 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 7 Dec 2022 16:17:03 +0300 Subject: [PATCH 24/57] Fix: configs format and paths --- deeppavlov/configs/classifiers/dnnc.json | 51 +++++-------------- .../configs/classifiers/roberta_nli.json | 30 ++++------- .../models/classifiers/dnnc_proba2labels.py | 4 +- 3 files changed, 24 insertions(+), 61 deletions(-) diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc.json index 80d434a5f8..936ef8def1 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc.json @@ -5,7 +5,7 @@ "orient": "split", "x": "text", "y": "category", - "data_path": "{DOWNLOADS_PATH}/clinc150", + "data_path": "{DOWNLOADS_PATH}", "train": "train.json", "valid": "dev.json", "test": "test.json" @@ -22,11 +22,7 @@ { "class_name": "dnnc_input_preprocessor", "in": ["x"], - "out": [ - "x_support", - "x_populated", - "y_support" - ], + "out": ["x_support", "x_populated", "y_support"], "x": "text", "y": "category", "format": "json", @@ -35,10 +31,7 @@ }, { "class_name": 
"torch_transformers_preprocessor", - "in": [ - "x_populated", - "x_support" - ], + "in": ["x_populated", "x_support"], "out": ["bert_features"], "vocab_file": "{BASE_MODEL}", "do_lower_case": true, @@ -55,51 +48,35 @@ "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", "is_binary": "{BINARY_CLASSIFICATION}", - "optimizer_parameters": { - "lr": 2e-05 - } + "optimizer_parameters": {"lr": 2e-05} }, { "class_name": "dnnc_proba2labels", "is_binary": "{BINARY_CLASSIFICATION}", - "in": [ - "simmilarity_scores", - "x", - "x_populated", - "y_support" - ], + "in": ["simmilarity_scores", "x", "x_populated", "y_support"], "out": ["y_pred"], "multilabel": false, - "confidence_threshold": 0.0 + "confidence_threshold": 0.5 } ], "out": ["y_pred"] }, "train": { - "batch_size": 10, + "batch_size": 1, "metrics": [ { "name": "accuracy_oos", - "inputs": [ - "y_true", - "y_pred" - ], + "inputs": ["y_true", "y_pred"], "exclude_oos": true }, { "name": "balanced_accuracy_oos", - "inputs": [ - "y_true", - "y_pred" - ], + "inputs": ["y_true", "y_pred"], "exclude_oos": true }, { "name": "oos_scores", - "inputs": [ - "y_true", - "y_pred" - ] + "inputs": ["y_true", "y_pred"] } ], "show_examples": false, @@ -109,11 +86,9 @@ "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150", "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", - "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", + "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli", "BINARY_CLASSIFICATION": true, "BASE_MODEL": "roberta-base" }, @@ -124,7 +99,7 @@ }, { "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz", - "subdir": "{DOWNLOADS_PATH}/clinc150" + "subdir": "{DOWNLOADS_PATH}" } ] } diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/roberta_nli.json index 50934b694c..be2bbf0fcb 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/roberta_nli.json @@ -5,7 +5,7 @@ "orient": "split", "x": "text", "y": "category", - "data_path": "{DOWNLOADS_PATH}/clinc150_no_oos", + "data_path": "{DOWNLOADS_PATH}", "train": "train.json", "valid": "dev.json", "test": "test.json" @@ -19,18 +19,12 @@ "return_nli_format": true }, "chainer": { - "in": [ - "premise", - "hypothesis" - ], + "in": ["premise", "hypothesis"], "in_y": ["y_true_labels"], "pipe": [ { "class_name": "torch_transformers_preprocessor", - "in": [ - "premise", - "hypothesis" - ], + "in": [ "premise", "hypothesis"], "out": ["bert_features"], "vocab_file": "{BASE_MODEL}", "do_lower_case": false, @@ -57,9 +51,7 @@ "load_path": "{MODEL_PATH}/model", "is_binary": "{BINARY_CLASSIFICATION}", "optimizer": "AdamW", - "optimizer_parameters": { - "lr": 2e-05 - } + "optimizer_parameters": {"lr": 2e-05} }, { "class_name": "proba2labels", @@ -76,7 +68,7 @@ "out": ["y_pred_labels"] }, "train": { - "batch_size": 370, + "batch_size": 10, "epochs": 7, "log_every_n_epochs": 1, "val_every_n_epochs": 7, @@ -85,10 +77,7 @@ "metrics": [ { "name": "accuracy_oos", - "inputs": [ - "y_true_labels", - "y_pred_labels" - ] + "inputs": ["y_true_labels", "y_pred_labels"] } ], "evaluation_targets": ["valid"], @@ -97,10 +86,9 @@ "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150_no_oos", "SUPPORT_DATA_PATH": 
"{ROOT_PATH}/parsed_datasets", - "MODELS_PATH": "{ROOT_PATH}/models", - "MODEL_PATH": "{MODELS_PATH}/classification/roberta_nli_binary_finetune", + "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli", "BASE_MODEL": "roberta-base", "BINARY_CLASSIFICATION": true }, @@ -111,7 +99,7 @@ }, { "url": "https://files.deeppavlov.ai/datasets/clinc150_no_oos.tar.gz", - "subdir": "{DOWNLOADS_PATH}/clinc150_no_oos" + "subdir": "{DOWNLOADS_PATH}" } ] } diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index b1a1748276..9abb3e4153 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -30,7 +30,7 @@ def preprocess_scores(scores, is_binary, class_id: int = 1): class Proba2Labels(Component): def __init__(self, - confidence_threshold: float = 0.8, + confidence_threshold: float = 0.5, pooling: str = 'mean', multilabel: bool = False, is_binary: bool = False, @@ -85,4 +85,4 @@ def __call__(self, y_pred.append(prediction) - return y_pred \ No newline at end of file + return y_pred From dab193a776eeda8df55b70f4702bf242c6caba70 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 12 Dec 2022 10:31:36 +0300 Subject: [PATCH 25/57] Upd: documentation --- ...on.ipynb => few_shot_classification.ipynb} | 133 ++++++++++++------ docs/index.rst | 2 +- 2 files changed, 94 insertions(+), 41 deletions(-) rename docs/features/models/{few_shot_intent_classification.ipynb => few_shot_classification.ipynb} (71%) diff --git a/docs/features/models/few_shot_intent_classification.ipynb b/docs/features/models/few_shot_classification.ipynb similarity index 71% rename from docs/features/models/few_shot_intent_classification.ipynb rename to docs/features/models/few_shot_classification.ipynb index 4e31dfd1d0..b9ed328111 100644 --- a/docs/features/models/few_shot_intent_classification.ipynb +++ b/docs/features/models/few_shot_classification.ipynb @@ -28,15 +28,15 @@ " \n", "6. [Train the model on your data](#6.-Train-the-model-on-your-data)\n", " \n", - " 6.1. [Train your model from Python](#6.1-Train-your-model-from-Python)\n", + " 6.1 [Train your model from Python](#6.1-Train-your-model-from-Python)\n", " \n", - " 6.2. [Train your model from CLI](#6.2-Train-your-model-from-CLI)\n", + " 6.2 [Train your model from CLI](#6.2-Train-your-model-from-CLI)\n", " \n", "7. [Evaluate](#7.-Evaluate)\n", " \n", - " 7.1. [Evaluate from Python](#7.1-Evaluate-from-Python)\n", + " 7.1 [Evaluate from Python](#7.1-Evaluate-from-Python)\n", " \n", - " 7.2. [Evaluate from CLI](#7.2-Evaluate-from-CLI)\n", + " 7.2 [Evaluate from CLI](#7.2-Evaluate-from-CLI)\n", " \n", "8. [Metrics](#8.-Metrics)" ] @@ -52,7 +52,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "__Intent classification__ is a task of identifying speaker's intent given an utterance, where intent is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider few-shot setting, where only few examples (5 or 10) per intent class are given as a training set.\n" + "__Text classification__ is a task of identifying one of the pre-defined label given an utterance, where label is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). 
We consider the few-shot setting, where only a few examples (5 or 10) per intent class are given as a training set.\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The typical methodology of few-shot intent classification is to embed each example into a vector space and use an off-the-shelf distance metric to perform a similarity search. However, the text embedding methods do not discriminate the OOS examples well enough.\n",
+    "The typical methodology of few-shot text classification is to embed each example into a vector space and use an off-the-shelf distance metric to perform a similarity search. However, the text embedding methods do not discriminate the OOS examples well enough.\n",
     "\n",
     "\n",
     "The DNNC authors suggest modeling fine-grained relations of utterance pairs via pairwise similarity:\n",
     "\n",
     "$h = BERT([[CLS], u, [SEP], e_{j,i}, [SEP]]) \\in \\mathbb{R}^d$\n",
     "\n",
     "$S(u, e_{j,i}) = \\sigma(W \\cdot h + b) \\in \\mathbb{R}$, where $e_{j,i} \\in E$ is an example from the training set, $W \\in \\mathbb{R}^{1 \\times d}$, $b \\in \\mathbb{R}$\n",
     "\n",
     "To mitigate the data scarcity of the few-shot setting, DNNC uses knowledge transfer from the NLI task. We pretrain [roberta-base](https://huggingface.co/roberta-base) on a combination of 3 NLI datasets: SNLI, WNLI, MNLI."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "First make sure you have the DeepPavlov Library installed.\n",
-    "[More info about the first installation](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
+    "[More info about the first installation](http://docs.deeppavlov.ai/en/master/intro/installation.html)."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install --q deeppavlov"
+    "!pip install deeppavlov"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Then make sure that all the required packages for the model are installed."
+    "Then make sure that all the required packages, datasets and weights are installed."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python -m deeppavlov install dnnc"
+    "!python -m deeppavlov install dnnc\n",
+    "!python -m deeppavlov download dnnc"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "`dnnc` here is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
+    "`dnnc` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
     "\n",
     "The configuration file defines the model and describes its hyperparameters"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## 4.2 Support dataset configuration"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Before making prediction or evaluation you need to set path to your support dataset, which will be used for predictions"
+    "Before making predictions or running evaluation you need to set the path to your support dataset. The DNNC model compares the input text to every example in the support dataset to determine which class the input example belongs to. By default, the model uses the training set as the support dataset. It is automatically saved by the *dataset_iterator* during the training step, but you can specify your own support dataset in the `dnnc` config file. It has the same format as mentioned [before](#2.1-Datasets-format)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "~/.deeppavlov/parsed_datasets/parsed_dataset.json\n"
+     ]
+    }
+   ],
    "source": [
     "from deeppavlov.core.commands.utils import parse_config\n",
     "\n",
     "model_config = parse_config('dnnc')\n",
     "\n",
     "# dataset for predictions\n",
     "print(model_config['chainer']['pipe'][0]['support_dataset_path'])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### OOS prediction"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Out-of-scope (OOS) examples are determined via confidence_threshold parameter with the following algorithm. Firstly model calculates an average similarity score for every intent class from support dataset. Secondly it determines the class with maximum similarity score. Finally the model predicts class with maximum similarity if it's score is higher than confidence_threshold and \"oos\" class otherwise. The higher the threshold, the more often the model predicts \"oos\" class. By default it is set to 0.5. You can change it to your preferences in configuration file"
+    "Out-of-scope (OOS) examples are determined via the confidence_threshold parameter, using the following algorithm. First, the model calculates an average similarity score for every class in the support dataset. Second, it determines the class with the maximum score. Finally, the model predicts the class with the maximum similarity if its score is higher than confidence_threshold, and the \"oos\" class otherwise. The higher the threshold, the more often the model predicts \"oos\". By default the threshold is set to 0.5 and you can change it to your preferences in the configuration file"
    ]
   },
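+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following is a minimal sketch of that decision rule (an illustration with hypothetical names, not DeepPavlov's actual API; the mean pooling per class and the 0.5 threshold are assumptions):\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "def dnnc_predict(scores, support_labels, threshold=0.5):\n",
+    "    # scores[i] is the pairwise similarity S(u, e_i) between the input\n",
+    "    # utterance u and the i-th support example; support_labels[i] is\n",
+    "    # the class of that support example\n",
+    "    scores = np.asarray(scores)\n",
+    "    support_labels = np.asarray(support_labels)\n",
+    "    # pool the pairwise scores into one score per class (mean pooling)\n",
+    "    class_scores = {label: scores[support_labels == label].mean()\n",
+    "                    for label in np.unique(support_labels)}\n",
+    "    best_label = max(class_scores, key=class_scores.get)\n",
+    "    # fall back to the out-of-scope class when the best score is too low\n",
+    "    return best_label if class_scores[best_label] > threshold else 'oos'\n",
+    "\n",
+    "print(dnnc_predict([0.1, 0.9, 0.2], ['travel', 'hotels', 'travel']))  # hotels\n",
+    "```"
+   ]
+  },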
@@ -271,18 +288,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from deeppavlov import build_model, configs\n",
-    "\n",
-    "model = build_model(\"dnnc\", download=True)"
+    "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict. If you set the 'download' flag to 'True', then existing model weights will be overwritten."
   ]
  },
@@ -299,6 +305,10 @@
    }
   ],
  "source": [
+   "from deeppavlov import build_model, configs\n",
+   "\n",
+   "model = build_model(\"dnnc\", install=True, download=True)\n",
+   "\n",
   "model([\"can you find me a good reviewed hotel in japan\", \"if i get a visa can i travel to japan\"])"
  ]
 },
@@ -322,7 +332,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python -m deeppavlov interact dnnc -d"
+    "!python -m deeppavlov interact dnnc [-d] [-i]"
   ]
  },
@@ -338,7 +348,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python -m deeppavlov predict dnnc -f <file_name>"
+    "!python -m deeppavlov predict dnnc [-d] [-i]"
   ]
  },
 {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Train the model on your data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have a separate `roberta_nli` config for training, which automatically transforms dataset into a pairwise format"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To train the model on your data, you need to change the path to the dataset in `roberta_nli` config"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "~/.deeppavlov/downloads/clinc150_no_oos\n"
+     ]
+    }
+   ],
   "source": [
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
    "model_config = parse_config('roberta_nli')\n",
    "\n",
    "# dataset for training\n",
    "print(model_config['dataset_reader']['data_path'])"
   ]
  },
@@ -391,7 +409,7 @@
   "source": [
    "from deeppavlov import train_model\n",
    "\n",
-    "model = train_model(\"roberta_nli\", download=True)"
+    "model = train_model(\"roberta_nli\", install=True, download=True)"
   ]
  },
@@ -407,7 +425,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python -m deeppavlov train roberta_nli"
+    "!python -m deeppavlov train roberta_nli [-d] [-i]"
   ]
  },
 {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7. Evaluate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To evaluate the model on your data, you need to change the path to the dataset in `dnnc` config"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "~/.deeppavlov/downloads/clinc150\n"
+     ]
+    }
+   ],
   "source": [
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
    "model_config = parse_config('dnnc')\n",
    "\n",
    "# dataset for evaluation\n",
    "print(model_config['dataset_reader']['data_path'])"
   ]
  },
@@ -453,7 +479,7 @@
   "source": [
    "from deeppavlov import evaluate_model\n",
    "\n",
-    "model = evaluate_model('dnnc', download=True)"
+    "model = evaluate_model('dnnc', install=True, download=True)"
   ]
  },
@@ -469,13 +495,40 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python -m deeppavlov evaluate dnnc -d"
+    "!python -m deeppavlov evaluate dnnc [-d] [-i]"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 8. 
Metrics" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We follow the original DNNC paper approach and choose the best confidence_threshold on the dev set " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Config name | Dataset | Shot| Threshold |In-domain Accuracy | OOS Precision | OOS Recall | OOS F1 |\n", + "| :--- | --- | --- | --- | --- | --- | --- | ---: |\n", + "| dnnc | [CLINC150](https://paperswithcode.com/dataset/clinc150) | 5 | 0.6 | 88.8 | 71.0 | 86.2 | 77.8 |\n", + "| dnnc | [CLINC150](https://paperswithcode.com/dataset/clinc150) Banking Domain| 5 | 0.4 | 90.7 | 97.3 | 98.7 | 98.0 |\n", + "| dnnc | [CLINC150](https://paperswithcode.com/dataset/clinc150) Work Domain | 5 | 0.1 | 98.1 | 99.9 | 95.4 | 97.6 |" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.10 ('dnnc_cuda11_env': venv)", + "display_name": "Python 3.8.10 ('dnnc_env': venv)", "language": "python", "name": "python3" }, @@ -489,12 +542,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "d267cc100fe706c63aefa7fd2da1b610b862ac822a3924399ae410740f5c813e" + "hash": "0856b36c2b26de82c1ead3e3019420db4ffa1511d91e779db8eb71f2314c36e4" } } }, diff --git a/docs/index.rst b/docs/index.rst index 4aed63037c..124d63d65e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -29,8 +29,8 @@ Welcome to DeepPavlov's documentation! BERT-based models Context Question Answering Classification + Few-shot Classification Entity Extraction - Few-shot Classification Named Entity Recognition Neural Ranking Spelling Correction From 718d1dee34d422abd03813710154fe84e92663f7 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 12 Dec 2022 10:43:01 +0300 Subject: [PATCH 26/57] Fix: typing --- .../dataset_iterators/few_shot_iterator.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index 87cfb87e8a..6a4feaa034 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -19,9 +19,6 @@ from logging import getLogger from pathlib import Path -import numpy as np -from overrides import overrides - from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator @@ -72,7 +69,7 @@ def __init__(self, } - def _gather_info(self, data: List[Tuple[Any, Any]]): + def _gather_info(self, data: List[Tuple[Any, Any]]) -> Tuple[Dict, Dict]: unique_labels = list(set([label for text, label in data])) label2examples = {} @@ -89,7 +86,7 @@ def _gather_info(self, data: List[Tuple[Any, Any]]): return label2examples, label2negative - def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: + def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Tuple[Any, Any], Any]]: if len(data) == 0: return data @@ -114,13 +111,12 @@ def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: return nli_triplets - def get_shot_examples(self, data, shot): + def get_shot_examples(self, data: List[Tuple[Any, Any]], shot: int) -> List[Tuple[Any, Any]]: if shot is None: return data # shuffle data to select shot-examples - if self.shuffle: - self.random.shuffle(data) + 
self.random.shuffle(data) data_dict = {} for _, label in data: @@ -137,4 +133,8 @@ def get_shot_examples(self, data, shot): for label in data_dict.keys(): for text in data_dict[label]: new_data.append((text, label)) + + if self.shuffle: + self.random.shuffle(new_data) + return new_data From 7cd75a59a571a7182dcff4b05cf99cc80acca58e Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 12 Dec 2022 15:26:11 +0300 Subject: [PATCH 27/57] Upd: add oos removal in iterator --- .../{dnnc.json => dnnc_infer.json} | 4 +- .../{roberta_nli.json => dnnc_train.json} | 87 +++++++++++++------ .../dataset_iterators/few_shot_iterator.py | 36 +++++--- .../models/preprocessors/dnnc_preprocessor.py | 12 +-- 4 files changed, 94 insertions(+), 45 deletions(-) rename deeppavlov/configs/classifiers/{dnnc.json => dnnc_infer.json} (95%) rename deeppavlov/configs/classifiers/{roberta_nli.json => dnnc_train.json} (58%) diff --git a/deeppavlov/configs/classifiers/dnnc.json b/deeppavlov/configs/classifiers/dnnc_infer.json similarity index 95% rename from deeppavlov/configs/classifiers/dnnc.json rename to deeppavlov/configs/classifiers/dnnc_infer.json index 936ef8def1..8761543f67 100644 --- a/deeppavlov/configs/classifiers/dnnc.json +++ b/deeppavlov/configs/classifiers/dnnc_infer.json @@ -27,7 +27,7 @@ "y": "category", "format": "json", "orient": "split", - "support_dataset_path": "{SUPPORT_DATA_PATH}/parsed_dataset.json" + "support_dataset_path": "{SUPPORT_DATA_PATH}/support_dataset.json" }, { "class_name": "torch_transformers_preprocessor", @@ -87,7 +87,7 @@ "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150", - "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", + "SUPPORT_DATA_PATH": "{ROOT_PATH}/preprocessed_datasets", "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli", "BINARY_CLASSIFICATION": true, "BASE_MODEL": "roberta-base" diff --git a/deeppavlov/configs/classifiers/roberta_nli.json b/deeppavlov/configs/classifiers/dnnc_train.json similarity index 58% rename from deeppavlov/configs/classifiers/roberta_nli.json rename to deeppavlov/configs/classifiers/dnnc_train.json index be2bbf0fcb..7ccb2dc7ab 100644 --- a/deeppavlov/configs/classifiers/roberta_nli.json +++ b/deeppavlov/configs/classifiers/dnnc_train.json @@ -15,17 +15,26 @@ "seed": 0, "shuffle": true, "shot": 5, - "save_path": "{SUPPORT_DATA_PATH}/parsed_dataset.json", "return_nli_format": true }, "chainer": { - "in": ["premise", "hypothesis"], - "in_y": ["y_true_labels"], + "in": [ + "premise", + "hypothesis" + ], + "in_y": [ + "y_true_labels" + ], "pipe": [ { "class_name": "torch_transformers_preprocessor", - "in": [ "premise", "hypothesis"], - "out": ["bert_features"], + "in": [ + "premise", + "hypothesis" + ], + "out": [ + "bert_features" + ], "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128 @@ -33,42 +42,66 @@ { "id": "classes_vocab", "class_name": "simple_vocab", - "fit_on": ["y_true_labels"], + "fit_on": [ + "y_true_labels" + ], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", - "in": ["y_true_labels"], - "out": ["y_true_ids"] + "in": [ + "y_true_labels" + ], + "out": [ + "y_true_ids" + ] }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", - "in": ["bert_features"], - "in_y": ["y_true_ids"], - "out": ["y_pred_probas"], + "in": [ + "bert_features" + ], + "in_y": [ + "y_true_ids" + ], + "out": [ + "y_pred_probas" + ], "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", 
"load_path": "{MODEL_PATH}/model", "is_binary": "{BINARY_CLASSIFICATION}", "optimizer": "AdamW", - "optimizer_parameters": {"lr": 2e-05} + "optimizer_parameters": { + "lr": 2e-05 + } }, { "class_name": "proba2labels", - "in": ["y_pred_probas"], - "out": ["y_pred_ids"], + "in": [ + "y_pred_probas" + ], + "out": [ + "y_pred_ids" + ], "max_proba": true }, { "ref": "classes_vocab", - "in": ["y_pred_ids"], - "out": ["y_pred_labels"] + "in": [ + "y_pred_ids" + ], + "out": [ + "y_pred_labels" + ] } ], - "out": ["y_pred_labels"] + "out": [ + "y_pred_labels" + ] }, "train": { - "batch_size": 10, + "batch_size": 370, "epochs": 7, "log_every_n_epochs": 1, "val_every_n_epochs": 7, @@ -77,28 +110,32 @@ "metrics": [ { "name": "accuracy_oos", - "inputs": ["y_true_labels", "y_pred_labels"] + "inputs": [ + "y_true_labels", + "y_pred_labels" + ] } ], - "evaluation_targets": ["valid"], + "evaluation_targets": [ + "valid" + ], "class_name": "torch_trainer" }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150_no_oos", - "SUPPORT_DATA_PATH": "{ROOT_PATH}/parsed_datasets", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150", "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli", "BASE_MODEL": "roberta-base", "BINARY_CLASSIFICATION": true }, "download": [ { - "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz", - "subdir": "{MODEL_PATH}" + "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz", + "subdir": "{MODEL_PATH}" }, { - "url": "https://files.deeppavlov.ai/datasets/clinc150_no_oos.tar.gz", + "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz", "subdir": "{DOWNLOADS_PATH}" } ] diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py index 6a4feaa034..8b83492352 100644 --- a/deeppavlov/dataset_iterators/few_shot_iterator.py +++ b/deeppavlov/dataset_iterators/few_shot_iterator.py @@ -25,6 +25,8 @@ ENTAILMENT = 'entailment' NON_ENTAILMENT = 'non_entailment' +SUPPORT_DATASET_PATH="~/.deeppavlov/preprocessed_datasets/support_dataset.json" + log = getLogger(__name__) @register('few_shot_iterator') @@ -36,7 +38,6 @@ def __init__(self, shuffle: bool = True, shot: Optional[int] = None, shot_test: Optional[int] = None, - save_path: Optional[str] = None, return_nli_format: bool = False, *args, **kwargs) -> None: self.shot = shot @@ -44,17 +45,21 @@ def __init__(self, self.shuffle = shuffle self.random = Random(seed) - self.train = self.get_shot_examples(data.get('train', []), self.shot) - self.valid = self.get_shot_examples(data.get('valid', []), self.shot_test) - self.test = self.get_shot_examples(data.get('test', []), self.shot_test) + self.train = self.delete_oos(data.get('train', [])) + self.valid = self.delete_oos(data.get('valid', [])) + self.test = self.delete_oos(data.get('test', [])) + + self.train = self.get_shot_examples(self.train, self.shot) + self.valid = self.get_shot_examples(self.valid, self.shot_test) + self.test = self.get_shot_examples(self.test, self.shot_test) - if save_path is not None: - save_path = Path(save_path).expanduser() - save_path.parent.mkdir(parents=True, exist_ok=True) - with save_path.open("w") as file: - json_dict = {"columns": ["text","category"]} - json_dict["data"] = [[text, label] for text, label in self.train] - json.dump(json_dict, file, indent=4) + + save_path = Path(SUPPORT_DATASET_PATH).expanduser() + save_path.parent.mkdir(parents=True, exist_ok=True) + with save_path.open("w") as file: + 
json_dict = {"columns": ["text","category"]} + json_dict["data"] = [[text, label] for text, label in self.train] + json.dump(json_dict, file, indent=4) if return_nli_format: self.train = self.convert2nli(self.train) @@ -85,7 +90,6 @@ def _gather_info(self, data: List[Tuple[Any, Any]]) -> Tuple[Dict, Dict]: return label2examples, label2negative - def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Tuple[Any, Any], Any]]: if len(data) == 0: return data @@ -109,7 +113,13 @@ def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Tuple[Any, Any] self.random.shuffle(nli_triplets) return nli_triplets - + + def delete_oos(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: + filtered_data = [] + for text, label in data: + if label != 'oos': + filtered_data.append([text, label]) + return filtered_data def get_shot_examples(self, data: List[Tuple[Any, Any]], shot: int) -> List[Tuple[Any, Any]]: if shot is None: diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py index 0ef614df02..bf30626831 100644 --- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -1,14 +1,14 @@ from pathlib import Path -import pandas as pd from logging import exception, getLogger -from typing import List +from typing import List, Optional import numpy as np +import pandas as pd +from transformers import AutoTokenizer, BatchEncoding from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.models.component import Component - log = getLogger(__name__) @register('dnnc_input_preprocessor') @@ -17,7 +17,7 @@ def __init__(self, support_dataset_path: str, format: str = "csv", *args, **kwargs) -> None: - file = Path(support_dataset_path) + file = Path(support_dataset_path).expanduser() if file.exists(): if format == 'csv': keys = ('sep', 'header', 'names') @@ -35,6 +35,7 @@ def __init__(self, self.support_dataset = [(row[x], str(row[y])) for _, row in df.iterrows()] else: + self.support_dataset = None log.warning("Cannot find {} file".format(support_dataset_path)) def __call__(self, @@ -49,10 +50,11 @@ def __call__(self, for [premise, [hypotesis, hypotesis_labels]] in zip(input_texts * len(self.support_dataset), np.repeat(self.support_dataset, len(input_texts), axis=0)): + premise_batch.append(premise) hypotesis_batch.append(hypotesis) hypotesis_labels_batch.append(hypotesis_labels) return hypotesis_batch, premise_batch, hypotesis_labels_batch else: - log.warning("Error: no support dataset") + log.warning("Error: no support dataset") \ No newline at end of file From e8cf40ce8cee938251d27b1dd71e1634a7bd2b2a Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 12 Dec 2022 15:30:04 +0300 Subject: [PATCH 28/57] Fix: config format --- .../configs/classifiers/dnnc_train.json | 67 +++++-------------- .../models/preprocessors/dnnc_preprocessor.py | 2 +- 2 files changed, 17 insertions(+), 52 deletions(-) diff --git a/deeppavlov/configs/classifiers/dnnc_train.json b/deeppavlov/configs/classifiers/dnnc_train.json index 7ccb2dc7ab..743957cb6b 100644 --- a/deeppavlov/configs/classifiers/dnnc_train.json +++ b/deeppavlov/configs/classifiers/dnnc_train.json @@ -18,23 +18,13 @@ "return_nli_format": true }, "chainer": { - "in": [ - "premise", - "hypothesis" - ], - "in_y": [ - "y_true_labels" - ], + "in": ["premise", "hypothesis"], + "in_y": ["y_true_labels"], "pipe": [ { "class_name": 
"torch_transformers_preprocessor", - "in": [ - "premise", - "hypothesis" - ], - "out": [ - "bert_features" - ], + "in": ["premise", "hypothesis"], + "out": ["bert_features"], "vocab_file": "{BASE_MODEL}", "do_lower_case": false, "max_seq_length": 128 @@ -42,30 +32,18 @@ { "id": "classes_vocab", "class_name": "simple_vocab", - "fit_on": [ - "y_true_labels" - ], + "fit_on": ["y_true_labels"], "save_path": "{MODEL_PATH}/classes.dict", "load_path": "{MODEL_PATH}/classes.dict", - "in": [ - "y_true_labels" - ], - "out": [ - "y_true_ids" - ] + "in": ["y_true_labels"], + "out": ["y_true_ids"] }, { "class_name": "torch_transformers_classifier", "n_classes": "#classes_vocab.len", - "in": [ - "bert_features" - ], - "in_y": [ - "y_true_ids" - ], - "out": [ - "y_pred_probas" - ], + "in": ["bert_features"], + "in_y": ["y_true_ids"], + "out": ["y_pred_probas"], "return_probas": true, "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", @@ -78,27 +56,17 @@ }, { "class_name": "proba2labels", - "in": [ - "y_pred_probas" - ], - "out": [ - "y_pred_ids" - ], + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], "max_proba": true }, { "ref": "classes_vocab", - "in": [ - "y_pred_ids" - ], - "out": [ - "y_pred_labels" - ] + "in": ["y_pred_ids"], + "out": ["y_pred_labels"] } ], - "out": [ - "y_pred_labels" - ] + "out": ["y_pred_labels"] }, "train": { "batch_size": 370, @@ -110,10 +78,7 @@ "metrics": [ { "name": "accuracy_oos", - "inputs": [ - "y_true_labels", - "y_pred_labels" - ] + "inputs": ["y_true_labels", "y_pred_labels"] } ], "evaluation_targets": [ diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py index bf30626831..036030ca77 100644 --- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -57,4 +57,4 @@ def __call__(self, return hypotesis_batch, premise_batch, hypotesis_labels_batch else: - log.warning("Error: no support dataset") \ No newline at end of file + log.warning("Error: no support dataset") From 72867caa744a7bc614049b8a6a8ec9521abe1734 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Tue, 7 Mar 2023 16:39:41 +0300 Subject: [PATCH 29/57] made the support dataset part of the input --- .../configs/classifiers/dnnc_infer.json | 23 ++-- .../configs/classifiers/dnnc_train.json | 108 ------------------ .../models/classifiers/dnnc_proba2labels.py | 6 +- .../models/preprocessors/dnnc_preprocessor.py | 68 +++++------ 4 files changed, 42 insertions(+), 163 deletions(-) delete mode 100644 deeppavlov/configs/classifiers/dnnc_train.json diff --git a/deeppavlov/configs/classifiers/dnnc_infer.json b/deeppavlov/configs/classifiers/dnnc_infer.json index 8761543f67..8b59becd7d 100644 --- a/deeppavlov/configs/classifiers/dnnc_infer.json +++ b/deeppavlov/configs/classifiers/dnnc_infer.json @@ -16,18 +16,15 @@ "shuffle": true }, "chainer": { - "in": ["x"], + "in": ["input"], "in_y": ["y_true"], "pipe": [ { "class_name": "dnnc_input_preprocessor", - "in": ["x"], - "out": ["x_support", "x_populated", "y_support"], - "x": "text", - "y": "category", - "format": "json", - "orient": "split", - "support_dataset_path": "{SUPPORT_DATA_PATH}/support_dataset.json" + "in": ["input"], + "out": ["x", "x_support", "x_populated", "y_support"], + "support_dataset_path": "{SUPPORT_DATA_PATH}/support_dataset.json", + "bidirectional": true }, { "class_name": "torch_transformers_preprocessor", @@ -56,7 +53,8 @@ "in": ["simmilarity_scores", "x", "x_populated", "y_support"], "out": 
["y_pred"], "multilabel": false, - "confidence_threshold": 0.5 + "confidence_threshold": 0.0, + "pooling": "max" } ], "out": ["y_pred"] @@ -69,11 +67,6 @@ "inputs": ["y_true", "y_pred"], "exclude_oos": true }, - { - "name": "balanced_accuracy_oos", - "inputs": ["y_true", "y_pred"], - "exclude_oos": true - }, { "name": "oos_scores", "inputs": ["y_true", "y_pred"] @@ -103,4 +96,4 @@ } ] } -} +} \ No newline at end of file diff --git a/deeppavlov/configs/classifiers/dnnc_train.json b/deeppavlov/configs/classifiers/dnnc_train.json deleted file mode 100644 index 743957cb6b..0000000000 --- a/deeppavlov/configs/classifiers/dnnc_train.json +++ /dev/null @@ -1,108 +0,0 @@ -{ - "dataset_reader": { - "class_name": "basic_classification_reader", - "format": "json", - "orient": "split", - "x": "text", - "y": "category", - "data_path": "{DOWNLOADS_PATH}", - "train": "train.json", - "valid": "dev.json", - "test": "test.json" - }, - "dataset_iterator": { - "class_name": "few_shot_iterator", - "seed": 0, - "shuffle": true, - "shot": 5, - "return_nli_format": true - }, - "chainer": { - "in": ["premise", "hypothesis"], - "in_y": ["y_true_labels"], - "pipe": [ - { - "class_name": "torch_transformers_preprocessor", - "in": ["premise", "hypothesis"], - "out": ["bert_features"], - "vocab_file": "{BASE_MODEL}", - "do_lower_case": false, - "max_seq_length": 128 - }, - { - "id": "classes_vocab", - "class_name": "simple_vocab", - "fit_on": ["y_true_labels"], - "save_path": "{MODEL_PATH}/classes.dict", - "load_path": "{MODEL_PATH}/classes.dict", - "in": ["y_true_labels"], - "out": ["y_true_ids"] - }, - { - "class_name": "torch_transformers_classifier", - "n_classes": "#classes_vocab.len", - "in": ["bert_features"], - "in_y": ["y_true_ids"], - "out": ["y_pred_probas"], - "return_probas": true, - "pretrained_bert": "{BASE_MODEL}", - "save_path": "{MODEL_PATH}/model", - "load_path": "{MODEL_PATH}/model", - "is_binary": "{BINARY_CLASSIFICATION}", - "optimizer": "AdamW", - "optimizer_parameters": { - "lr": 2e-05 - } - }, - { - "class_name": "proba2labels", - "in": ["y_pred_probas"], - "out": ["y_pred_ids"], - "max_proba": true - }, - { - "ref": "classes_vocab", - "in": ["y_pred_ids"], - "out": ["y_pred_labels"] - } - ], - "out": ["y_pred_labels"] - }, - "train": { - "batch_size": 370, - "epochs": 7, - "log_every_n_epochs": 1, - "val_every_n_epochs": 7, - "validation_patience": -1, - "validate_first": false, - "metrics": [ - { - "name": "accuracy_oos", - "inputs": ["y_true_labels", "y_pred_labels"] - } - ], - "evaluation_targets": [ - "valid" - ], - "class_name": "torch_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150", - "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli", - "BASE_MODEL": "roberta-base", - "BINARY_CLASSIFICATION": true - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz", - "subdir": "{MODEL_PATH}" - }, - { - "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz", - "subdir": "{DOWNLOADS_PATH}" - } - ] - } -} diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index 9abb3e4153..307bfc1b91 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -30,8 +30,8 @@ def preprocess_scores(scores, is_binary, class_id: int = 1): class Proba2Labels(Component): def __init__(self, - confidence_threshold: float = 0.5, - pooling: str = 'mean', + 
confidence_threshold: float = 0.0, + pooling: str = 'max', multilabel: bool = False, is_binary: bool = False, **kwargs) -> None: @@ -65,7 +65,7 @@ def __call__(self, probability_by_label = [] for label in unique_labels: ind_mask = np.where(example_y_support == label) - if self.pooling == 'mean': + if self.pooling == 'avg': label_probability = np.mean(example_simmilarity_scores[ind_mask]) elif self.pooling == 'max': label_probability = np.max(example_simmilarity_scores[ind_mask]) diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py index 036030ca77..7a83de7f06 100644 --- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py +++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py @@ -1,6 +1,6 @@ from pathlib import Path from logging import exception, getLogger -from typing import List, Optional +from typing import List, Optional, Tuple import numpy as np import pandas as pd from transformers import AutoTokenizer, BatchEncoding @@ -14,47 +14,41 @@ @register('dnnc_input_preprocessor') class InputPreprocessor(Component): def __init__(self, - support_dataset_path: str, - format: str = "csv", + support_dataset_path: str = None, + bidirectional: bool = False, *args, **kwargs) -> None: - file = Path(support_dataset_path).expanduser() - if file.exists(): - if format == 'csv': - keys = ('sep', 'header', 'names') - options = {k: kwargs[k] for k in keys if k in kwargs} - df = pd.read_csv(file, **options) - elif format == 'json': - keys = ('orient', 'lines') - options = {k: kwargs[k] for k in keys if k in kwargs} - df = pd.read_json(file, **options) + self.bidirectional = bidirectional + if support_dataset_path: + file = Path(support_dataset_path).expanduser() + if file.exists(): + df = pd.read_json(file, orient='split') + self.support_dataset = [(row["text"], str(row["category"])) for _, row in df.iterrows()] else: - raise Exception('Unsupported file format: {}'.format(format)) + log.error(f"Cannot find {support_dataset_path} file") + self.support_dataset = None - x = kwargs.get("x", "text") - y = kwargs.get('y', 'labels') - - self.support_dataset = [(row[x], str(row[y])) for _, row in df.iterrows()] - else: - self.support_dataset = None - log.warning("Cannot find {} file".format(support_dataset_path)) - - def __call__(self, - input_texts : List[str]) -> List[List[str]]: + def __call__(self, input) -> List[List[str]]: ''' Generates all possible ordread pairs from 'input_texts' and 'self.support_dataset' ''' - if self.support_dataset: - hypotesis_batch = [] - premise_batch = [] - hypotesis_labels_batch = [] - - for [premise, [hypotesis, hypotesis_labels]] in zip(input_texts * len(self.support_dataset), - np.repeat(self.support_dataset, len(input_texts), axis=0)): - - premise_batch.append(premise) - hypotesis_batch.append(hypotesis) - hypotesis_labels_batch.append(hypotesis_labels) - return hypotesis_batch, premise_batch, hypotesis_labels_batch + if len(input) <= 1 or isinstance(input[1], str): + texts = input else: - log.warning("Error: no support dataset") + texts, self.support_dataset = input + + hypotesis_batch = [] + premise_batch = [] + hypotesis_labels_batch = [] + for [premise, [hypotesis, hypotesis_labels]] in zip(texts * len(self.support_dataset), + np.repeat(self.support_dataset, len(texts), axis=0)): + premise_batch.append(premise) + hypotesis_batch.append(hypotesis) + hypotesis_labels_batch.append(hypotesis_labels) + + if self.bidirectional: + premise_batch.append(hypotesis) + hypotesis_batch.append(premise) + 
hypotesis_labels_batch.append(hypotesis_labels) + return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch + \ No newline at end of file From c20d2d5f20e7097e2a4902f2d18578e586ad0528 Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Mon, 13 Mar 2023 18:32:23 +0300 Subject: [PATCH 30/57] Fix: index.rst --- docs/index.rst | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 124d63d65e..391bbc58ab 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,7 +9,8 @@ Welcome to DeepPavlov's documentation! QuickStart General concepts Configuration file - Models/Skills overview + Python pipelines + Models overview .. toctree:: @@ -26,27 +27,20 @@ Welcome to DeepPavlov's documentation! :maxdepth: 1 :caption: Models - BERT-based models - Context Question Answering - Classification - Few-shot Classification + + Context Question Answering + Classification + Named Entity Recognition Entity Extraction - Named Entity Recognition + BERT-based models Neural Ranking Spelling Correction TF-IDF Ranking Popularity Ranking Knowledge Base Question answering Relation Extraction - - -.. toctree:: - :glob: - :maxdepth: 1 - :caption: Skills - - Open-Domain Question Answering - Frequently Asked Questions Answering + SuperGLUE Submission + Open-Domain Question Answering .. toctree:: From 5d7a1988da7a4e2e7e7887a0c8f8f579a7af625d Mon Sep 17 00:00:00 2001 From: Maksim Savkin Date: Wed, 15 Mar 2023 17:50:53 +0300 Subject: [PATCH 31/57] Fix: index.rst --- docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 391bbc58ab..dffd62807b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,8 +26,8 @@ Welcome to DeepPavlov's documentation! :glob: :maxdepth: 1 :caption: Models - - + + Multitask BERT Context Question Answering Classification Named Entity Recognition From 5742605676294723f0aa5f3fa184220471a631dc Mon Sep 17 00:00:00 2001 From: LogicZMaksimka Date: Mon, 27 Mar 2023 15:55:34 +0300 Subject: [PATCH 32/57] Fix: empty reference in docs --- .../models/few_shot_classification.ipynb | 183 +++++------------- docs/index.rst | 1 + 2 files changed, 45 insertions(+), 139 deletions(-) diff --git a/docs/features/models/few_shot_classification.ipynb b/docs/features/models/few_shot_classification.ipynb index b9ed328111..ae3f7e3022 100644 --- a/docs/features/models/few_shot_classification.ipynb +++ b/docs/features/models/few_shot_classification.ipynb @@ -1,6 +1,15 @@ { "cells": [ { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Few-shot Text Classification" + ] + }, + { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -10,35 +19,27 @@ "\n", "2. [Dataset](#2.-Datasets)\n", " \n", - " 2.1 [Datasets format](#2.1-Datasets-format)\n", + " 2.1. [Datasets format](#2.1-Datasets-format)\n", "\n", "3. [Model architecture](#3.-Model-architecture)\n", "\n", "4. [Get started with the model](#4.-Get-started-with-the-model)\n", " \n", - " 4.1 [Installation](#4.1-Installation)\n", + " 4.1. [Installation](#4.1-Installation)\n", "\n", - " 4.2 [Support dataset configuration](#4.2-Support-dataset-configuration)\n", + " 4.2. [Support dataset configuration](#4.2-Support-dataset-configuration)\n", "\n", "5. [Use the model for prediction](#5.-Use-the-model-for-prediction)\n", "\n", - " 5.1 [Predict using Python](#5.1-Predict-using-Python)\n", - " \n", - " 5.2 [Predict using CLI](#5.2-Predict-using-CLI)\n", - " \n", - "6. 
[Train the model on your data](#6.-Train-the-model-on-your-data)\n",
-    " \n",
-    " 6.1 [Train your model from Python](#6.1-Train-your-model-from-Python)\n",
-    " \n",
-    " 6.2 [Train your model from CLI](#6.2-Train-your-model-from-CLI)\n",
+    " 5.1. [Predict using Python](#5.1-Predict-using-Python)\n",
    " \n",
-    "7. [Evaluate](#7.-Evaluate)\n",
+    " 5.2. [Predict using CLI](#5.2-Predict-using-CLI)\n",
+    " \n",
+    "6. [Evaluate](#6.-Evaluate)\n",
    " \n",
-    " 7.1 [Evaluate from Python](#7.1-Evaluate-from-Python)\n",
+    " 6.1. [Evaluate from Python](#6.1-Evaluate-from-Python)\n",
    " \n",
-    " 7.2 [Evaluate from CLI](#7.2-Evaluate-from-CLI)\n",
-    " \n",
-    "8. [Metrics](#8.-Metrics)"
+    " 6.2. [Evaluate from CLI](#6.2-Evaluate-from-CLI)"
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -63,12 +64,13 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In our experiments we used the [CLINC150](https://paperswithcode.com/dataset/clinc150) dataset, which has 10 different domains with 15 intents each, 100 shots per intent class and 1000 OOS examples. It simulates a setting where the model has to handle many different services with a wide variety of intents.\n",
    "\n",
-    "Specifically, we validate our model on CLINC150 from the original DNNC paper. We parsed it to match the format described [below](#21-dataset-format). The original dataset can be downloaded from the DNNC [github page](https://github.com/salesforce/DNNC-few-shot-intent)."
+    "Specifically, we validate our model on CLINC150 from the original DNNC paper. We parsed it to match the format described [below](#2.1-Datasets-format). The original dataset can be downloaded from the DNNC [github page](https://github.com/salesforce/DNNC-few-shot-intent)."
   ]
  },
@@ -171,15 +173,16 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python -m deeppavlov install dnnc\n",
-    "!python -m deeppavlov download dnnc"
+    "!python -m deeppavlov install dnnc_infer\n",
+    "!python -m deeppavlov download dnnc_infer"
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "`dnnc` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
+    "`dnnc_infer` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
    "\n",
    "The configuration file defines the model and describes its hyperparameters"
   ]
  },
@@ -192,10 +195,11 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Before making predictions or evaluation you need to set the path to your support dataset. The DNNC model compares the input text to every example in the support dataset to determine which class the input example belongs to. By default, the model uses the training set as the support dataset. It is automatically saved by the *dataset_iterator* during the training step, but you can specify your own support dataset in the `dnnc` config file. It has the same format as mentioned [before]()"
+    "Before making predictions or evaluation you need to set the path to your support dataset. The DNNC model compares the input text to every example in the support dataset to determine which class the input example belongs to. By default, the model uses the training set as the support dataset. It is automatically saved by the *dataset_iterator* during the training step, but you can specify your own support dataset in the `dnnc_infer` config file. It has the same format as mentioned [before](#2.1-Datasets-format)"
   ]
  },
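For intuition, the comparison against the support dataset described in that cell is driven by the pairing step of `dnnc_input_preprocessor`, which matches every input utterance with every support example (and, when `bidirectional` is set, in both directions). The following is a simplified, self-contained sketch of that step, not the component itself; the support examples are invented:

```python
# Hypothetical support set; in the shipped pipeline it is loaded
# from support_dataset.json as (text, category) pairs.
support = [("book me a room in kyoto", "book_hotel"),
           ("how much is one dollar in yen", "exchange_rate")]

def populate(texts, support, bidirectional=True):
    """Build (premise, hypothesis, hypothesis_label) triples for the pairwise classifier."""
    premises, hypotheses, labels = [], [], []
    for text in texts:
        for support_text, support_label in support:
            premises.append(text)
            hypotheses.append(support_text)
            labels.append(support_label)
            if bidirectional:  # also score the reversed direction
                premises.append(support_text)
                hypotheses.append(text)
                labels.append(support_label)
    return premises, hypotheses, labels

premises, hypotheses, labels = populate(["can i travel to japan"], support)
print(len(premises))  # 4 pairs: 2 support examples x 2 directions
```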
@@ -214,7 +218,7 @@
   "source": [
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
-    "model_config = parse_config('dnnc')\n",
+    "model_config = parse_config('dnnc_infer')\n",
    "\n",
    "# support dataset path\n",
    "print(model_config['chainer']['pipe'][0]['support_dataset_path'])"
   ]
  },
@@ -242,6 +246,7 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -264,7 +269,7 @@
   "source": [
    "from deeppavlov.core.commands.utils import parse_config\n",
    "\n",
-    "model_config = parse_config('dnnc')\n",
+    "model_config = parse_config('dnnc_infer')\n",
    "\n",
    "# confidence threshold\n",
    "print(model_config['chainer']['pipe'][-1]['confidence_threshold'])"
   ]
  },
@@ -307,7 +312,7 @@
   "source": [
    "from deeppavlov import build_model, configs\n",
    "\n",
-    "model = build_model(\"dnnc\", install=True, download=True)\n",
+    "model = build_model(\"dnnc_infer\", install=True, download=True)\n",
    "\n",
    "model([\"can you find me a good reviewed hotel in japan\", \"if i get a visa can i travel to japan\"])"
   ]
  },
@@ -332,7 +337,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python -m deeppavlov interact dnnc [-d] [-i]"
+    "!python -m deeppavlov interact dnnc_infer [-d] [-i]"
   ]
  },
@@ -348,7 +353,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "!python -m deeppavlov predict dnnc [-d] [-i]"
+    "!python -m deeppavlov predict dnnc_infer [-d] [-i]"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 6. Train the model on your data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We have a separate `roberta_nli` config for training, which automatically transforms dataset into a pairwise format"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To train the model on your data, you need to change the path to the dataset in `roberta_nli` config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "~/.deeppavlov/downloads/clinc150_no_oos\n"
-     ]
-    }
-   ],
-   "source": [
-    "from deeppavlov.core.commands.utils import parse_config\n",
-    "\n",
-    "model_config = parse_config('roberta_nli')\n",
-    "\n",
-    "# dataset for training\n",
-    "print(model_config['dataset_reader']['data_path'])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 6.1 Train your model from Python"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from deeppavlov import train_model\n",
-    "\n",
-    "model = train_model(\"roberta_nli\", install=True, download=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 6.2 Train your model from CLI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!python -m deeppavlov train roberta_nli [-d] [-i]"
-   ]
-  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# 7. Evaluate"
+    "# 6. 
Evaluate" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "To evaluate the model on your data, you need to change the path to the dataset in `dnnc` config" + "To evaluate the model on your data, you need to change the path to the dataset in `dnnc_infer` config" ] }, { @@ -458,17 +388,18 @@ "source": [ "from deeppavlov.core.commands.utils import parse_config\n", "\n", - "model_config = parse_config('dnnc')\n", + "model_config = parse_config('dnnc_infer')\n", "\n", "# dataset for evaluation\n", "print(model_config['dataset_reader']['data_path'])" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## 7.1 Evaluate from Python" + "## 6.1 Evaluate from Python" ] }, { @@ -479,14 +410,15 @@ "source": [ "from deeppavlov import evaluate_model\n", "\n", - "model = evaluate_model('dnnc', install=True, download=True)" + "model = evaluate_model('dnnc_infer', install=True, download=True)" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## 7.2 Evaluate from CLI" + "## 6.2 Evaluate from CLI" ] }, { @@ -495,34 +427,7 @@ "metadata": {}, "outputs": [], "source": [ - "!python -m deeppavlov evaluate dnnc [-d] [-i]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 8. Metrics" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We follow the original DNNC paper approach and choose the best confidence_threshold on the dev set " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| Config name | Dataset | Shot| Threshold |In-domain Accuracy | OOS Precision | OOS Recall | OOS F1 |\n", - "| :--- | --- | --- | --- | --- | --- | --- | ---: |\n", - "| dnnc | [CLINC150](https://paperswithcode.com/dataset/clinc150) | 5 | 0.6 | 88.8 | 71.0 | 86.2 | 77.8 |\n", - "| dnnc | [CLINC150](https://paperswithcode.com/dataset/clinc150) Banking Domain| 5 | 0.4 | 90.7 | 97.3 | 98.7 | 98.0 |\n", - "| dnnc | [CLINC150](https://paperswithcode.com/dataset/clinc150) Work Domain | 5 | 0.1 | 98.1 | 99.9 | 95.4 | 97.6 |" + "!python -m deeppavlov evaluate dnnc_infer [-d] [-i]" ] } ], @@ -542,7 +447,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10 (default, Jun 22 2022, 20:18:18) \n[GCC 9.4.0]" + "version": "3.8.10" }, "orig_nbformat": 4, "vscode": { diff --git a/docs/index.rst b/docs/index.rst index dffd62807b..704ec886df 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ Welcome to DeepPavlov's documentation! 
Multitask BERT Context Question Answering Classification + Few-shot Classification Named Entity Recognition Entity Extraction BERT-based models From 6faccd3006312cb98ad2b4d21a207404ac566a83 Mon Sep 17 00:00:00 2001 From: LogicZMaksimka Date: Mon, 27 Mar 2023 16:08:01 +0300 Subject: [PATCH 33/57] Fix: metrics registry --- deeppavlov/core/common/metrics_registry.json | 8 ++- deeppavlov/metrics/accuracy.py | 21 ++++++-- deeppavlov/metrics/few_shot_metrics.py | 52 -------------------- deeppavlov/metrics/fmeasure.py | 14 +++++- 4 files changed, 34 insertions(+), 61 deletions(-) delete mode 100644 deeppavlov/metrics/few_shot_metrics.py diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json index c25a32a68a..678ac52183 100644 --- a/deeppavlov/core/common/metrics_registry.json +++ b/deeppavlov/core/common/metrics_registry.json @@ -1,5 +1,6 @@ { "acc": "deeppavlov.metrics.accuracy:round_accuracy", + "accuracy_oos": "deeppavlov.metrics.accuracy:accuracy_oos", "average__ner_f1__f1_macro__f1": "deeppavlov.metrics.fmeasure:ner_f1__f1_macro__f1", "average__roc_auc__roc_auc__ner_f1": "deeppavlov.metrics.fmeasure:roc_auc__roc_auc__ner_f1", "bleu": "deeppavlov.metrics.bleu:bleu", @@ -9,7 +10,6 @@ "f1_macro": "deeppavlov.metrics.fmeasure:round_f1_macro", "f1_weighted": "deeppavlov.metrics.fmeasure:round_f1_weighted", "google_bleu": "deeppavlov.metrics.bleu:google_bleu", - "joint_accuracy_in_recall_oos": "deeppavlov.metrics.few_shot_metrics:joint_accuracy_in_recall_oos", "kbqa_accuracy": "deeppavlov.metrics.accuracy:kbqa_accuracy", "log_loss": "deeppavlov.metrics.log_loss:sk_log_loss", "matthews_correlation": "deeppavlov.metrics.correlation:matthews_correlation", @@ -19,7 +19,7 @@ "multitask_token_accuracy": "deeppavlov.metrics.accuracy:multitask_token_accuracy", "ner_f1": "deeppavlov.metrics.fmeasure:ner_f1", "ner_token_f1": "deeppavlov.metrics.fmeasure:ner_token_f1", - "oos_scores": "deeppavlov.metrics.few_shot_metrics:oos_scores", + "oos_scores": "deeppavlov.metrics.fmeasure:oos_scores", "pearson_correlation": "deeppavlov.metrics.correlation:pearson_correlation", "per_item_bleu": "deeppavlov.metrics.bleu:per_item_bleu", "per_item_dialog_accuracy": "deeppavlov.metrics.accuracy:per_item_dialog_accuracy", @@ -33,8 +33,6 @@ "rank_response": "deeppavlov.models.ranking.metrics:rank_response", "roc_auc": "deeppavlov.metrics.roc_auc_score:roc_auc_score", "sets_accuracy": "deeppavlov.metrics.accuracy:sets_accuracy", - "accuracy_oos": "deeppavlov.metrics.few_shot_metrics:accuracy", - "balanced_accuracy_oos": "deeppavlov.metrics.few_shot_metrics:balanced_accuracy", "slots_accuracy": "deeppavlov.metrics.accuracy:slots_accuracy", "spearman_correlation": "deeppavlov.metrics.correlation:spearman_correlation", "squad_v1_em": "deeppavlov.metrics.squad_metrics:squad_v1_exact_match", @@ -43,4 +41,4 @@ "squad_v2_f1": "deeppavlov.metrics.squad_metrics:squad_v2_f1", "record_f1_score": "deeppavlov.metrics.record_metrics:record_f1_score", "record_em_score": "deeppavlov.metrics.record_metrics:record_em_score" -} \ No newline at end of file +} diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py index 560d92ee71..dbcc7bdd92 100644 --- a/deeppavlov/metrics/accuracy.py +++ b/deeppavlov/metrics/accuracy.py @@ -20,7 +20,6 @@ from deeppavlov.core.common.metrics_registry import register_metric - @register_metric('accuracy') def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float: """ @@ -61,8 +60,11 @@ def multitask_accuracy(*args) -> 
float:
     """
     n = len(args)
     y_true_by_tasks, y_predicted_by_tasks = args[:n // 2], args[n // 2:]
-    y_true, y_predicted = list(zip(*y_true_by_tasks)), list(zip(*y_predicted_by_tasks))
-    return accuracy(y_true, y_predicted)
+    answers = []
+    for true, pred in zip(y_true_by_tasks, y_predicted_by_tasks):
+        answers.append(accuracy(true, pred))
+    final_answer = sum(answers) / len(answers)
+    return final_answer
 
 
 @register_metric('multitask_sequence_accuracy')
@@ -185,3 +187,16 @@ def kbqa_accuracy(y_true, y_predicted):
             total_correct += 1
 
     return total_correct / len(y_true) if len(y_true) else 0
+
+
+@register_metric('accuracy_oos')
+def accuracy_oos(y_true, y_pred, exclude_oos: bool = False) -> float:
+    if exclude_oos:
+        y_true = np.array(y_true)
+        y_pred = np.array(y_pred)
+
+        ind_mask = np.where(y_true == 'oos')
+
+        y_true = np.delete(y_true, ind_mask, 0)
+        y_pred = np.delete(y_pred, ind_mask, 0)
+    return accuracy(y_true, y_pred)
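As a quick sanity check, the two OOS-aware metrics introduced in this patch can be exercised on toy labels. The snippet below re-implements their logic inline (so it runs without DeepPavlov installed); the label strings are illustrative:

```python
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

y_true = ["book_hotel", "oos", "exchange_rate", "oos"]
y_pred = ["book_hotel", "oos", "oos", "book_hotel"]

# accuracy_oos with exclude_oos=True scores only the in-domain utterances
in_domain = [(t, p) for t, p in zip(y_true, y_pred) if t != "oos"]
print(sum(t == p for t, p in in_domain) / len(in_domain))  # 0.5

# oos_scores treats OOS detection as a binary problem over the same labels
scores = precision_recall_fscore_support(
    np.array(y_true) == "oos", np.array(y_pred) == "oos", average="binary")
print(dict(zip(["precision", "recall", "fbeta_score"], scores[:3])))
```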
diff --git a/deeppavlov/metrics/few_shot_metrics.py b/deeppavlov/metrics/few_shot_metrics.py
deleted file mode 100644
index c72b73cc5e..0000000000
--- a/deeppavlov/metrics/few_shot_metrics.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import numpy as np
-from deeppavlov.core.common.metrics_registry import register_metric
-from sklearn.metrics import accuracy_score, \
-    balanced_accuracy_score, \
-    precision_recall_fscore_support
-
-def delete_oos(y_true, y_pred):
-    y_true = np.array(y_true)
-    y_pred = np.array(y_pred)
-
-    ind_mask = np.where(y_true == 'oos')
-
-    y_true = np.delete(y_true, ind_mask, 0)
-    y_pred = np.delete(y_pred, ind_mask, 0)
-    return y_true, y_pred
-
-@register_metric('accuracy_oos')
-def accuracy(y_true, y_pred, exclude_oos: bool = False) -> float:
-    if exclude_oos:
-        y_true, y_pred = delete_oos(y_true, y_pred)
-    return accuracy_score(y_true, y_pred)
-
-
-@register_metric('balanced_accuracy_oos')
-def balanced_accuracy(y_true, y_pred, exclude_oos: bool = False) -> float:
-    if exclude_oos:
-        y_true, y_pred = delete_oos(y_true, y_pred)
-
-    return balanced_accuracy_score(y_true, y_pred)
-
-
-@register_metric('oos_scores')
-def oos_scores(y_true, y_pred):
-    y_true_binary = (np.array(y_true) == "oos")
-    y_pred_binary = (np.array(y_pred) == "oos")
-    scores = precision_recall_fscore_support(y_true_binary, y_pred_binary, average='binary')
-    return dict(zip(["precision", "recall", "fbeta_score"], scores[:3]))
diff --git a/deeppavlov/metrics/fmeasure.py b/deeppavlov/metrics/fmeasure.py
index 884c1e38a5..a33cc79d78 100644
--- a/deeppavlov/metrics/fmeasure.py
+++ b/deeppavlov/metrics/fmeasure.py
@@ -21,6 +21,7 @@
 from sklearn.metrics import f1_score
 
 from deeppavlov.core.common.metrics_registry import register_metric
+from sklearn.metrics import precision_recall_fscore_support
 
 log = getLogger(__name__)
 
@@ -194,7 +195,11 @@ def round_f1(y_true, y_predicted):
     try:
         predictions = [np.round(x) for x in y_predicted]
     except TypeError:
-        predictions = y_predicted
+        if (set(y_true) | set(y_predicted)) in ({"True"}, {"False"}, {"False", "True"}):
+            y_true = [y == "True" for y in y_true]
+            predictions = [y == "True" for y in y_predicted]
+        else:
+            raise RuntimeError(f"Unexpected type for {y_true} and {predictions}")
     return f1_score(y_true, predictions)
 
 
@@ -413,3 +418,10 @@ def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_prob
     roc_auc2 = roc_auc_score(true_onehot2, pred_probas2)
     ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100
     return (roc_auc1 + roc_auc2 + ner_f1_3) / 3
+
+@register_metric('oos_scores')
+def oos_scores(y_true, y_pred):
+    y_true_binary = (np.array(y_true) == "oos")
+    y_pred_binary = (np.array(y_pred) == "oos")
+    scores = precision_recall_fscore_support(y_true_binary, y_pred_binary, average='binary')
+    return dict(zip(["precision", "recall", "fbeta_score"], scores[:3]))

From af23394d6ad2c44d5aef487f7e10ca422cdfa082 Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Wed, 29 Mar 2023 11:57:10 +0300
Subject: [PATCH 34/57] Fix: bidirectional scores averaging

---
 .../configs/classifiers/dnnc_infer.json       |  6 ++--
 .../models/classifiers/dnnc_proba2labels.py   | 32 +++++++------------
 docs/index.rst                                |  1 -
 3 files changed, 15 insertions(+), 24 deletions(-)

diff --git a/deeppavlov/configs/classifiers/dnnc_infer.json b/deeppavlov/configs/classifiers/dnnc_infer.json
index 8b59becd7d..dc4bb32a2a 100644
--- a/deeppavlov/configs/classifiers/dnnc_infer.json
+++ b/deeppavlov/configs/classifiers/dnnc_infer.json
@@ -50,7 +50,7 @@
         {
             "class_name": "dnnc_proba2labels",
             "is_binary": "{BINARY_CLASSIFICATION}",
-            "in": ["simmilarity_scores", "x", "x_populated", "y_support"],
+            "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"],
             "out": ["y_pred"],
             "multilabel": false,
             "confidence_threshold": 0.0,
             "pooling": "max"
@@ -81,13 +81,13 @@
             "ROOT_PATH": "~/.deeppavlov",
             "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150",
             "SUPPORT_DATA_PATH": "{ROOT_PATH}/preprocessed_datasets",
-            "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli",
+            "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10",
             "BINARY_CLASSIFICATION": true,
             "BASE_MODEL": "roberta-base"
         },
         "download": [
             {
-                "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli.tar.gz",
+                "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz",
                 "subdir": "{MODEL_PATH}"
             },
diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py
index 307bfc1b91..b1d19bcb9c 100644
--- a/deeppavlov/models/classifiers/dnnc_proba2labels.py
+++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py
@@ -32,13 +32,11 @@ class Proba2Labels(Component):
     def __init__(self,
                  confidence_threshold: float = 0.0,
                  pooling: str = 'max',
-                 multilabel: bool = False,
                  is_binary: bool = False,
                  **kwargs) -> None:
 
         self.confidence_threshold = confidence_threshold
         self.pooling = pooling
-        self.multilabel = multilabel
         self.is_binary = is_binary
 
 
@@ -46,42 +44,36 @@ def __call__(self,
                  simmilarity_scores: Union[np.ndarray, List[List[float]], List[List[int]]],
                  x: List[str],
                  x_populated: List[str],
-                 y_support: List[str],
-                 *args,
-                 **kwargs):
+                 x_support: List[str],
+                 y_support: List[str]):
 
         y_pred = []
 
         simmilarity_scores = preprocess_scores(simmilarity_scores, self.is_binary)
 
         x_populated = np.array(x_populated)
+        x_support = np.array(x_support)
         y_support = np.array(y_support)
 
         unique_labels = np.unique(y_support)
 
         for example in x:
-            example_mask = np.where(x_populated == example)
+            example_mask = np.where(np.logical_xor(x_populated == example, 
                                                    x_support == example))
             example_simmilarity_scores = simmilarity_scores[example_mask]
             example_y_support = y_support[example_mask]
 
             probability_by_label = []
             for label in unique_labels:
-                ind_mask = np.where(example_y_support == label)
+                label_mask = np.where(example_y_support == label)
+                label_simmilarity_scores = example_simmilarity_scores[label_mask]
                 if self.pooling == 'avg':
-                    label_probability = np.mean(example_simmilarity_scores[ind_mask])
+                    label_probability = np.mean(label_simmilarity_scores)
                 elif self.pooling == 'max':
-                    label_probability = np.max(example_simmilarity_scores[ind_mask])
+                    label_probability = np.max(label_simmilarity_scores)
                 probability_by_label.append(label_probability)
+            probability_by_label = np.array(probability_by_label)
-
-
-
-            if self.multilabel:
-                threshold_mask = np.where(probability_by_label >= self.confidence_threshold)
-                threshold_y_support = unique_labels[threshold_mask]
-                prediction = ["oos"] if threshold_y_support.size == 0 else threshold_y_support
-            else:
-                max_probability = max(probability_by_label)
-                max_probability_label = unique_labels[np.argmax(probability_by_label)]
-                prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label
+            max_probability = max(probability_by_label)
+            max_probability_label = unique_labels[np.argmax(probability_by_label)]
+            prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label
 
             y_pred.append(prediction)
diff --git a/docs/index.rst b/docs/index.rst
index 704ec886df..dffd62807b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -30,7 +30,6 @@ Welcome to DeepPavlov's documentation!
    Multitask BERT
    Context Question Answering
    Classification
-   Few-shot Classification
    Named Entity Recognition
    Entity Extraction
    BERT-based models

From af23394d6ad2c44d5aef487f7e10ca422cdfa082 Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Wed, 29 Mar 2023 15:46:25 +0300
Subject: [PATCH 35/57] Fix: index.rst

---
 docs/index.rst | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/docs/index.rst b/docs/index.rst
index dffd62807b..124d63d65e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -9,8 +9,7 @@ Welcome to DeepPavlov's documentation!
    QuickStart
    General concepts
    Configuration file
-   Python pipelines
-   Models overview
+   Models/Skills overview
 
 
 .. toctree::
@@ -26,21 +25,28 @@ Welcome to DeepPavlov's documentation!
    :glob:
    :maxdepth: 1
    :caption: Models
 
-   Multitask BERT
-   Context Question Answering
-   Classification
-   Named Entity Recognition
-   Entity Extraction
+   BERT-based models
+   Context Question Answering
+   Classification
+   Few-shot Classification
+   Entity Extraction
+   Named Entity Recognition
    Neural Ranking
    Spelling Correction
    TF-IDF Ranking
    Popularity Ranking
    Knowledge Base Question answering
    Relation Extraction
-   SuperGLUE Submission
-   Open-Domain Question Answering
+
+
+.. toctree::
+   :glob:
+   :maxdepth: 1
+   :caption: Skills
+
+   Open-Domain Question Answering
+   Frequently Asked Questions Answering
 
 
 .. toctree::

From 7702d65d0ceb17b576a87c641a78ddf0eee9cae2 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov
Date: Fri, 7 Apr 2023 11:53:27 +0300
Subject: [PATCH 36/57] refactor: minor style changes

---
 .../configs/classifiers/dnnc_infer.json       | 192 +++++++++---------
 .../dataset_iterators/few_shot_iterator.py    |  51 +++--
 deeppavlov/metrics/accuracy.py                |   5 +-
 deeppavlov/metrics/fmeasure.py                |   3 +-
 .../models/classifiers/dnnc_proba2labels.py   |  11 +-
 .../models/preprocessors/dnnc_preprocessor.py |  27 ++-
 6 files changed, 150 insertions(+), 139 deletions(-)

diff --git a/deeppavlov/configs/classifiers/dnnc_infer.json b/deeppavlov/configs/classifiers/dnnc_infer.json
index dc4bb32a2a..c7763f01af 100644
--- a/deeppavlov/configs/classifiers/dnnc_infer.json
+++ b/deeppavlov/configs/classifiers/dnnc_infer.json
@@ -1,99 +1,99 @@
 {
-    "dataset_reader": {
-        "class_name": "basic_classification_reader",
-        "format": "json",
-        "orient": "split",
-        "x": "text",
-        "y": "category",
-        "data_path": "{DOWNLOADS_PATH}",
-        "train": "train.json",
-        "valid": "dev.json",
-        "test": "test.json"
-    },
-    "dataset_iterator": {
-        "class_name": "basic_classification_iterator",
-        "seed": 42,
-        "shuffle": true
-    },
-    "chainer": {
+  "dataset_reader": {
+    "class_name": "basic_classification_reader",
+    "format": "json",
+    "orient": "split",
+    "x": "text",
+    "y": "category",
+    "data_path": "{DOWNLOADS_PATH}",
+    "train": "train.json",
+    "valid": "dev.json",
+    "test": "test.json"
+  },
+  "dataset_iterator": {
+    "class_name": "basic_classification_iterator",
+    "seed": 42,
+    "shuffle": true
+  },
+  "chainer": {
+    "in": ["input"],
+    "in_y": ["y_true"],
+    "pipe": [
+      {
+        "class_name": "dnnc_input_preprocessor",
         "in": ["input"],
-        "in_y": ["y_true"],
-        "pipe": [
-            {
-                "class_name": "dnnc_input_preprocessor",
-                "in": ["input"],
-                "out": ["x", "x_support", "x_populated", "y_support"],
-                "support_dataset_path": "{SUPPORT_DATA_PATH}/support_dataset.json",
-                "bidirectional": true
-            },
-            {
-                "class_name": "torch_transformers_preprocessor",
-                "in": ["x_populated", "x_support"],
-                "out": ["bert_features"],
-                "vocab_file": "{BASE_MODEL}",
-                "do_lower_case": true,
-                "max_seq_length": 128
-            },
-            {
-                "class_name": "torch_transformers_classifier",
-                "main": true,
-                "in": ["bert_features"],
-                "out": ["simmilarity_scores"],
-                "n_classes": 2,
-                "return_probas": true,
-                "pretrained_bert": "{BASE_MODEL}",
-                "save_path": "{MODEL_PATH}/model",
-                "load_path": "{MODEL_PATH}/model",
-                "is_binary": "{BINARY_CLASSIFICATION}",
-                "optimizer_parameters": {"lr": 2e-05}
-            },
-            {
-                "class_name": "dnnc_proba2labels",
-                "is_binary": "{BINARY_CLASSIFICATION}",
-                "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"],
-                "out": ["y_pred"],
-                "multilabel": false,
-                "confidence_threshold": 0.0,
-                "pooling": "max"
-            }
-        ],
-        "out": ["y_pred"]
-    },
-    "train": {
-        "batch_size": 1,
-        "metrics": [
-            {
-                "name": "accuracy_oos",
-                "inputs": ["y_true", "y_pred"],
-                "exclude_oos": true
-            },
-            {
-                "name": "oos_scores",
-                "inputs": ["y_true", "y_pred"]
-            }
-        ],
-        "show_examples": false,
-        "evaluation_targets": ["test"],
-        "class_name": "torch_trainer"
+        "out": ["x", "x_support", "x_populated", "y_support"],
+        "support_dataset_path": "{SUPPORT_DATA_PATH}/support_dataset.json",
+        "bidirectional": true
+      },
+      {
+        "class_name": "torch_transformers_preprocessor",
+        "in": ["x_populated", "x_support"],
+        "out": ["bert_features"],
+        "vocab_file": "{BASE_MODEL}",
+        "do_lower_case": true,
+        "max_seq_length": 128
+      },
+      {
+        "class_name": "torch_transformers_classifier",
+        "main": true,
+        "in": ["bert_features"],
+        "out": ["simmilarity_scores"],
+        "n_classes": 2,
+        "return_probas": true,
+        "pretrained_bert": "{BASE_MODEL}",
+        "save_path": "{MODEL_PATH}/model",
+        "load_path": "{MODEL_PATH}/model",
+        "is_binary": "{BINARY_CLASSIFICATION}",
+        "optimizer_parameters": {"lr": 2e-05}
+      },
+      {
+        "class_name": "dnnc_proba2labels",
+        "is_binary": "{BINARY_CLASSIFICATION}",
+        "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"],
+        "out": ["y_pred"],
+        "multilabel": false,
+        "confidence_threshold": 0.0,
+        "pooling": "max"
+      }
+    ],
+    "out": ["y_pred"]
+  },
+  "train": {
+    "batch_size": 1,
+    "metrics": [
+      {
+        "name": "accuracy_oos",
+        "inputs": ["y_true", "y_pred"],
+        "exclude_oos": true
+      },
+      {
+        "name": "oos_scores",
+        "inputs": ["y_true", "y_pred"]
+      }
+    ],
+    "show_examples": false,
+    "evaluation_targets": ["test"],
+    "class_name": "torch_trainer"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150",
+      "SUPPORT_DATA_PATH": "{ROOT_PATH}/preprocessed_datasets",
+      "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10",
+      "BINARY_CLASSIFICATION": true,
+      "BASE_MODEL": "roberta-base"
     },
-    "metadata": {
-        "variables": {
-            "ROOT_PATH": "~/.deeppavlov",
-            "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150",
-            "SUPPORT_DATA_PATH": "{ROOT_PATH}/preprocessed_datasets",
-            "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10",
-            "BINARY_CLASSIFICATION": true,
-            "BASE_MODEL": "roberta-base"
-        },
-        "download": [
-            {
-                "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz",
-                "subdir": "{MODEL_PATH}"
-            },
-            {
-                "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz",
-                "subdir": "{DOWNLOADS_PATH}"
-            }
-        ]
-    }
-}
\ No newline at end of file
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz",
+        "subdir": "{MODEL_PATH}"
+      },
+      {
+        "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz",
+        "subdir": "{DOWNLOADS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py
index 8b83492352..544d631ff7 100644
--- a/deeppavlov/dataset_iterators/few_shot_iterator.py
+++ b/deeppavlov/dataset_iterators/few_shot_iterator.py
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import json
-from random import Random
-from typing import Dict, Any, List, Tuple, Optional
 from logging import getLogger
 from pathlib import Path
+from random import Random
+from typing import Dict, Any, List, Tuple, Optional
 
 from deeppavlov.core.common.registry import register
 from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
@@ -25,23 +24,21 @@
 ENTAILMENT = 'entailment'
 NON_ENTAILMENT = 'non_entailment'
 
-SUPPORT_DATASET_PATH="~/.deeppavlov/preprocessed_datasets/support_dataset.json"
+SUPPORT_DATASET_PATH = "~/.deeppavlov/preprocessed_datasets/support_dataset.json"
 
 log = getLogger(__name__)
 
+
 @register('few_shot_iterator')
 class FewShotIterator(DataLearningIterator):
-
     def __init__(self,
                  data: Dict[str, List[Tuple[Any, Any]]],
-                 seed: int = None, 
-                 shuffle: bool = True, 
+                 seed: int = None,
+                 shuffle: bool = True,
                  shot: Optional[int] = None,
                  shot_test: Optional[int] = None,
                  return_nli_format: bool = False,
                  *args, **kwargs) -> None:
-        self.shot = shot
-        self.shot_test = shot_test
         self.shuffle = shuffle
         self.random = Random(seed)
@@ -49,23 +46,24 @@ def __init__(self,
         self.valid = self.delete_oos(data.get('valid', []))
         self.test = self.delete_oos(data.get('test', []))
 
-        self.train = self.get_shot_examples(self.train, self.shot)
-        self.valid = self.get_shot_examples(self.valid, self.shot_test)
-        self.test = self.get_shot_examples(self.test, self.shot_test)
-
+        self.train = self.get_shot_examples(self.train, shot)
+        self.valid = self.get_shot_examples(self.valid, shot_test)
+        self.test = self.get_shot_examples(self.test, shot_test)
 
         save_path = Path(SUPPORT_DATASET_PATH).expanduser()
         save_path.parent.mkdir(parents=True, exist_ok=True)
         with save_path.open("w") as file:
-            json_dict = {"columns": ["text","category"]}
-            json_dict["data"] = [[text, label] for text, label in self.train]
+            json_dict = {
+                "columns": ["text", "category"],
+                "data": [[text, label] for text, label in self.train]
+            }
             json.dump(json_dict, file, indent=4)
 
         if return_nli_format:
             self.train = self.convert2nli(self.train)
             self.valid = self.convert2nli(self.valid)
             self.test = self.convert2nli(self.test)
-        
+
         self.data = {
             'train': self.train,
             'valid': self.valid,
@@ -73,7 +71,6 @@ def __init__(self,
             'all': self.train + self.test + self.valid
         }
 
-
     def _gather_info(self, data: List[Tuple[Any, Any]]) -> Tuple[Dict, Dict]:
         unique_labels = list(set([label for text, label in data]))
@@ -82,12 +79,12 @@ def _gather_info(self, data: List[Tuple[Any, Any]]) -> Tuple[Dict, Dict]:
             label2examples[label] = []
         for text, label in data:
             label2examples[label].append(text)
-        
+
         label2negative = {}
         for i, label in enumerate(unique_labels):
             label2negative[label] = unique_labels.copy()
             del label2negative[label][i]
-        
+
         return label2examples, label2negative
 
     def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Tuple[Any, Any], Any]]:
@@ -95,25 +92,25 @@ def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Tuple[Any, Any]
             return data
 
         label2examples, label2negative = self._gather_info(data)
-        
+
         nli_triplets = []
         # negative examples
         for text, label in data:
             for negative_label in label2negative[label]:
                 for negative_example in label2examples[negative_label]:
                     nli_triplets.append([[text, negative_example], NON_ENTAILMENT])
-        
+
         # positive examples
         for text, label in data:
             for positive_example in label2examples[label]:
                 if positive_example != text:
                     nli_triplets.append([[text, positive_example], ENTAILMENT])
-        
+
         if self.shuffle:
             self.random.shuffle(nli_triplets)
-        
+
         return nli_triplets
-    
+
     def delete_oos(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]:
         filtered_data = []
         for text, label in data:
@@ -124,7 +121,7 @@ def delete_oos(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]:
     def get_shot_examples(self, data: List[Tuple[Any, Any]], shot: int) -> List[Tuple[Any, Any]]:
         if shot is None:
             return data
-        
+
         # shuffle data to select shot-examples
         self.random.shuffle(data)
@@ -135,7 +132,7 @@ def get_shot_examples(self, data: List[Tuple[Any, Any]], shot: int) -> List[Tupl
         for text, label in data:
             if len(data_dict[label]) < shot:
                 data_dict[label].append(text)
-        
+
         if max(len(x) for x in data_dict.values()) < shot:
             log.warning(f"Some labels have less than \"shot\"={shot} examples")
 
@@ -143,7 +140,7 @@ def get_shot_examples(self, data: List[Tuple[Any, Any]], shot: int) -> List[Tupl
         for label in data_dict.keys():
             for text in data_dict[label]:
                 new_data.append((text, label))
-        
+
         if self.shuffle:
             self.random.shuffle(new_data)
diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py
index dbcc7bdd92..2474107992 100644
--- a/deeppavlov/metrics/accuracy.py
+++ b/deeppavlov/metrics/accuracy.py
@@ -14,12 +14,13 @@
 
 import itertools
-from typing import List, Iterable
+from typing import List
 
 import numpy as np
 
 from deeppavlov.core.common.metrics_registry import register_metric
 
+
 @register_metric('accuracy')
 def accuracy(y_true: [list, np.ndarray], y_predicted: [list, np.ndarray]) -> float:
     """
@@ -194,7 +195,7 @@ def accuracy_oos(y_true, y_pred, exclude_oos: bool = False) -> float:
     if exclude_oos:
         y_true = np.array(y_true)
         y_pred = np.array(y_pred)
-        
+
         ind_mask = np.where(y_true == 'oos')
 
         y_true = np.delete(y_true, ind_mask, 0)
diff --git a/deeppavlov/metrics/fmeasure.py b/deeppavlov/metrics/fmeasure.py
index a33cc79d78..b3285f0316 100644
--- a/deeppavlov/metrics/fmeasure.py
+++ b/deeppavlov/metrics/fmeasure.py
@@ -19,9 +19,9 @@
 
 import numpy as np
 from sklearn.metrics import f1_score
+from sklearn.metrics import precision_recall_fscore_support
 
 from deeppavlov.core.common.metrics_registry import register_metric
-from sklearn.metrics import precision_recall_fscore_support
 
 log = getLogger(__name__)
@@ -419,6 +419,7 @@ def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_prob
     ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100
     return (roc_auc1 + roc_auc2 + ner_f1_3) / 3
 
+
 @register_metric('oos_scores')
 def oos_scores(y_true, y_pred):
     y_true_binary = (np.array(y_true) == "oos")
diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py
index b1d19bcb9c..01a00c7598 100644
--- a/deeppavlov/models/classifiers/dnnc_proba2labels.py
+++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py
@@ -22,10 +22,12 @@
 
 log = getLogger(__name__)
 
+
 def preprocess_scores(scores, is_binary, class_id: int = 1):
     scores = np.array(scores)
     return scores if is_binary else scores[:, class_id]
 
+
 @register('dnnc_proba2labels')
 class Proba2Labels(Component):
 
@@ -39,7 +41,6 @@ def __init__(self,
         self.pooling = pooling
         self.is_binary = is_binary
 
-
     def __call__(self,
                  simmilarity_scores: Union[np.ndarray, List[List[float]], List[List[int]]],
                  x: List[str],
@@ -55,7 +56,7 @@ def __call__(self,
 
         unique_labels = np.unique(y_support)
 
-        for example in x: 
+        for example in x:
             example_mask = np.where(np.logical_xor(x_populated == example,
                                                    x_support == example))
             example_simmilarity_scores = simmilarity_scores[example_mask]
@@ -69,12 +70,12 @@ def __call__(self,
                 elif self.pooling == 'max':
                     label_probability = np.max(label_simmilarity_scores)
                 probability_by_label.append(label_probability)
-            
+
             probability_by_label = np.array(probability_by_label)
             max_probability = max(probability_by_label)
             max_probability_label = unique_labels[np.argmax(probability_by_label)]
             prediction = "oos" if max_probability < self.confidence_threshold else max_probability_label
-            
+
             y_pred.append(prediction)
-            
+
         return y_pred
diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
index 7a83de7f06..b5a6bc557e 100644
--- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py
+++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
@@ -1,16 +1,30 @@
+# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from logging import getLogger
 from pathlib import Path
-from logging import exception, getLogger
-from typing import List, Optional, Tuple
+from typing import List
+
 import numpy as np
 import pandas as pd
-from transformers import AutoTokenizer, BatchEncoding
 
-from deeppavlov.core.common.errors import ConfigError
 from deeppavlov.core.common.registry import register
 from deeppavlov.core.models.component import Component
 
 log = getLogger(__name__)
 
+
 @register('dnnc_input_preprocessor')
 class InputPreprocessor(Component):
     def __init__(self,
@@ -28,9 +42,7 @@ def __init__(self,
         self.support_dataset = None
 
     def __call__(self, input) -> List[List[str]]:
-        '''
-        Generates all possible ordread pairs from 'input_texts' and 'self.support_dataset'
-        '''
+        """Generates all possible ordered pairs from 'input_texts' and 'self.support_dataset'"""
 
         if len(input) <= 1 or isinstance(input[1], str):
             texts = input
@@ -51,4 +63,3 @@ def __call__(self, input) -> List[List[str]]:
                 hypotesis_batch.append(premise)
                 hypotesis_labels_batch.append(hypotesis_labels)
         return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch
-        
\ No newline at end of file
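The NLI reformulation performed by `FewShotIterator.convert2nli` above pairs every utterance with same-class examples (`entailment`) and with examples of every other class (`non_entailment`). A minimal standalone sketch of that pairing scheme (plain Python; the toy utterances and the helper name are illustrative, not part of the library API):

```python
from itertools import product

ENTAILMENT, NON_ENTAILMENT = 'entailment', 'non_entailment'

def to_nli_pairs(data):
    """data: list of (text, label) tuples, as consumed by the iterator above."""
    pairs = []
    for (text_a, label_a), (text_b, label_b) in product(data, repeat=2):
        if text_a == text_b:
            continue  # an utterance is never paired with itself
        relation = ENTAILMENT if label_a == label_b else NON_ENTAILMENT
        pairs.append([[text_a, text_b], relation])
    return pairs

train = [("what's my balance", "balance"),
         ("how much money do i have", "balance"),
         ("freeze my card", "freeze_card")]
print(to_nli_pairs(train))  # 6 ordered pairs: 2 entailment, 4 non_entailment
```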
From fff9a9d64355416f9851a90cdc8aee7e0b48fea5 Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Fri, 14 Apr 2023 17:30:50 +0300
Subject: [PATCH 37/57] Fix: accuracy_oos arguments

---
 .../configs/classifiers/dnnc_infer.json      |  5 ++---
 deeppavlov/core/common/metrics_registry.json |  3 ++-
 deeppavlov/metrics/accuracy.py               | 19 +++++++++----------
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/deeppavlov/configs/classifiers/dnnc_infer.json b/deeppavlov/configs/classifiers/dnnc_infer.json
index c7763f01af..6b7ceaa3d0 100644
--- a/deeppavlov/configs/classifiers/dnnc_infer.json
+++ b/deeppavlov/configs/classifiers/dnnc_infer.json
@@ -63,9 +63,8 @@
     "batch_size": 1,
     "metrics": [
       {
-        "name": "accuracy_oos",
-        "inputs": ["y_true", "y_pred"],
-        "exclude_oos": true
+        "name": "accuracy_in_domain",
+        "inputs": ["y_true", "y_pred"]
       },
       {
         "name": "oos_scores",
diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json
index 678ac52183..19d14c2395 100644
--- a/deeppavlov/core/common/metrics_registry.json
+++ b/deeppavlov/core/common/metrics_registry.json
@@ -1,6 +1,7 @@
 {
     "acc": "deeppavlov.metrics.accuracy:round_accuracy",
-    "accuracy_oos": "deeppavlov.metrics.accuracy:accuracy_oos",
+    "accuracy": "deeppavlov.metrics.accuracy:accuracy",
+    "accuracy_in_domain": "deeppavlov.metrics.accuracy:accuracy_in_domain",
     "average__ner_f1__f1_macro__f1": "deeppavlov.metrics.fmeasure:ner_f1__f1_macro__f1",
     "average__roc_auc__roc_auc__ner_f1": "deeppavlov.metrics.fmeasure:roc_auc__roc_auc__ner_f1",
     "bleu": "deeppavlov.metrics.bleu:bleu",
diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py
index 2474107992..b21cfd19dc 100644
--- a/deeppavlov/metrics/accuracy.py
+++ b/deeppavlov/metrics/accuracy.py
@@ -190,14 +190,13 @@ def kbqa_accuracy(y_true, y_predicted):
     return total_correct / len(y_true) if len(y_true) else 0
 
 
-@register_metric('accuracy_oos')
-def accuracy_oos(y_true, y_pred, exclude_oos: bool = False) -> float:
-    if exclude_oos:
-        y_true = np.array(y_true)
-        y_pred = np.array(y_pred)
-
-        ind_mask = np.where(y_true == 'oos')
-
-        y_true = np.delete(y_true, ind_mask, 0)
-        y_pred = np.delete(y_pred, ind_mask, 0)
+@register_metric('accuracy_in_domain')
+def accuracy_in_domain(y_true: [list, np.ndarray], y_pred: [list, np.ndarray], oos = 'oos') -> float:
+    y_true = np.array(y_true)
+    y_pred = np.array(y_pred)
+
+    ind_mask = np.where(y_true == oos)
+
+    y_true = np.delete(y_true, ind_mask, 0)
+    y_pred = np.delete(y_pred, ind_mask, 0)
     return accuracy(y_true, y_pred)

From c70568850bef062c20a839568886509d8934a450 Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Wed, 19 Apr 2023 11:38:53 +0300
Subject: [PATCH 38/57] refactor: deleted a few-shot iterator that was not used
 anywhere

---
 .../dataset_iterators/few_shot_iterator.py    | 147 ------------------
 1 file changed, 147 deletions(-)
 delete mode 100644 deeppavlov/dataset_iterators/few_shot_iterator.py

diff --git a/deeppavlov/dataset_iterators/few_shot_iterator.py b/deeppavlov/dataset_iterators/few_shot_iterator.py
deleted file mode 100644
index 544d631ff7..0000000000
--- a/deeppavlov/dataset_iterators/few_shot_iterator.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-from logging import getLogger
-from pathlib import Path
-from random import Random
-from typing import Dict, Any, List, Tuple, Optional
-
-from deeppavlov.core.common.registry import register
-from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
-
-ENTAILMENT = 'entailment'
-NON_ENTAILMENT = 'non_entailment'
-
-SUPPORT_DATASET_PATH = "~/.deeppavlov/preprocessed_datasets/support_dataset.json"
-
-log = getLogger(__name__)
-
-
-@register('few_shot_iterator')
-class FewShotIterator(DataLearningIterator):
-    def __init__(self,
-                 data: Dict[str, List[Tuple[Any, Any]]],
-                 seed: int = None,
-                 shuffle: bool = True,
-                 shot: Optional[int] = None,
-                 shot_test: Optional[int] = None,
-                 return_nli_format: bool = False,
-                 *args, **kwargs) -> None:
-        self.shuffle = shuffle
-        self.random = Random(seed)
-
-        self.train = self.delete_oos(data.get('train', []))
-        self.valid = self.delete_oos(data.get('valid', []))
-        self.test = self.delete_oos(data.get('test', []))
-
-        self.train = self.get_shot_examples(self.train, shot)
-        self.valid = self.get_shot_examples(self.valid, shot_test)
-        self.test = self.get_shot_examples(self.test, shot_test)
-
-        save_path = Path(SUPPORT_DATASET_PATH).expanduser()
-        save_path.parent.mkdir(parents=True, exist_ok=True)
-        with save_path.open("w") as file:
-            json_dict = {
-                "columns": ["text", "category"],
-                "data": [[text, label] for text, label in self.train]
-            }
-            json.dump(json_dict, file, indent=4)
-
-        if return_nli_format:
-            self.train = self.convert2nli(self.train)
-            self.valid = self.convert2nli(self.valid)
-            self.test = self.convert2nli(self.test)
-
-        self.data = {
-            'train': self.train,
-            'valid': self.valid,
-            'test': self.test,
-            'all': self.train + self.test + self.valid
-        }
-
-    def _gather_info(self, data: List[Tuple[Any, Any]]) -> Tuple[Dict, Dict]:
-        unique_labels = list(set([label for text, label in data]))
-
-        label2examples = {}
-        for label in unique_labels:
-            label2examples[label] = []
-        for text, label in data:
-            label2examples[label].append(text)
-
-        label2negative = {}
-        for i, label in enumerate(unique_labels):
-            label2negative[label] = unique_labels.copy()
-            del label2negative[label][i]
-
-        return label2examples, label2negative
-
-    def convert2nli(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Tuple[Any, Any], Any]]:
-        if len(data) == 0:
-            return data
-
-        label2examples, label2negative = self._gather_info(data)
-
-        nli_triplets = []
-        # negative examples
-        for text, label in data:
-            for negative_label in label2negative[label]:
-                for negative_example in label2examples[negative_label]:
-                    nli_triplets.append([[text, negative_example], NON_ENTAILMENT])
-
-        # positive examples
-        for text, label in data:
-            for positive_example in label2examples[label]:
-                if positive_example != text:
-                    nli_triplets.append([[text, positive_example], ENTAILMENT])
-
-        if self.shuffle:
-            self.random.shuffle(nli_triplets)
-
-        return nli_triplets
-
-    def delete_oos(self, data: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]:
-        filtered_data = []
-        for text, label in data:
-            if label != 'oos':
-                filtered_data.append([text, label])
-        return filtered_data
-
-    def get_shot_examples(self, data: List[Tuple[Any, Any]], shot: int) -> List[Tuple[Any, Any]]:
-        if shot is None:
-            return data
-
-        # shuffle data to select shot-examples
-        self.random.shuffle(data)
-
-        data_dict = {}
-        for _, label in data:
-            data_dict[label] = []
-
-        for text, label in data:
-            if len(data_dict[label]) < shot:
-                data_dict[label].append(text)
-
-        if max(len(x) for x in data_dict.values()) < shot:
-            log.warning(f"Some labels have less than \"shot\"={shot} examples")
-
-        new_data = []
-        for label in data_dict.keys():
-            for text in data_dict[label]:
-                new_data.append((text, label))
-
-        if self.shuffle:
-            self.random.shuffle(new_data)
-
-        return new_data

From a5975c862ce611352629a22e001242bbf73bba0f Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Wed, 19 Apr 2023 14:22:00 +0300
Subject: [PATCH 39/57] Refactor: dnnc_preprocessor

---
 deeppavlov/core/common/registry.json          |  2 +-
 .../models/preprocessors/dnnc_preprocessor.py | 44 ++++++++-----------
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json
index 3d1a9b86f9..22b4ca649f 100644
--- a/deeppavlov/core/common/registry.json
+++ b/deeppavlov/core/common/registry.json
@@ -12,7 +12,7 @@
     "dirty_comments_preprocessor": "deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor",
     "docred_reader": "deeppavlov.dataset_readers.docred_reader:DocREDDatasetReader",
     "document_chunker": "deeppavlov.models.preprocessors.odqa_preprocessors:DocumentChunker",
-    "dnnc_input_preprocessor": "deeppavlov.models.preprocessors.dnnc_preprocessor:InputPreprocessor",
+    "dnnc_pair_generator": "deeppavlov.models.preprocessors.dnnc_preprocessor:PairGenerator",
     "dnnc_proba2labels": "deeppavlov.models.classifiers.dnnc_proba2labels:Proba2Labels",
     "entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser",
     "entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker",
diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
index b5a6bc557e..4e3157e8d4 100644
--- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py
+++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
@@ -14,7 +14,7 @@
 
 from logging import getLogger
 from pathlib import Path
-from typing import List
+from typing import List, Tuple
 
 import numpy as np
 import pandas as pd
@@ -25,35 +25,27 @@
 
 log = getLogger(__name__)
 
-@register('dnnc_input_preprocessor')
-class InputPreprocessor(Component):
-    def __init__(self,
-                 support_dataset_path: str = None,
-                 bidirectional: bool = False,
-                 *args, **kwargs) -> None:
-        self.bidirectional = bidirectional
-        if support_dataset_path:
-            file = Path(support_dataset_path).expanduser()
-            if file.exists():
-                df = pd.read_json(file, orient='split')
-                self.support_dataset = [(row["text"], str(row["category"])) for _, row in df.iterrows()]
-            else:
-                log.error(f"Cannot find {support_dataset_path} file")
-                self.support_dataset = None
-
-    def __call__(self, input) -> List[List[str]]:
-        """Generates all possible ordered pairs from 'input_texts' and 'self.support_dataset'"""
+@register('dnnc_pair_generator')
+class PairGenerator(Component):
+    """
+    Generates all possible ordered pairs from 'texts_batch' and 'support_dataset'
+
+    Args:
+        bidirectional: adds pairs in reverse order
+    """
 
-        if len(input) <= 1 or isinstance(input[1], str):
-            texts = input
-        else:
-            texts, self.support_dataset = input
+    def __init__(self, bidirectional: bool = False, *args, **kwargs) -> None:
+        self.bidirectional = bidirectional
 
+    def __call__(self,
+                 texts_batch: List[str],
+                 support_dataset: List[List[str]]
+                 ) -> Tuple[List[str], List[str], List[str], List[str]]:
         hypotesis_batch = []
         premise_batch = []
         hypotesis_labels_batch = []
-        for [premise, [hypotesis, hypotesis_labels]] in zip(texts * len(self.support_dataset),
-                                                            np.repeat(self.support_dataset, len(texts), axis=0)):
+        for [premise, [hypotesis, hypotesis_labels]] in zip(texts_batch * len(support_dataset),
+                                                            np.repeat(support_dataset, len(texts_batch), axis=0)):
             premise_batch.append(premise)
             hypotesis_batch.append(hypotesis)
             hypotesis_labels_batch.append(hypotesis_labels)
@@ -62,4 +54,4 @@ def __call__(self, input) -> List[List[str]]:
             premise_batch.append(hypotesis)
             hypotesis_batch.append(premise)
             hypotesis_labels_batch.append(hypotesis_labels)
-        return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch
+        return texts_batch, hypotesis_batch, premise_batch, hypotesis_labels_batch
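After this refactoring the pair generation is a pure function of its inputs, which makes it easy to sanity-check in isolation. A usage sketch (the two support rows below are invented; with `bidirectional=True` every (text, support example) pair is emitted in both orders):

```python
from deeppavlov.models.preprocessors.dnnc_preprocessor import PairGenerator

generator = PairGenerator(bidirectional=True)

texts = ["transfer money to my savings account"]
support = [["move cash between my accounts", "transfer"],
           ["what is my routing number", "routing"]]

texts_out, hypotheses, premises, labels = generator(texts, support)
# 1 text * 2 support rows * 2 directions = 4 pairs
print(len(premises), labels)
```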
"text", - "y": "category", - "data_path": "{DOWNLOADS_PATH}", - "train": "train.json", - "valid": "dev.json", - "test": "test.json" - }, - "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "shuffle": true - }, "chainer": { - "in": ["input"], + "in": ["texts_batch", "support_dataset"], "in_y": ["y_true"], "pipe": [ { - "class_name": "dnnc_input_preprocessor", - "in": ["input"], + "class_name": "dnnc_pair_generator", + "in": ["texts_batch", "support_dataset"], "out": ["x", "x_support", "x_populated", "y_support"], - "support_dataset_path": "{SUPPORT_DATA_PATH}/support_dataset.json", "bidirectional": true }, { @@ -44,8 +27,7 @@ "pretrained_bert": "{BASE_MODEL}", "save_path": "{MODEL_PATH}/model", "load_path": "{MODEL_PATH}/model", - "is_binary": "{BINARY_CLASSIFICATION}", - "optimizer_parameters": {"lr": 2e-05} + "is_binary": "{BINARY_CLASSIFICATION}" }, { "class_name": "dnnc_proba2labels", @@ -59,22 +41,6 @@ ], "out": ["y_pred"] }, - "train": { - "batch_size": 1, - "metrics": [ - { - "name": "accuracy_in_domain", - "inputs": ["y_true", "y_pred"] - }, - { - "name": "oos_scores", - "inputs": ["y_true", "y_pred"] - } - ], - "show_examples": false, - "evaluation_targets": ["test"], - "class_name": "torch_trainer" - }, "metadata": { "variables": { "ROOT_PATH": "~/.deeppavlov", diff --git a/deeppavlov/models/torch_bert/torch_transformers_classifier.py b/deeppavlov/models/torch_bert/torch_transformers_classifier.py index ee8e1c7597..2a44c87871 100644 --- a/deeppavlov/models/torch_bert/torch_transformers_classifier.py +++ b/deeppavlov/models/torch_bert/torch_transformers_classifier.py @@ -75,7 +75,7 @@ def __init__(self, n_classes, optimizer_parameters = {"lr": 1e-3, "weight_decay": 0.01, "betas": (0.9, 0.999), - "eps": 1e-6}, + "eps": 1e-6} self.return_probas = return_probas self.one_hot_labels = one_hot_labels From 799e64c4c2c061e8dc18e3ab0a32b976c143a0e5 Mon Sep 17 00:00:00 2001 From: LogicZMaksimka Date: Wed, 19 Apr 2023 15:15:08 +0300 Subject: [PATCH 42/57] canceled changes in torch_transformers_classifier --- .../models/torch_bert/torch_transformers_classifier.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/deeppavlov/models/torch_bert/torch_transformers_classifier.py b/deeppavlov/models/torch_bert/torch_transformers_classifier.py index 2a44c87871..fc549b0cb2 100644 --- a/deeppavlov/models/torch_bert/torch_transformers_classifier.py +++ b/deeppavlov/models/torch_bert/torch_transformers_classifier.py @@ -68,7 +68,6 @@ def __init__(self, n_classes, bert_config_file: Optional[str] = None, is_binary: Optional[bool] = False, num_special_tokens: int = None, - binary_head_dropout: int = 0.1, **kwargs) -> None: if not optimizer_parameters: @@ -89,7 +88,6 @@ def __init__(self, n_classes, self.is_binary = is_binary self.bert_config = None self.num_special_tokens = num_special_tokens - self.binary_head_dropout = binary_head_dropout if self.multilabel and not self.one_hot_labels: raise RuntimeError('Use one-hot encoded labels for multilabel classification!') @@ -206,7 +204,7 @@ def load(self, fname=None): if self.pretrained_bert: log.debug(f"From pretrained {self.pretrained_bert}.") config = AutoConfig.from_pretrained(self.pretrained_bert, - classifier_dropout=self.binary_head_dropout, + # num_labels=self.n_classes, output_attentions=False, output_hidden_states=False) @@ -215,6 +213,9 @@ def load(self, fname=None): self.model = AutoModelForBinaryClassification(self.pretrained_bert, config) else: self.model = 
AutoModelForSequenceClassification.from_pretrained(self.pretrained_bert, config=config) + + # TODO need a better solution here and at + # deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel.load try: hidden_size = self.model.classifier.out_proj.in_features @@ -322,7 +323,7 @@ def __init__(self, config): self.config = config self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = torch.nn.Dropout(config.classifier_dropout) + self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) self.out_proj = torch.nn.Linear(config.hidden_size, 1) def init_weights(self): From 56ec9e33af8fe69b9e1ff3ae377481e90911abf2 Mon Sep 17 00:00:00 2001 From: LogicZMaksimka Date: Thu, 20 Apr 2023 11:04:39 +0300 Subject: [PATCH 43/57] Fix: removed few_shot_iterator from registry --- deeppavlov/core/common/registry.json | 1 - 1 file changed, 1 deletion(-) diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 22b4ca649f..19ae33eb56 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -17,7 +17,6 @@ "entity_detection_parser": "deeppavlov.models.entity_extraction.entity_detection_parser:EntityDetectionParser", "entity_linker": "deeppavlov.models.entity_extraction.entity_linking:EntityLinker", "faq_reader": "deeppavlov.dataset_readers.faq_reader:FaqDatasetReader", - "few_shot_iterator": "deeppavlov.dataset_iterators.few_shot_iterator:FewShotIterator", "fasttext": "deeppavlov.models.embedders.fasttext_embedder:FasttextEmbedder", "fit_trainer": "deeppavlov.core.trainers.fit_trainer:FitTrainer", "hashing_tfidf_vectorizer": "deeppavlov.models.vectorizers.hashing_tfidf_vectorizer:HashingTfIdfVectorizer", From 255a4258ab54120d96b1eb4000e60c417d05f4f4 Mon Sep 17 00:00:00 2001 From: vasily Date: Wed, 14 Jun 2023 10:21:43 +0300 Subject: [PATCH 44/57] fix: delete whitespaces --- deeppavlov/models/classifiers/dnnc_proba2labels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index c9e913a160..ccd90505c5 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -61,7 +61,7 @@ def __call__(self, # Transform probits vector into a simmilarity score if not self.is_binary: - simmilarity_scores = simmilarity_scores[:, 1] + simmilarity_scores = simmilarity_scores[:, 1] for example in x: example_mask = np.where(np.logical_xor(x_populated == example, x_support == example)) From 8bdf2bd4e2adaa01f2162e6586438c7f9a25351b Mon Sep 17 00:00:00 2001 From: vasily Date: Wed, 14 Jun 2023 13:17:45 +0300 Subject: [PATCH 45/57] fix: delete unused --- deeppavlov/models/classifiers/dnnc_proba2labels.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py index ccd90505c5..b647fd2421 100644 --- a/deeppavlov/models/classifiers/dnnc_proba2labels.py +++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py @@ -13,7 +13,7 @@ # limitations under the License. 
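The decision rule these commits converge on reduces, for each input utterance, to pooling its pairwise scores per candidate label and thresholding the best label against `confidence_threshold`. A condensed numpy sketch of that logic (the scores and labels below are invented for illustration):

```python
import numpy as np

def dnnc_decision(scores, labels, confidence_threshold=0.0, pooling='max'):
    """scores[i]: similarity of the input to support example i, which has label labels[i]."""
    scores, labels = np.asarray(scores), np.asarray(labels)
    pool = np.max if pooling == 'max' else np.mean
    unique_labels = np.unique(labels)
    per_label = np.array([pool(scores[labels == label]) for label in unique_labels])
    best = int(per_label.argmax())
    # below the threshold the utterance is treated as out-of-scope
    return unique_labels[best] if per_label[best] >= confidence_threshold else 'oos'

print(dnnc_decision([0.9, 0.2, 0.3], ['transfer', 'transfer', 'routing'],
                    confidence_threshold=0.5))  # -> transfer
```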
From 8bdf2bd4e2adaa01f2162e6586438c7f9a25351b Mon Sep 17 00:00:00 2001
From: vasily
Date: Wed, 14 Jun 2023 13:17:45 +0300
Subject: [PATCH 45/57] fix: delete unused

---
 deeppavlov/models/classifiers/dnnc_proba2labels.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py
index ccd90505c5..b647fd2421 100644
--- a/deeppavlov/models/classifiers/dnnc_proba2labels.py
+++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from logging import getLogger
-from typing import List, Union
+from typing import List
 
 import numpy as np
 
@@ -37,9 +37,7 @@ class Proba2Labels(Component):
     def __init__(self,
                  confidence_threshold: float = 0.0,
                  pooling: str = 'max',
-                 is_binary: bool = True,
-                 *args,
-                 **kwargs) -> None:
+                 is_binary: bool = True) -> None:
 
         self.confidence_threshold = confidence_threshold
         self.pooling = pooling

From ebcff3a62f563ce7d84e67b777e1c48223c9bad5 Mon Sep 17 00:00:00 2001
From: vasily
Date: Wed, 14 Jun 2023 13:23:28 +0300
Subject: [PATCH 46/57] fix: call arguments

---
 deeppavlov/models/classifiers/dnnc_proba2labels.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py
index b647fd2421..46a2eed9d9 100644
--- a/deeppavlov/models/classifiers/dnnc_proba2labels.py
+++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py
@@ -48,7 +48,10 @@ def __call__(self,
                  x: List[str],
                  x_populated: List[str],
                  x_support: List[str],
-                 y_support: List[str]) -> List[str]:
+                 y_support: List[str],
+                 *args,
+                 **kwargs) -> List[str]:
+
         y_pred = []
 
         simmilarity_scores = np.array(simmilarity_scores)

From c6f57d0782435b135f62e8f5b79db35fa0208a3f Mon Sep 17 00:00:00 2001
From: vasily
Date: Wed, 14 Jun 2023 13:27:45 +0300
Subject: [PATCH 47/57] fix: delete whitespaces

---
 deeppavlov/models/preprocessors/dnnc_preprocessor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
index 4e3157e8d4..29deafe655 100644
--- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py
+++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
@@ -34,11 +34,11 @@ class PairGenerator(Component):
         bidirectional: adds pairs in reverse order
     """
 
-    def __init__(self, bidirectional: bool = False, *args, **kwargs) -> None:
+    def __init__(self, bidirectional: bool = False) -> None:
         self.bidirectional = bidirectional
 
-    def __call__(self, 
-                 texts_batch: List[str], 
+    def __call__(self,
+                 texts_batch: List[str],
                  support_dataset: List[List[str]]
                  ) -> Tuple[List[str], List[str], List[str], List[str]]:
         hypotesis_batch = []

From dc55ea789dc276ab434a09ee85685662796ee060 Mon Sep 17 00:00:00 2001
From: vasily
Date: Wed, 14 Jun 2023 13:28:22 +0300
Subject: [PATCH 48/57] fix: remove unused

---
 deeppavlov/models/preprocessors/dnnc_preprocessor.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
index 29deafe655..eaad4406c9 100644
--- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py
+++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
@@ -13,11 +13,9 @@
 # limitations under the License.
 
 from logging import getLogger
-from pathlib import Path
 from typing import List, Tuple
 
 import numpy as np
-import pandas as pd
 
 from deeppavlov.core.common.registry import register
 from deeppavlov.core.models.component import Component

From 243589b44e96d891737e4780721c446c2cf4e7e5 Mon Sep 17 00:00:00 2001
From: vasily
Date: Wed, 14 Jun 2023 13:30:53 +0300
Subject: [PATCH 49/57] fix: __call__ arguments

---
 deeppavlov/models/preprocessors/dnnc_preprocessor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
index eaad4406c9..14da55c2e4 100644
--- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py
+++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
@@ -37,8 +37,9 @@ def __init__(self, bidirectional: bool = False) -> None:
 
     def __call__(self,
                  texts_batch: List[str],
-                 support_dataset: List[List[str]]
-                 ) -> Tuple[List[str], List[str], List[str], List[str]]:
+                 support_dataset: List[List[str]],
+                 *args,
+                 **kwargs) -> Tuple[List[str], List[str], List[str], List[str]]:
         hypotesis_batch = []
         premise_batch = []
         hypotesis_labels_batch = []
From ebcdcff8c3f0ce38ab31f62b44754eb87f4a8341 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov
Date: Tue, 27 Jun 2023 07:41:58 +0300
Subject: [PATCH 50/57] docs: optimize few_shot_classification ipynb file

---
 .../models/few_shot_classification.ipynb      | 222 ++++--------------
 1 file changed, 44 insertions(+), 178 deletions(-)

diff --git a/docs/features/models/few_shot_classification.ipynb b/docs/features/models/few_shot_classification.ipynb
index ae3f7e3022..51b4637609 100644
--- a/docs/features/models/few_shot_classification.ipynb
+++ b/docs/features/models/few_shot_classification.ipynb
@@ -5,14 +5,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "#### Few-shot Text Classification"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "#### Few-shot Text Classification\n",
+    "\n",
     "# Table of contents \n",
     "\n",
     "1. [Introduction to the task](#1.-Introduction-to-the-task)\n",
@@ -39,51 +33,20 @@
     " \n",
     "    6.1. [Evaluate from Python](#6.1-Evaluate-from-Python)\n",
     " \n",
-    "    6.2. [Evaluate from CLI](#6.2-Evaluate-from-CLI)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 1. Introduction to the task"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "__Text classification__ is a task of identifying one of the pre-defined labels given an utterance, where the label is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider few-shot setting, where only few examples (5 or 10) per intent class are given as a training set.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 2. Dataset"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "    6.2. [Evaluate from CLI](#6.2-Evaluate-from-CLI)\n",
+    "\n",
+    "# 1. Introduction to the task\n",
+    "\n",
+    "__Text classification__ is a task of identifying one of the pre-defined labels given an utterance, where the label is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider few-shot setting, where only few examples (5 or 10) per intent class are given as a training set.\n",
+    "\n",
+    "# 2. Dataset\n",
+    "\n",
     "In our experiments we used the [CLINC150](https://paperswithcode.com/dataset/clinc150) dataset, which has 10 different domains with 15 intents each, 100 shots per intent class and 1000 OOS examples. It simulates a setting, where model has to handle many different services with wide variety of intents.\n",
     "\n",
-    "Specifically, we validate our model on CLINC150 from the original DNNC paper. We parsed it to match the format described [below](#2.1-Datasets-format). The original dataset can be downloaded from the DNNC [github page](https://github.com/salesforce/DNNC-few-shot-intent)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2.1 Datasets format"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "Specifically, we validate our model on CLINC150 from the original DNNC paper. We parsed it to match the format described [below](#2.1-Datasets-format). The original dataset can be downloaded from the DNNC [github page](https://github.com/salesforce/DNNC-few-shot-intent).\n",
+    "\n",
+    "## 2.1 Datasets format\n",
+    "\n",
     "Train, dev and test set are separate json files, which have the following format\n",
     "\n",
     "```\n",
     "{\n",
     "    \"columns\": [\n",
     "        \"text\",\n",
     "        \"category\"\n",
     "    ],\n",
     "\n",
     "    \"data\": [\n",
     "\n",
     "        [\n",
     "            \"text\"\n",
     "            \"label\"\n",
     "        ],\n",
     "\n",
     "        ...\n",
     "    ]\n",
     "}\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 3. Model architecture"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "```\n",
+    "\n",
+    "# 3. Model architecture\n",
+    "\n",
     "The typical methodology of few-shot text classification is to embed each example into a vector space and use an off-the-shelf distance metric to perform a similarity search. However, the text embedding methods do not discriminate the OOS examples well enough.\n",
     "\n",
     "\n",
     "DNNC authors suggest to model fine-grained relations of utterance pairs via pairwise similarity:\n",
     "\n",
     "$h = BERT([[CLS], u, [SEP], e_{j,i}, [SEP]]) \\in \\mathbb{R}^d$\n",
     "\n",
     "$S(u, e_{j,i}) = \\sigma(W * h + b) \\in \\mathbb{R}$, where $e_{j, i} \\in E$ - training set, $W \\in \\mathbb{R}^{1 \\times d}$, $b \\in \\mathbb{R}$\n",
     "\n",
-    "To mitigate the data scarcity setting in few-shot learning, DNNC uses knowledge-transfer from NLI task. We pretrain [roberta-base](https://huggingface.co/roberta-base) on combination of 3 NLI datasets: SNLI, WNLI, MNLI."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# 4. Get started with the model"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 4.1 Installation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "To mitigate the data scarcity setting in few-shot learning, DNNC uses knowledge-transfer from NLI task. We pretrain [roberta-base](https://huggingface.co/roberta-base) on combination of 3 NLI datasets: SNLI, WNLI, MNLI.\n",
+    "\n",
+    "# 4. Get started with the model\n",
+    "\n",
+    "## 4.1 Installation\n",
+    "\n",
     "First make sure you have the DeepPavlov Library installed.\n",
     "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "!pip install deeppavlov"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "Then make sure that all the required packages, datasets and weights are installed."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "!python -m deeppavlov install dnnc_infer\n",
     "!python -m deeppavlov download dnnc_infer"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "`dnnc_infer` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
     "\n",
-    "Configuration file defines the model and describes its hyperparameters"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 4.2 Support dataset configuration"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "Configuration file defines the model and describes its hyperparameters\n",
+    "\n",
+    "## 4.2 Support dataset configuration\n",
+    "\n",
     "Before making predictions or evaluation you need to set path to your support dataset. DNNC model compares input text to every example in support dataset to determine, which class the input example belongs to. By default, the model uses training set as support dataset. It is automatically saved by *dataset_iterator* during the training step, but you can specify your own support dataset in the `dnnc_infer` config file. It has the same format as mentioned [before](#2.1-Datasets-format)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "~/.deeppavlov/parsed_datasets/parsed_dataset.json\n"
      ]
     }
    ],
    "source": [
     "from deeppavlov.core.commands.utils import parse_config\n",
     "\n",
     "model_config = parse_config('dnnc_infer')\n",
     "\n",
     "# dataset for predictions\n",
     "print(model_config['chainer']['pipe'][0]['support_dataset_path'])"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Off-the-shelf prediction"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Base model was already pre-trained to recognize similar utterances, so you can use off-the-shelf model to make predictions and evaluation. No additional training needed."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### OOS prediction"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "### Off-the-shelf prediction\n",
+    "\n",
+    "Base model was already pre-trained to recognize similar utterances, so you can use off-the-shelf model to make predictions and evaluation. No additional training needed.\n",
+    "\n",
+    "### OOS prediction\n",
+    "\n",
     "Out-of-scope (OOS) examples are determined via confidence_threshold parameter with the following algorithm. Firstly model calculates an average similarity score for every class from support dataset. Secondly it determines the class with maximum similarity score. Finally the model predicts class with maximum similarity if its score is higher than confidence_threshold and \"oos\" class otherwise. The higher the threshold, the more often the model predicts \"oos\" class. By default it is set to 0.5 and you can change it to your preferences in configuration file"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 5. Use the model for prediction"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 5.1 Predict using Python"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict. If you set 'download' flag to 'True', then existing model weights will be overwritten. "
+    "# 5. Use the model for prediction\n",
+    "\n",
+    "## 5.1 Predict using Python\n",
+    "\n",
+    "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict. If you set 'download' flag to 'True', then existing model weights will be overwritten."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 5.2 Predict using CLI"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "## 5.2 Predict using CLI\n",
+    "\n",
     "You can also get predictions in an interactive mode through CLI."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 6. Evaluate"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "# 6. Evaluate\n",
+    "\n",
     "To evaluate the model on your data, you need to change the path to the dataset in `dnnc_infer` config"
    ]
   }
  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "Python 3.8.10 ('dnnc_env': venv)",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.8.10"
-   },
-   "orig_nbformat": 4,
-   "vscode": {
-    "interpreter": {
-     "hash": "0856b36c2b26de82c1ead3e3019420db4ffa1511d91e779db8eb71f2314c36e4"
-    }
-   }
-  },
+ "metadata": {},
  "nbformat": 4,
  "nbformat_minor": 2
 }

From 453746369a05646b8dd6d10e6db9802dbc32e371 Mon Sep 17 00:00:00 2001
From: Fedor Ignatov
Date: Tue, 27 Jun 2023 07:50:09 +0300
Subject: [PATCH 51/57] remove: trailing spaces

---
 deeppavlov/models/classifiers/dnnc_proba2labels.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py
index 46a2eed9d9..89bc1e1a54 100644
--- a/deeppavlov/models/classifiers/dnnc_proba2labels.py
+++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py
@@ -49,9 +49,9 @@ def __call__(self,
                  x_populated: List[str],
                  x_support: List[str],
                  y_support: List[str],
-                 *args, 
+                 *args,
                  **kwargs) -> List[str]:
-        
+
         y_pred = []
 
         simmilarity_scores = np.array(simmilarity_scores)
@@ -62,7 +62,7 @@ def __call__(self,
 
         # Transform the probability vector into a similarity score
         if not self.is_binary:
-            simmilarity_scores = simmilarity_scores[:, 1] 
+            simmilarity_scores = simmilarity_scores[:, 1]
 
         for example in x:
             example_mask = np.where(np.logical_xor(x_populated == example,
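With the notebook reorganized as above, the whole config can also be exercised from Python; a minimal sketch (the utterance and the two support rows are made up, and the argument shapes are an assumption based on the chainer's `in: ["texts_batch", "support_dataset"]` declaration at this point in the history):

```python
from deeppavlov import build_model

# download=True fetches the pretrained roberta_nli_mrpc_1_10 weights
# referenced in the metadata section of dnnc_infer.json
model = build_model('dnnc_infer', download=True)

support = [["move cash between my accounts", "transfer"],
           ["what is my routing number", "routing"]]
print(model(["send money to my savings"], support))
```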
From a05646b8dd6d10e6db9802dbc32e3710571743b6 Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Tue, 27 Jun 2023 11:07:59 +0300
Subject: [PATCH 52/57] fix: remove unused metrics

---
 deeppavlov/core/common/metrics_registry.json |  2 --
 deeppavlov/metrics/accuracy.py               | 12 ------------
 deeppavlov/metrics/fmeasure.py               |  9 ---------
 3 files changed, 23 deletions(-)

diff --git a/deeppavlov/core/common/metrics_registry.json b/deeppavlov/core/common/metrics_registry.json
index 19d14c2395..7632c22dda 100644
--- a/deeppavlov/core/common/metrics_registry.json
+++ b/deeppavlov/core/common/metrics_registry.json
@@ -1,7 +1,6 @@
 {
     "acc": "deeppavlov.metrics.accuracy:round_accuracy",
     "accuracy": "deeppavlov.metrics.accuracy:accuracy",
-    "accuracy_in_domain": "deeppavlov.metrics.accuracy:accuracy_in_domain",
     "average__ner_f1__f1_macro__f1": "deeppavlov.metrics.fmeasure:ner_f1__f1_macro__f1",
     "average__roc_auc__roc_auc__ner_f1": "deeppavlov.metrics.fmeasure:roc_auc__roc_auc__ner_f1",
     "bleu": "deeppavlov.metrics.bleu:bleu",
@@ -20,7 +19,6 @@
     "multitask_token_accuracy": "deeppavlov.metrics.accuracy:multitask_token_accuracy",
     "ner_f1": "deeppavlov.metrics.fmeasure:ner_f1",
     "ner_token_f1": "deeppavlov.metrics.fmeasure:ner_token_f1",
-    "oos_scores": "deeppavlov.metrics.fmeasure:oos_scores",
     "pearson_correlation": "deeppavlov.metrics.correlation:pearson_correlation",
     "per_item_bleu": "deeppavlov.metrics.bleu:per_item_bleu",
     "per_item_dialog_accuracy": "deeppavlov.metrics.accuracy:per_item_dialog_accuracy",
diff --git a/deeppavlov/metrics/accuracy.py b/deeppavlov/metrics/accuracy.py
index b21cfd19dc..b77696fa46 100644
--- a/deeppavlov/metrics/accuracy.py
+++ b/deeppavlov/metrics/accuracy.py
@@ -188,15 +188,3 @@ def kbqa_accuracy(y_true, y_predicted):
             total_correct += 1
 
     return total_correct / len(y_true) if len(y_true) else 0
-
-
-@register_metric('accuracy_in_domain')
-def accuracy_in_domain(y_true: [list, np.ndarray], y_pred: [list, np.ndarray], oos = 'oos') -> float:
-    y_true = np.array(y_true)
-    y_pred = np.array(y_pred)
-
-    ind_mask = np.where(y_true == oos)
-
-    y_true = np.delete(y_true, ind_mask, 0)
-    y_pred = np.delete(y_pred, ind_mask, 0)
-    return accuracy(y_true, y_pred)
diff --git a/deeppavlov/metrics/fmeasure.py b/deeppavlov/metrics/fmeasure.py
index b3285f0316..442d3d5262 100644
--- a/deeppavlov/metrics/fmeasure.py
+++ b/deeppavlov/metrics/fmeasure.py
@@ -19,7 +19,6 @@
 
 import numpy as np
 from sklearn.metrics import f1_score
-from sklearn.metrics import precision_recall_fscore_support
 
 from deeppavlov.core.common.metrics_registry import register_metric
 
@@ -418,11 +417,3 @@ def roc_auc__roc_auc__ner_f1(true_onehot1, pred_probas1, true_onehot2, pred_prob
     roc_auc2 = roc_auc_score(true_onehot2, pred_probas2)
     ner_f1_3 = ner_f1(ner_true3, ner_pred3) / 100
     return (roc_auc1 + roc_auc2 + ner_f1_3) / 3
-
-
-@register_metric('oos_scores')
-def oos_scores(y_true, y_pred):
-    y_true_binary = (np.array(y_true) == "oos")
-    y_pred_binary = (np.array(y_pred) == "oos")
-    scores = precision_recall_fscore_support(y_true_binary, y_pred_binary, average='binary')
-    return dict(zip(["precision", "recall", "fbeta_score"], scores[:3]))
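The two metrics deleted here are still easy to reproduce outside the registry when an evaluation needs them; a sklearn-based sketch equivalent to the removed `accuracy_in_domain` and `oos_scores` helpers:

```python
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def accuracy_in_domain(y_true, y_pred, oos='oos'):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    in_domain = y_true != oos  # drop out-of-scope ground truth, as the removed metric did
    return accuracy_score(y_true[in_domain], y_pred[in_domain])

def oos_scores(y_true, y_pred, oos='oos'):
    precision, recall, fbeta, _ = precision_recall_fscore_support(
        np.asarray(y_true) == oos, np.asarray(y_pred) == oos, average='binary')
    return {'precision': precision, 'recall': recall, 'fbeta_score': fbeta}
```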
From 628f1e322868c3df6291203fdb787b7888c2b0c1 Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Wed, 28 Jun 2023 11:46:34 +0300
Subject: [PATCH 53/57] remove: unused parameters

---
 deeppavlov/configs/classifiers/dnnc_infer.json | 14 +++-----------
 .../models/classifiers/dnnc_proba2labels.py    |  9 +++++----
 .../models/preprocessors/dnnc_preprocessor.py  | 15 +++++++--------
 3 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/deeppavlov/configs/classifiers/dnnc_infer.json b/deeppavlov/configs/classifiers/dnnc_infer.json
index 953e6ed48e..21ae54754a 100644
--- a/deeppavlov/configs/classifiers/dnnc_infer.json
+++ b/deeppavlov/configs/classifiers/dnnc_infer.json
@@ -1,11 +1,11 @@
 {
   "chainer": {
-    "in": ["texts_batch", "support_dataset"],
+    "in": ["texts", "dataset"],
     "in_y": ["y_true"],
     "pipe": [
       {
         "class_name": "dnnc_pair_generator",
-        "in": ["texts_batch", "support_dataset"],
+        "in": ["texts", "dataset"],
         "out": ["x", "x_support", "x_populated", "y_support"],
         "bidirectional": true
       },
@@ -34,9 +34,7 @@
         "is_binary": "{BINARY_CLASSIFICATION}",
         "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"],
         "out": ["y_pred"],
-        "multilabel": false,
-        "confidence_threshold": 0.0,
-        "pooling": "max"
+        "confidence_threshold": 0.0
       }
     ],
     "out": ["y_pred"]
@@ -44,8 +42,6 @@
   "metadata": {
     "variables": {
       "ROOT_PATH": "~/.deeppavlov",
-      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads/clinc150",
-      "SUPPORT_DATA_PATH": "{ROOT_PATH}/preprocessed_datasets",
       "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10",
       "BINARY_CLASSIFICATION": true,
       "BASE_MODEL": "roberta-base"
@@ -54,10 +50,6 @@
       {
         "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz",
         "subdir": "{MODEL_PATH}"
-      },
-      {
-        "url": "https://files.deeppavlov.ai/datasets/clinc150.tar.gz",
-        "subdir": "{DOWNLOADS_PATH}"
       }
     ]
   }
diff --git a/deeppavlov/models/classifiers/dnnc_proba2labels.py b/deeppavlov/models/classifiers/dnnc_proba2labels.py
index 89bc1e1a54..26afd86255 100644
--- a/deeppavlov/models/classifiers/dnnc_proba2labels.py
+++ b/deeppavlov/models/classifiers/dnnc_proba2labels.py
@@ -32,12 +32,14 @@ class Proba2Labels(Component):
         confidence_threshold: used to determine whether example belongs to one
                               of the classes in 'y_support' or not
         pooling: strategy for averaging similarity scores for each label
+        is_binary: determines whether the similarity is a number or a probability vector
     """
 
     def __init__(self,
                  confidence_threshold: float = 0.0,
                  pooling: str = 'max',
-                 is_binary: bool = True) -> None:
+                 is_binary: bool = True,
+                 **kwargs) -> None:
 
         self.confidence_threshold = confidence_threshold
         self.pooling = pooling
@@ -48,9 +50,8 @@ def __call__(self,
                  x: List[str],
                  x_populated: List[str],
                  x_support: List[str],
-                 y_support: List[str],
-                 *args,
-                 **kwargs) -> List[str]:
+                 y_support: List[str]
+                 ) -> List[str]:
 
         y_pred = []
diff --git a/deeppavlov/models/preprocessors/dnnc_preprocessor.py b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
index 14da55c2e4..be564d03f9 100644
--- a/deeppavlov/models/preprocessors/dnnc_preprocessor.py
+++ b/deeppavlov/models/preprocessors/dnnc_preprocessor.py
@@ -32,19 +32,18 @@ class PairGenerator(Component):
         bidirectional: adds pairs in reverse order
     """
 
-    def __init__(self, bidirectional: bool = False) -> None:
+    def __init__(self, bidirectional: bool = False, **kwargs) -> None:
         self.bidirectional = bidirectional
 
     def __call__(self,
-                 texts_batch: List[str],
-                 support_dataset: List[List[str]],
-                 *args,
-                 **kwargs) -> Tuple[List[str], List[str], List[str], List[str]]:
+                 texts: List[str],
+                 dataset: List[List[str]],
+                 ) -> Tuple[List[str], List[str], List[str], List[str]]:
         hypotesis_batch = []
         premise_batch = []
         hypotesis_labels_batch = []
-        for [premise, [hypotesis, hypotesis_labels]] in zip(texts_batch * len(support_dataset),
-                                                            np.repeat(support_dataset, len(texts_batch), axis=0)):
+        for [premise, [hypotesis, hypotesis_labels]] in zip(texts * len(dataset),
+                                                            np.repeat(dataset, len(texts), axis=0)):
             premise_batch.append(premise)
             hypotesis_batch.append(hypotesis)
             hypotesis_labels_batch.append(hypotesis_labels)
@@ -53,4 +52,4 @@ def __call__(self,
             premise_batch.append(hypotesis)
             hypotesis_batch.append(premise)
             hypotesis_labels_batch.append(hypotesis_labels)
-        return texts_batch, hypotesis_batch, premise_batch, hypotesis_labels_batch
+        return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch
[Support dataset configuration](#4.2-Support-dataset-configuration)\n", + "4. [Use the model for prediction](#4.-Use-the-model-for-prediction)\n", "\n", - "5. [Use the model for prediction](#5.-Use-the-model-for-prediction)\n", + " 4.1 [Dataset format](#4.1-Dataset-format)\n", "\n", - " 5.1. [Predict using Python](#5.1-Predict-using-Python)\n", - " \n", - " 5.2. [Predict using CLI](#5.2-Predict-using-CLI)\n", - " \n", - "6. [Evaluate](#6.-Evaluate)\n", + " 4.2. [Predict using Python](#4.2-Predict-using-Python)\n", " \n", - " 6.1. [Evaluate from Python](#6.1-Evaluate-from-Python)\n", - " \n", - " 6.2. [Evaluate from CLI](#6.2-Evaluate-from-CLI)\n", + " 4.3. [Predict using CLI](#4.3-Predict-using-CLI)\n", "\n", + "5. [Customize the model](#5.-Customize-the-model)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ "# 1. Introduction to the task\n", "\n", "__Text classification__ is a task of identifying one of the pre-defined label given an utterance, where label is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider few-shot setting, where only few examples (5 or 10) per intent class are given as a training set.\n", "\n", - "# 2. Dataset\n", - "\n", - "In our experiments we used the [CLINC150](https://paperswithcode.com/dataset/clinc150) dataset, which has 10 different domains with 15 intents each, 100 shots per intent class and 1000 OOS examples. It simulates a setting, where model has to handle many different services with wide variety of intents.\n", - "\n", - "Specifically, we validate our model on CLINC150 from the original DNNC paper. We parsed it to match the format described [below](#2.1-Datasets-format). The original dataset can be downloaded from the DNNC [github page](https://github.com/salesforce/DNNC-few-shot-intent).\n", - "\n", - "## 2.1 Datasets format\n", "\n", - "Train, dev and test set are separate json files, which have the following format\n", - "\n", - "```\n", - "{\n", - " \"columns\": [\n", - " \"text\",\n", - " \"category\"\n", - " ],\n", - "\n", - " \"data\": [\n", - "\n", - " [\n", - " \"text\"\n", - " \"label\"\n", - " ],\n", - "\n", - " ...\n", - " ]\n", - "}\n", - "```\n", - "\n", - "# 3. Model architecture\n", - "\n", - "The typical methodology of few-shot text classification is to embed each example into a vector space and use an off-the-shelf distance metric to perform a similarity search. However, the text embedding methods do not discriminate the OOS examples well enough.\n", - "\n", - "\n", - "DNNC authors suggests to model fine-grained relations of utterance pairs via pairise simmilarity:\n", - "\n", - "$h = BERT([[CLS], u, [SEP], e_{j,i}, [SEP]]) \\in \\R^d$\n", - "\n", - "$S(u, e_{j,i}) = \\sigma(W * h + b) \\in \\R$, where $e_{j, i} \\in E $- training set, $W \\in \\R^{1×d}$, $b \\in \\R$\n", - "\n", - "To mitigate the data scarcity setting in few-shot learning, DNNC uses knowldge-transfer from NLI task. We pretrain [roberta-base](https://huggingface.co/roberta-base) on combination of 3 NLI datasets: SNLI, WNLI, MNLI.\n", - "\n", - "# 4. Get started with the model\n", - "\n", - "## 4.1 Installation\n", + "# 2. 
+    "# 2. Get started with the model\n",
     "\n",
     "First make sure you have the DeepPavlov Library installed.\n",
     "[More info about the first installation.](http://docs.deeppavlov.ai/en/master/intro/installation.html)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install deeppavlov"
+    "!pip install -q deeppavlov"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Then make sure that all the required packages, datasets and weights are installed."
+    "Then make sure that all the required packages are installed."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python -m deeppavlov install dnnc_infer\n",
-    "!python -m deeppavlov download dnnc_infer"
+    "!python -m deeppavlov install dnnc_infer"
    ]
   },
   {
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "`dnnc_infer` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
     "\n",
-    "Configuration file defines the model and describes its hyperparameters\n",
-    "\n",
-    "## 4.2 Support dataset configuration\n",
-    "\n",
-    "Before making predictions or running evaluation, you need to set the path to your support dataset. The DNNC model compares the input text to every example in the support dataset to determine which class the input example belongs to. By default, the model uses the training set as the support dataset. It is automatically saved by the *dataset_iterator* during the training step, but you can specify your own support dataset in the `dnnc_infer` config file. It has the same format as mentioned [before](#2.1-Datasets-format)"
+    "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
+    "Some of few-shot classification models with their config names can be found in the [table](#3.-Models-list)."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "~/.deeppavlov/parsed_datasets/parsed_dataset.json\n"
-     ]
-    }
-   ],
-   "source": [
-    "from deeppavlov.core.commands.utils import parse_config\n",
-    "\n",
-    "model_config = parse_config('dnnc_infer')\n",
-    "\n",
-    "# dataset for predictions\n",
-    "print(model_config['chainer']['pipe'][0]['support_dataset_path'])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Off-the-shelf prediction\n",
-    "\n",
-    "The base model was already pre-trained to recognize similar utterances, so you can use the off-the-shelf model to make predictions and run evaluation. No additional training is needed.\n",
-    "\n",
-    "### OOS prediction\n",
-    "\n",
-    "Out-of-scope (OOS) examples are detected via the confidence_threshold parameter with the following algorithm. First, the model calculates an average similarity score for every class in the support dataset. Second, it determines the class with the maximum similarity score. Finally, the model predicts the class with the maximum similarity if its score is higher than confidence_threshold, and the \"oos\" class otherwise. The higher the threshold, the more often the model predicts the \"oos\" class. By default it is set to 0.5, and you can change it to your preferences in the configuration file"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.5\n"
-     ]
-    }
-   ],
-   "source": [
-    "from deeppavlov.core.commands.utils import parse_config\n",
-    "\n",
-    "model_config = parse_config('dnnc_infer')\n",
-    "\n",
-    "# confidence threshold for OOS detection\n",
-    "print(model_config['chainer']['pipe'][-1]['confidence_threshold'])"
-   ]
-  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3. Models list\n",
+    "\n",
+    "At the moment, only the `dnnc_infer` config supports out-of-scope detection.\n",
+    "\n",
+    "| Config name | Dataset | Shot | Model Size | In-domain accuracy | Out-of-scope recall | Out-of-scope precision |\n",
+    "| :--- | --- | --- | --- | --- | --- | ---: |\n",
+    "| dnnc_infer| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 84.1±1.9 | 93.2±0.8 | 97.8±0.3 |\n",
+    "| dnnc_infer| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 59.4±1.4 | 87.9±1.2 | 40.3±0.7 |\n",
+    "| dnnc_infer| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 51.4±2.1 | 93.7±0.7 | 82.7±1.4 |\n",
+    "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 24.8±2.2 | 98.2±0.4 | 74.8±0.6 |\n",
+    "| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 13.4±0.5 | 98.6±0.2 | 20.5±0.1 |\n",
+    "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 37 KB | 10.7±0.8 | 99.0±0.3 | 36.4±0.2 |\n",
+    "\n",
+    "\n",
+    "With a zero threshold we can get the classification accuracy without OOS detection:\n",
+    "\n",
+    "| Config name | Dataset | Shot | Model Size | Accuracy |\n",
+    "| :--- | --- | --- | --- | ---: |\n",
+    "| dnnc_infer| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 89.6 |\n",
+    "| dnnc_infer| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 79.6 |\n",
+    "| dnnc_infer| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 55.1 |\n",
+    "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 86.3 |\n",
+    "| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 73.6 |\n",
+    "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 37 KB | 51.6 |\n",
+    "\n",
+    "\\* \\- config file was modified to predict OOS examples\n",
+    "\n",
+    "\n",
+    "# 4. Use the model for prediction\n",
+    "\n",
+    "The base model `dnnc_infer` was already pre-trained to recognize similar utterances, so you can use the off-the-shelf model for predictions and evaluation. No additional training is needed.\n",
+    "\n",
+    "## 4.1 Dataset format\n",
+    "\n",
+    "The DNNC model compares the input text to every example in the dataset to determine which class the input example belongs to. The dataset against which classification is performed has the following format:\n",
+    "\n",
+    "```\n",
+    "[\n",
+    " [\"text_1\", \"label_1\"],\n",
+    " [\"text_2\", \"label_2\"],\n",
+    " ...\n",
+    " [\"text_n\", \"label_n\"]\n",
+    "]\n",
+    "```\n",
+    "\n",
+    "## 4.2 Predict using Python\n",
+    "\n",
+    "After [installing](#4.-Get-started-with-the-model) the model, build it from the config and predict."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "[KeyboardInterrupt raised inside deep_download/file_md5 while running build_model; full ANSI-escaped traceback truncated]"
+     ]
+    }
+   ],
+   "source": [
+    "from deeppavlov import build_model\n",
+    "\n",
+    "import os\n",
+    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
+    "\n",
+    "model = build_model(\"dnnc_infer\", download=True)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you set the `download` flag to `True`, then the existing model weights will be overwritten.\n",
+    "\n",
+    "Setting the `install` argument to `True` is equivalent to executing the command line `install` command. If set to `True`, it will first install all the required packages.\n",
+    "\n",
+    "**Input**: List[texts, dataset]\n",
+    "\n",
+    "**Output**: List[labels]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['book_hotel', 'international_visa']\n"
-     ]
+     "data": {
+      "text/plain": [
+       "['translate', 'exchange_rate', 'car_rental']"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "from deeppavlov import build_model, configs\n",
-    "\n",
-    "model = build_model(\"dnnc_infer\", install=True, download=True)\n",
-    "\n",
-    "model([\"can you find me a good reviewed hotel in japan\", \"if i get a visa can i travel to japan\"])"
+    "texts = [\n",
+    "    \"what expression would i use to say i love you if i were an italian\",
+    "    \"what's the currency conversion between krones and yen\",\n",
+    "    \"i'd like to reserve a high-end car\"\n",
+    "]\n",
+    "\n",
+    "dataset = [\n",
+    "    [\"please help me book a rental car for nashville\", \"car_rental\"],\n",
+    "    [\"how can i rent a car in boston\", \"car_rental\"],\n",
+    "    [\"help me get a rental car for march 2 to 6th\", \"car_rental\"],\n",
+    "    \n",
+    "    [\"how many pesos can i get for one dollar\", \"exchange_rate\"],\n",
+    "    [\"tell me the exchange rate between rubles and dollars\", \"exchange_rate\"],\n",
+    "    [\"what is the exchange rate in pesos for 100 dollars\", \"exchange_rate\"],\n",
+    "    \n",
+    "    [\"can you tell me how to say 'i do not speak much spanish', in spanish\", \"translate\"],\n",
+    "    [\"please tell me how to ask for a taxi in french\", \"translate\"],\n",
+    "    [\"how would i say thank you if i were russian\", \"translate\"]\n",
+    "]\n",
+    "\n",
+    "model(texts, dataset)"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 5.2 Predict using CLI\n",
+    "## 4.3 Predict using CLI\n",
     "\n",
-    "You can also get predictions in an interactive mode through CLI."
+    "You can also get predictions in an interactive mode through CLI (Command Line Interface)."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python -m deeppavlov interact dnnc_infer [-d] [-i]"
+    "!python -m deeppavlov interact dnnc_infer -d"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with all other files needed to run the model. "
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Or make predictions for samples from *stdin*."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python -m deeppavlov predict dnnc_infer [-d] [-i]"
+    "!python -m deeppavlov predict dnnc_infer -f <file-name>"
    ]
   },
   {
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# 6. Evaluate\n",
-    "\n",
-    "To evaluate the model on your data, you need to change the path to the dataset in the `dnnc_infer` config"
+    "# 5. Customize the model\n",
+    "\n",
+    "Out-of-scope (OOS) examples are detected using the *confidence_threshold* parameter. For each input text, if the confidence of the model is lower than the *confidence_threshold*, then the input example is considered out-of-scope. The higher the threshold, the more often the model predicts the \"oos\" class. By default it is set to 0, but you can change it to your preferences in the configuration file."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "~/.deeppavlov/downloads/clinc150\n"
+      "0.0\n"
      ]
     }
    ],
    "source": [
     "from deeppavlov.core.commands.utils import parse_config\n",
     "\n",
     "model_config = parse_config('dnnc_infer')\n",
     "\n",
-    "# dataset for evaluation\n",
-    "print(model_config['dataset_reader']['data_path'])"
+    "print(model_config['chainer']['pipe'][-1]['confidence_threshold'])"
    ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 6.1 Evaluate from Python"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from deeppavlov import evaluate_model\n",
-    "\n",
-    "model = evaluate_model('dnnc_infer', install=True, download=True)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 6.2 Evaluate from CLI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!python -m deeppavlov evaluate dnnc_infer [-d] [-i]"
-   ]
   }
  ],
  "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}

From ca199dcdf691c11187a7f299856d51266ae6f773 Mon Sep 17 00:00:00 2001
From: LogicZMaksimka
Date: Wed, 28 Jun 2023 13:56:43 +0300
Subject: [PATCH 55/57] refactor: rename config

---
 ...{dnnc_infer.json => few_shot_roberta.json} |  0
 .../models/few_shot_classification.ipynb      | 53 ++++++-------------
 2 files changed, 16 insertions(+), 37 deletions(-)
 rename deeppavlov/configs/classifiers/{dnnc_infer.json => few_shot_roberta.json} (100%)

diff --git a/deeppavlov/configs/classifiers/dnnc_infer.json b/deeppavlov/configs/classifiers/few_shot_roberta.json
similarity index 100%
rename from deeppavlov/configs/classifiers/dnnc_infer.json
rename to deeppavlov/configs/classifiers/few_shot_roberta.json
diff --git a/docs/features/models/few_shot_classification.ipynb b/docs/features/models/few_shot_classification.ipynb
index 4054d1ad19..c6748da9bc 100644
--- a/docs/features/models/few_shot_classification.ipynb
+++ b/docs/features/models/few_shot_classification.ipynb
@@ -65,7 +65,7 @@
"metadata": {}, "outputs": [], "source": [ - "!python -m deeppavlov install dnnc_infer" + "!python -m deeppavlov install few_shot_roberta" ] }, { @@ -73,7 +73,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`dnnc_infer` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", + "`few_shot_roberta` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n", "\n", "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n", "Some of few-shot classification models with their config names can be found in the [table](#3.-Models-list)." @@ -86,13 +86,13 @@ "source": [ "# 3. Models list\n", "\n", - "At the moment, only `dnnc_infer` config support out-of-scope detection.\n", + "At the moment, only `few_shot_roberta` config support out-of-scope detection.\n", "\n", "| Config name | Dataset | Shot | Model Size | In-domain accuracy | Out-of-scope recall | Out-of-scope precision |\n", "| :--- | --- | --- | --- | --- | --- | ---: |\n", - "| dnnc_infer| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 84.1±1.9 | 93.2±0.8 | 97.8±0.3 |\n", - "| dnnc_infer| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 59.4±1.4 | 87.9±1.2 | 40.3±0.7 |\n", - "| dnnc_infer| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 51.4±2.1 | 93.7±0.7 | 82.7±1.4 |\n", + "| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 84.1±1.9 | 93.2±0.8 | 97.8±0.3 |\n", + "| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 59.4±1.4 | 87.9±1.2 | 40.3±0.7 |\n", + "| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 51.4±2.1 | 93.7±0.7 | 82.7±1.4 |\n", "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB |24.8±2.2 | 98.2±0.4 | 74.8±0.6 |\n", "| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 13.4±0.5 | 98.6±0.2 | 20.5±0.1 |\n", "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 37 KB |10.7±0.8 | 99.0±0.3 | 36.4±0.2 |\n", @@ -102,9 +102,9 @@ "\n", "| Config name | Dataset | Shot | Model Size | Accuracy |\n", "| :--- | --- | --- | --- | ---: |\n", - "| dnnc_infer| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 89.6 |\n", - "| dnnc_infer| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 79.6 |\n", - "| dnnc_infer| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 55.1 |\n", + "| few_shot_roberta| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 89.6 |\n", + "| few_shot_roberta| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 1.4 GB | 79.6 |\n", + "| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 
+    "| few_shot_roberta| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 1.4 GB | 55.1 |\n",
     "| fasttext_logreg*| [CLINC150-Banking-Domain](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 86.3 |\n",
     "| fasttext_logreg*| [CLINC150](https://paperswithcode.com/paper/an-evaluation-dataset-for-intent) | 5 | 37 KB | 73.6 |\n",
     "| fasttext_logreg*| [BANKING77-OOS](https://paperswithcode.com/paper/are-pretrained-transformers-robust-in-intent) | 5 | 37 KB | 51.6 |\n",
     "\n",
     "\\* \\- config file was modified to predict OOS examples\n",
     "\n",
     "\n",
     "# 4. Use the model for prediction\n",
     "\n",
-    "The base model `dnnc_infer` was already pre-trained to recognize similar utterances, so you can use the off-the-shelf model for predictions and evaluation. No additional training is needed.\n",
+    "The base model `few_shot_roberta` was already pre-trained to recognize similar utterances, so you can use the off-the-shelf model for predictions and evaluation. No additional training is needed.\n",
     "\n",
     "## 4.1 Dataset format\n",
     "\n",
@@ -136,34 +136,13 @@
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "[KeyboardInterrupt raised inside deep_download/file_md5 while running build_model; full ANSI-escaped traceback truncated]"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from deeppavlov import build_model\n",
     "\n",
-    "import os\n",
-    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
-    "\n",
-    "model = build_model(\"dnnc_infer\", download=True)"
+    "model = build_model(\"few_shot_roberta\", download=True)"
    ]
   },
@@ -236,7 +215,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python -m deeppavlov interact dnnc_infer -d"
+    "!python -m deeppavlov interact few_shot_roberta -d"
    ]
   },
@@ -261,7 +240,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!python -m deeppavlov predict dnnc_infer -f <file-name>"
+    "!python -m deeppavlov predict few_shot_roberta -f <file-name>"
    ]
   },
@@ -290,7 +269,7 @@
    "source": [
     "from deeppavlov.core.commands.utils import parse_config\n",
     "\n",
-    "model_config = parse_config('dnnc_infer')\n",
+    "model_config = parse_config('few_shot_roberta')\n",
     "\n",
     "print(model_config['chainer']['pipe'][-1]['confidence_threshold'])"
    ]
   }
  ],
- "metadata": {},
+ "metadata": {
+  "kernelspec": {
+   "display_name": "dnnc_cuda11_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
 "nbformat": 4,
 "nbformat_minor": 2
}

From fcc1d9d7036524574111d1042858caf64ebf99bf Mon Sep 17 00:00:00 2001
From: Fedor Ignatov
Date: Thu, 29 Jun 2023 11:30:55 +0300
Subject: [PATCH 56/57] docs: optimized few-shot classification doc

---
 .../models/few_shot_classification.ipynb      | 55 ++++---------------
 1 file changed, 10 insertions(+), 45 deletions(-)

diff --git a/docs/features/models/few_shot_classification.ipynb b/docs/features/models/few_shot_classification.ipynb
index c6748da9bc..c0207fc23f 100644
--- a/docs/features/models/few_shot_classification.ipynb
+++ b/docs/features/models/few_shot_classification.ipynb
@@ -23,14 +23,8 @@
     " \n",
     " 4.3. [Predict using CLI](#4.3-Predict-using-CLI)\n",
     "\n",
-    "5. [Customize the model](#5.-Customize-the-model)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "5. [Customize the model](#5.-Customize-the-model)\n",
+    "\n",
     "# 1. Introduction to the task\n",
     "\n",
     "__Text classification__ is a task of identifying one of the pre-defined labels given an utterance, where the label is one of N classes or \"OOS\" (out-of-scope examples - utterances that do not belong to any of the predefined classes). We consider the few-shot setting, where only a few examples (5 or 10) per intent class are given as a training set.\n",
     "\n",
     "\n",
     "# 2. Get started with the model\n",
@@ -76,14 +70,8 @@
     "`few_shot_roberta` is the name of the model's *config_file*. [What is a Config File?](http://docs.deeppavlov.ai/en/master/intro/configuration.html) \n",
     "\n",
     "Configuration file defines the model and describes its hyperparameters. To use another model, change the name of the *config_file* here and further.\n",
-    "Some of few-shot classification models with their config names can be found in the [table](#3.-Models-list)."
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "Some of few-shot classification models with their config names can be found in the [table](#3.-Models-list).\n",
+    "\n",
     "# 3. Models list\n",
     "\n",
     "At the moment, only the `few_shot_roberta` config supports out-of-scope detection.\n",
@@ -223,14 +211,8 @@
    "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with all other files needed to run the model. "
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
+    "`-d` is an optional download key (alternative to `download=True` in Python code). The key `-d` is used to download the pre-trained model along with all other files needed to run the model.\n",
+    "\n",
     "Or make predictions for samples from *stdin*."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "!python -m deeppavlov predict few_shot_roberta -f <file-name>"
    ]
   },
@@ -267,33 +249,16 @@
    "source": [
     "# 5. Customize the model\n",
     "\n",
     "Out-of-scope (OOS) examples are detected using the *confidence_threshold* parameter. For each input text, if the confidence of the model is lower than the *confidence_threshold*, then the input example is considered out-of-scope. The higher the threshold, the more often the model predicts the \"oos\" class. By default it is set to 0, but you can change it to your preferences in the configuration file."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.0\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
+    "from deeppavlov import build_model\n",
     "from deeppavlov.core.commands.utils import parse_config\n",
     "\n",
     "model_config = parse_config('few_shot_roberta')\n",
-    "\n",
-    "print(model_config['chainer']['pipe'][-1]['confidence_threshold'])"
+    "model_config['chainer']['pipe'][-1]['confidence_threshold'] = 0.1\n",
+    "model = build_model(model_config)"
    ]
   }
  ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "dnnc_cuda11_env",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
- },
+ "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 2
}

From 59348f538989486fcf58e3680e03df4b75cf4c8a Mon Sep 17 00:00:00 2001
From: Fedor Ignatov
Date: Thu, 29 Jun 2023 11:31:04 +0300
Subject: [PATCH 57/57] feat: few-shot tests

---
 tests/test_quick_start.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py
index 5f68c7f9e8..bd21b4ccf8 100644
--- a/tests/test_quick_start.py
+++ b/tests/test_quick_start.py
@@ -114,7 +114,10 @@
         ("classifiers/superglue/superglue_copa_roberta.json", "classifiers", ('TI',)): [LIST_ARGUMENTS_INFER_CHECK],
         ("classifiers/superglue/superglue_boolq_roberta_mnli.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK],
         ("classifiers/superglue/superglue_record_roberta.json", "classifiers", ('TI',)): [RECORD_ARGUMENTS_INFER_CHECK],
-        ("classifiers/topics_distilbert_base_uncased.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK]
+        ("classifiers/topics_distilbert_base_uncased.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK],
+        ("classifiers/few_shot_roberta.json", "classifiers", ('IP',)): [
+            ('Dummy text', ['Dummy text Dummy text', 'Dummy class'], ('Dummy class',))
+        ]
     },
     "distil": {
         ("classifiers/paraphraser_convers_distilrubert_2L.json", "distil", ('IP')): [TWO_ARGUMENTS_INFER_CHECK],
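
For reference, the label-selection scheme that runs through these patches (pairwise similarity scores pooled per class in `dnnc_proba2labels.py`, with a `confidence_threshold` fallback to "oos") can be sketched in a few lines of plain Python. The snippet below is a minimal illustration of the behaviour described in the notebook, not the DeepPavlov code path, and every function and variable name in it is invented for the example:

```python
from collections import defaultdict
from typing import List, Tuple


def scores_to_label(scored_pairs: List[Tuple[str, float]],
                    confidence_threshold: float = 0.0,
                    pooling: str = "max") -> str:
    """Reduce (support_label, similarity) pairs for one input text to a label.

    Illustrative only: pool the pairwise scores per class, take the best
    class, and return "oos" when its pooled score does not clear the
    confidence threshold.
    """
    per_class = defaultdict(list)
    for label, score in scored_pairs:
        per_class[label].append(score)

    # 'max' keeps the single best pair per class; anything else averages.
    pool = max if pooling == "max" else (lambda xs: sum(xs) / len(xs))
    pooled = {label: pool(scores) for label, scores in per_class.items()}

    best = max(pooled, key=pooled.get)
    return best if pooled[best] > confidence_threshold else "oos"


# Toy similarity scores of one utterance against a three-class support set.
pairs = [("car_rental", 0.91), ("car_rental", 0.40),
         ("exchange_rate", 0.22), ("translate", 0.05)]
print(scores_to_label(pairs, confidence_threshold=0.5))   # -> car_rental
print(scores_to_label(pairs, confidence_threshold=0.95))  # -> oos
```

Because the pairwise scores are sigmoid outputs in (0, 1), a threshold of 0 can never trigger the "oos" branch, which is why the tables above report the zero-threshold rows as plain classification accuracy without OOS detection.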