evaluation.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import logging
import string
from collections import Counter
from typing import Callable

import regex
from rouge import Rouge

rouge = Rouge()

logger = logging.getLogger(__name__)
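
# Third-party dependencies implied by the imports: `regex`, `rouge`, and (for
# bleu_score below) `sacrebleu`.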


# Normalization and score functions from SQuAD evaluation script https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
def normalize_answer(s: str) -> str:
    def remove_articles(text):
        return regex.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
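
# Quick illustration (example input is ours, not from the SQuAD script):
# normalization lowercases, strips punctuation, drops articles, and collapses
# whitespace, so
#   normalize_answer("The Eiffel Tower!")  ->  "eiffel tower"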


def em(prediction, ground_truth, normalize_fn):
    return float(normalize_fn(prediction) == normalize_fn(ground_truth))


def f1(prediction, ground_truth, normalize_fn):
    prediction_tokens = normalize_fn(prediction).split()
    ground_truth_tokens = normalize_fn(ground_truth).split()
    # Multiset intersection counts the tokens shared by prediction and ground truth.
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
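
# Worked example (ours): f1("The cat sat on the mat.", "cat on mat", normalize_answer)
# compares ["cat", "sat", "on", "mat"] with ["cat", "on", "mat"]: 3 shared tokens,
# precision = 3/4, recall = 3/3, F1 = 2 * (0.75 * 1.0) / 1.75 ≈ 0.857.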


def rouge_wrapper(prediction, ground_truth):
    try:
        result = rouge.get_scores(prediction, ground_truth, avg=True)
        return result["rouge-1"]["f"], result["rouge-2"]["f"], result["rouge-l"]["f"]
    except Exception:
        # The rouge package raises on inputs it cannot score (e.g. empty or
        # punctuation-only strings); fall back to zero scores.
        return 0.0, 0.0, 0.0
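
# With avg=True, rouge.get_scores returns (per the `rouge` PyPI package) a dict
# like {"rouge-1": {"r": ..., "p": ..., "f": ...}, "rouge-2": {...}, "rouge-l": {...}};
# the wrapper keeps only the F-measures.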


# prediction is a single string; ground_truths is a list of acceptable answer
# strings. The score against the best-matching ground truth is returned.
def f1_score(prediction, ground_truths, normalize_fn: Callable[[str], str] = lambda x: x):
    return max([f1(prediction, gt, normalize_fn) for gt in ground_truths])


def exact_match_score(prediction, ground_truths, normalize_fn: Callable[[str], str] = lambda x: x):
    return max([em(prediction, gt, normalize_fn) for gt in ground_truths])


def rouge_score(prediction, ground_truths):
    # prediction is a single string; ground_truths is a list of reference strings.
    ground_truths = [x for x in ground_truths if len(x) > 0]
    if len(prediction) == 0 or len(ground_truths) == 0:
        # Empty prediction, or no non-empty reference to score against.
        return 0.0, 0.0, 0.0
    scores = [rouge_wrapper(prediction, gt) for gt in ground_truths]
    rouge1 = max(s[0] for s in scores)
    rouge2 = max(s[1] for s in scores)
    rougel = max(s[2] for s in scores)
    return rouge1, rouge2, rougel
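
# Example (ours): rouge_score("the cat sat", ["a cat sat on a mat", "dogs bark"])
# scores the prediction against each reference and keeps the best ROUGE-1,
# ROUGE-2, and ROUGE-L F-measures independently.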


# prediction = [p1, p2, ..., pn] is a list of hypothesis strings and
# ground_truths = [[g1, g2, ..., gn], ...] is a list of reference streams,
# as sacrebleu's corpus-level API expects.
def bleu_score(prediction, ground_truths):
    from sacrebleu import BLEU

    bleu = BLEU()
    score = bleu.corpus_score(prediction, ground_truths)
    return score
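

if __name__ == "__main__":
    # Smoke test with made-up data (ours, not part of the original script).
    pred = "The cat sat on the mat."
    golds = ["cat on mat", "a dog"]
    print("EM:", exact_match_score(pred, golds, normalize_answer))  # 0.0
    print("F1:", f1_score(pred, golds, normalize_answer))  # ~0.857
    print("ROUGE-1/2/L:", rouge_score(pred, golds))
    # bleu_score takes corpus-level lists and returns a sacrebleu BLEUScore;
    # its .score attribute is on the 0-100 scale.
    print("BLEU:", bleu_score([pred], [["the cat sat on the mat"]]).score)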