-
Notifications
You must be signed in to change notification settings - Fork 1
/
eval_utils.py
154 lines (134 loc) · 6.78 KB
/
eval_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# -*- coding: utf-8 -*-
# This script handles the decoding functions and performance measurement
import re
import random
from fuzzy_matching import *
# Sentiment labels are verbalized as everyday words rather than the raw
# 'positive' / 'negative' / 'neutral' class names; these are the only words
# accepted as a sentiment when decoding 'span' outputs.
sentiment_word_list = ['great', 'bad', 'ok']

# Verbalized sentiment word -> sentiment class name.
opinion2word = {
    'great': 'positive',
    'bad': 'negative',
    'ok': 'neutral',
}

# Same mapping with several words per class
# (presumably "o2m" = one-to-many -- TODO confirm against callers).
opinion2word_under_o2m = {
    'good': 'positive',
    'great': 'positive',
    'best': 'positive',
    'bad': 'negative',
    'okay': 'neutral',
    'ok': 'neutral',
    'average': 'neutral',
}

# 'SP<n>' sentiment token -> sentiment class name.
numopinion2word = {
    'SP1': 'positive',
    'SP2': 'negative',
    'SP3': 'neutral',
}
# Contents of already-read test files, keyed by path, so that a file is read
# once per process instead of once per decoded sequence.
_TEST_LINES_CACHE = {}


def _read_test_lines(test_data_path):
    """Return the lines of ``test_data_path``, reading the file at most once."""
    if test_data_path not in _TEST_LINES_CACHE:
        # Explicit UTF-8: the French/Dutch data must not depend on the
        # platform's default encoding.
        with open(test_data_path, 'r', encoding='utf-8') as f:
            _TEST_LINES_CACHE[test_data_path] = f.readlines()
    return _TEST_LINES_CACHE[test_data_path]


def _resolve_test_data_path(task, dataset, use_x_shot, few_shot_data, use_french_data, use_dutch_data):
    """Return the path of the test file the evaluated sequences were built from."""
    # The few-shot split takes precedence over the language-specific folders.
    if few_shot_data != 0 and use_x_shot != 0:
        return f'./data4fewshot/{use_x_shot}shot/{task}/{dataset}/test_{few_shot_data}.txt'
    if use_french_data:
        return f'./data4ml/french/{dataset}/test.txt'
    if use_dutch_data:
        return f'./data4ml/dutch/{dataset}/test.txt'
    return f'./data/{task}/{dataset}/test.txt'


def _decode_span_pairs(targets_seq):
    """Decode ``<tok> aspect0 <tok> sentiment0 <tok> aspect1 <tok> sentiment1 ...``."""
    pairs = []
    aspect = ''
    position = 0  # even -> the segment is an aspect, odd -> it is a sentiment
    # The pattern matches '<' + any 10 characters + '>', e.g. '<extra_id_0>'.
    # NOTE(review): a two-digit sentinel such as '<extra_id_10>' would not
    # match -- confirm that only single-digit sentinels are generated.
    for segment in re.split("<..........>", targets_seq):
        if segment == "":
            continue
        if position % 2 == 0:
            aspect = segment.strip()
        else:
            sentiment = segment.strip()
            if sentiment not in sentiment_word_list:
                # An unknown sentiment word is replaced by a random valid one,
                # so scores depend on the state of the `random` module.
                sentiment = random.choice(sentiment_word_list)
            pairs.append((aspect, sentiment))
        position += 1
    return pairs


def _decode_paraphrase_pairs(targets_seq):
    """Decode ``{aspect} is {sentiment} [SSEP] ...``."""
    pairs = []
    for chunk in targets_seq.split('[SSEP]'):
        parts = chunk.strip().split(' is ')
        # Anything that is not exactly "<aspect> is <sentiment>" decodes to an
        # empty pair.
        pairs.append(tuple(parts) if len(parts) == 2 else ('', ''))
    return pairs


def _decode_extraction_pairs(targets_seq):
    """Decode ``({aspect}, {sentiment}); (...)``."""
    pairs = []
    for chunk in targets_seq.split(';'):
        cleaned = chunk.strip().replace('(', '').replace(')', '')
        parts = [part.strip() for part in cleaned.split(',')]
        # Anything that is not exactly two comma-separated fields decodes to
        # an empty pair.
        pairs.append(tuple(parts) if len(parts) == 2 else ('', ''))
    return pairs


def extract_spans_para(task, seq, seq_type, output_type, special_token_list, use_x_shot, few_shot_data, use_french_data, use_dutch_data, dataset, is_test_mode, do_fuzzy_matching, sample_id):
    """
    Decode one generated (or gold) sequence into (aspect, sentiment) pairs.

    Args:
        task: task name; only 'ASPE' is supported.
        seq: the sequence to decode; '</s>' and '<pad>' are removed first.
        seq_type: 'gold' or 'pred'; accepted for interface compatibility, unused.
        output_type: sequence format, one of 'span', 'paraphrase', 'extraction'.
        special_token_list: accepted for interface compatibility, unused.
        use_x_shot, few_shot_data: when both are non-zero, the few-shot test
            file is used for fuzzy matching.
        use_french_data, use_dutch_data: select the French / Dutch test file
            for fuzzy matching when no few-shot split is requested.
        dataset: dataset folder name used to build the test file path.
        is_test_mode: non-zero enables the fuzzy-matching step (together with
            ``do_fuzzy_matching``).
        do_fuzzy_matching: whether aspects are passed through ``fuzzy_matching``
            together with line ``sample_id`` of the test file.
        sample_id: index of the sample, i.e. of its line in the test file.

    Returns:
        A list of (aspect, sentiment) tuples. Without fuzzy matching the pairs
        are de-duplicated in first-seen order; with fuzzy matching each
        de-duplicated pair has its aspect replaced by the fuzzy-matched one.

    Raises:
        NotImplementedError: for an unsupported ``task`` or ``output_type``.
    """
    if task != 'ASPE':
        # Other tasks have no decoder here; fail clearly instead of reaching
        # code that would reference undefined pair lists.
        raise NotImplementedError(f"unsupported task: {task!r}")

    targets_seq = seq.replace("</s>", "").replace("<pad>", "").strip()

    if output_type == 'span':
        as_pairs = _decode_span_pairs(targets_seq)
    elif output_type == 'paraphrase':
        as_pairs = _decode_paraphrase_pairs(targets_seq)
    elif output_type == 'extraction':
        as_pairs = _decode_extraction_pairs(targets_seq)
    else:
        # Silently decoding nothing would only show up as a zero score.
        raise NotImplementedError(f"unsupported output_type: {output_type!r}")

    # De-duplicate while keeping first-seen order, so the result (and the
    # printed gold/pred lists) is reproducible across runs.
    targets = list(dict.fromkeys(as_pairs))

    if is_test_mode != 0 and do_fuzzy_matching:
        test_data_path = _resolve_test_data_path(
            task, dataset, use_x_shot, few_shot_data, use_french_data, use_dutch_data)
        review = _read_test_lines(test_data_path)[sample_id].strip()
        # Only the aspect is matched against the review line; the sentiment
        # is kept as decoded.
        return [
            (fuzzy_matching(review, predict_aspect), predict_sentiment)
            for predict_aspect, predict_sentiment in targets
        ]

    return targets
def compute_f1_scores(pred_pt, gold_pt):
    """
    Compute micro-averaged precision, recall and F1 over all samples.

    The inputs must already be decoded: ``pred_pt[i]`` and ``gold_pt[i]`` are
    the predicted and gold items (e.g. tuples) of sample ``i``.

    Args:
        pred_pt: one collection of predicted items per sample.
        gold_pt: one collection of gold items per sample, aligned with
            ``pred_pt``.

    Returns:
        A dict with keys 'precision', 'recall' and 'f1', as percentages.

    Raises:
        ValueError: if ``pred_pt`` and ``gold_pt`` differ in length.
    """
    if len(pred_pt) != len(gold_pt):
        raise ValueError(
            f"pred_pt and gold_pt must have the same length, "
            f"got {len(pred_pt)} and {len(gold_pt)}")

    # number of true positives, gold standard items, predictions
    n_tp, n_gold, n_pred = 0, 0, 0

    for preds, golds in zip(pred_pt, gold_pt):
        n_gold += len(golds)
        n_pred += len(preds)

        # Each gold item may be matched only once; otherwise a duplicated
        # prediction would be counted as several hits and recall could
        # exceed 100%.
        unmatched = list(golds)
        for t in preds:
            if t in unmatched:
                unmatched.remove(t)
                n_tp += 1

    print(f"number of gold spans: {n_gold}, predicted spans: {n_pred}, hit: {n_tp}")

    precision = n_tp / n_pred if n_pred != 0 else 0.0
    recall = n_tp / n_gold if n_gold != 0 else 0.0
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator != 0 else 0.0
    scores = {'precision': precision*100.0, 'recall': recall*100.0, 'f1': f1*100.0}

    return scores
def compute_scores(pred_seqs, gold_seqs, task, output_type, special_token_list, use_x_shot, few_shot_data, use_french_data, use_dutch_data, dataset, is_test_mode, do_fuzzy_matching):
    """
    Decode gold and predicted sequences and report precision / recall / F1.

    Every gold and predicted sequence is decoded with ``extract_spans_para``
    (the sample index is passed as ``sample_id``), the decoded lists are
    printed, and the scores are computed with ``compute_f1_scores``.

    Returns:
        A tuple ``(scores, all_preds, all_labels)``: the score dict, the
        decoded predictions per sample and the decoded gold labels per sample.
    """
    assert len(pred_seqs) == len(gold_seqs)

    def decode(sequence, seq_type, sample_id):
        # Only the sequence, its role and the sample index vary between calls.
        return extract_spans_para(
            task, sequence, seq_type, output_type, special_token_list,
            use_x_shot, few_shot_data, use_french_data, use_dutch_data,
            dataset, is_test_mode, do_fuzzy_matching, sample_id)

    all_labels = []
    all_preds = []

    for sample_id, (gold_seq, pred_seq) in enumerate(zip(gold_seqs, pred_seqs)):
        gold_list = decode(gold_seq, 'gold', sample_id)
        pred_list = decode(pred_seq, 'pred', sample_id)
        print(f"gold:{gold_list}")
        print(f"pred:{pred_list}")

        all_labels.append(gold_list)
        all_preds.append(pred_list)

    print("\nResults:")
    scores = compute_f1_scores(all_preds, all_labels)
    print(scores)

    return scores, all_preds, all_labels