eval_article_quality.py (forked from stanford-oval/storm)
"""Compute article quality metrics on a dataset.
The script expects
- a CSV file (args.input_path) with a column 'topic' containing the topics for evaluation.
- a directory (args.gt_dir) containing human-written articles. The articles should be named as txt/{topic_name}.txt
and there should be a json file named json/{topic_name}.json containing the named entities in the article.
- a directory (args.pred_dir) containing generated articles. The outlines should be named as {topic_name}/{args.pred_file_name}.
"""
import argparse
import json
import logging
import os
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, LlamaForCausalLM
from evaluation_prometheus import get_grading_dict, preprocess_text
from evaluation_trim_length import process_document
from metrics import article_entity_recall, compute_rouge_scores
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class ColoredFormatter(logging.Formatter):
COLORS = {
'WARNING': '\033[93m', # Yellow
'INFO': '\033[97m', # White
'DEBUG': '\033[92m', # Green
'CRITICAL': '\033[94m', # Blue
'ERROR': '\033[91m', # Red
'RESET': '\033[0m', # Reset
}
def format(self, record):
color = self.COLORS.get(record.levelname, self.COLORS['RESET'])
record.levelname = color + record.levelname + self.COLORS['RESET']
record.msg = color + str(record.msg) + self.COLORS['RESET']
return super().format(record)
def load_str(path):
    with open(path, 'r') as f:
        # f.readlines() keeps each line's trailing newline, so joining on '\n' would double
        # every line break; reading the whole file at once avoids that.
        return f.read()
def load_json(path):
with open(path, 'r') as f:
return json.load(f)
def dump_json(data, path):
with open(path, 'w') as f:
json.dump(data, f, indent=2)
def assert_int(x):
    try:
        int(x)
        return True
    except (TypeError, ValueError):
        return False
def main(args):
logger.info(f"loading tokenizer {args.tokenizer} and model {args.model}")
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
model = LlamaForCausalLM.from_pretrained(args.model, device_map="auto")
df = pd.read_csv(args.input_path)
aggregated_results = {}
for i, row in tqdm(df.iterrows()):
topic = row['topic']
topic_name = topic.replace(" ", "_").replace("/", "_")
llm_output_path = os.path.join(args.pred_dir, topic_name, args.pred_file_name)
        assert os.path.exists(llm_output_path), f"LLM output path does not exist: {llm_output_path}"
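        # The ground-truth JSON is assumed (based on how it is used below) to contain a
        # 'flair_entities' key listing the named entities of the human-written article,
        # e.g. {"flair_entities": ["Entity A", "Entity B"]}.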
golden_answer_json = load_json(os.path.join(args.gt_dir, 'json', topic_name + '.json'))
golden_answer = load_str(os.path.join(args.gt_dir, 'txt', topic_name + '.txt'))
golden_answer = preprocess_text(golden_answer)
llm_output = load_str(llm_output_path)
llm_output = preprocess_text(llm_output)
output_file_path = os.path.join(args.result_output_dir, f"{topic_name}.json")
# Prometheus model has a limited context window.
trimmed_output_for_rubric_grading = process_document(llm_output_path, max_words=2000)
evaluation_main_dict = {"topic": topic, "grading": {}}
# Get rubric grading.
logger.info(f"Processing rubric grading.")
grading_dict = get_grading_dict(responses=[trimmed_output_for_rubric_grading],
topic=topic,
tokenizer=tokenizer,
model=model,
prompt_template_path=args.prompt_template_path,
rubric_path=args.rubric_path,
logger=logger)
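        # grading_dict is assumed (from how it is consumed below) to map
        # criteria description -> response index -> feedback dict containing at least a 'score' field.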
        for criteria_description, response_grading_dict in grading_dict.items():
            for response_idx, feedback_dict in response_grading_dict.items():
                evaluation_main_dict["grading"].setdefault("rubric_grading", {})[criteria_description] = feedback_dict
# get automatic evaluation score
logger.info(f"Processing automatic evaluation.")
automatic_evaluation_score = compute_rouge_scores(predicted_answer=llm_output, golden_answer=golden_answer)
evaluation_main_dict["grading"]["auto_grading"] = automatic_evaluation_score
# get named entity overlap with golden answer
logger.info(f"Processing entity overlap with ground truth")
evaluation_main_dict["grading"]["entity_recall"] = article_entity_recall(
golden_entities=golden_answer_json['flair_entities'],
predicted_article=llm_output
)
dump_json(evaluation_main_dict, output_file_path)
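        # The per-topic file written above has the shape
        # {"topic": ..., "grading": {"rubric_grading": {...}, "auto_grading": {...}, "entity_recall": ...}}.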
        for k, v in evaluation_main_dict['grading']['rubric_grading'].items():
            aggregated_results.setdefault(k, []).append(v)
        for k, v in evaluation_main_dict['grading']['auto_grading'].items():
            aggregated_results.setdefault(k, []).append(v)
        aggregated_results.setdefault('entity_recall', []).append(evaluation_main_dict['grading']['entity_recall'])
# compute average score
logger.info(f"Computing average score.")
avg_results = {}
for k in aggregated_results:
        if isinstance(aggregated_results[k][0], dict):
avg_results[k] = sum([float(x['score']) for x in aggregated_results[k]]) / len(aggregated_results[k])
else:
avg_results[k] = sum(aggregated_results[k]) / len(aggregated_results[k])
print(f"{k}: {avg_results[k]}")
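    # avg_results maps each metric name to its mean over all topics; the exact key names
    # depend on the rubric file and on metrics.py, so they are not enumerated here.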
dump_json(avg_results, os.path.join(args.result_output_dir, "avg_results.json"))
if __name__ == "__main__":
# configure logger
global logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
formatter = ColoredFormatter('%(levelname)s: %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
# command line argument
parser = argparse.ArgumentParser()
    parser.add_argument('--input-path', type=str,
                        help='Path to the CSV file whose "topic" column lists the topics to evaluate.')
    parser.add_argument('--pred-dir',
                        help='Directory containing the generated (LLM) articles, one subdirectory per topic.')
    parser.add_argument('--gt-dir',
                        help='Directory containing the human-written articles (txt/ and json/ subdirectories).')
    parser.add_argument('--result-output-dir',
                        help='Directory to store the evaluation results. '
                             'Each article evaluation is saved as a separate file named {topic_name}.json.')
    parser.add_argument('--pred-file-name', help='File name of the generated article inside each topic directory.')
    parser.add_argument("--prompt-template-path", default="./eval_prometheus_no_ref.prompt",
                        help='Path to the Prometheus evaluation prompt template.')
    parser.add_argument("--rubric-path", default="./eval_rubric_5.json", help='Path to the rubric JSON file.')
parser.add_argument('--tokenizer', default="meta-llama/Llama-2-7b-chat-hf")
parser.add_argument('--model',
choices=["prometheus-eval/prometheus-13b-v1.0", "prometheus-eval/prometheus-7b-v1.0"],
default="prometheus-eval/prometheus-13b-v1.0",
help="Model to use for rubric evaluation.")
args = parser.parse_args()
# check output directory
if not os.path.exists(args.result_output_dir):
os.makedirs(args.result_output_dir)
logger.info(f"Directory {args.result_output_dir} created.")
main(args)