'''
LAYMAN SUMMARY GENERATOR for abstracts of research papers

This script generates layman summaries of scientific abstracts using a large
language model from OpenAI (ChatGPT). As input, it uses the OpenAIRE API to
fetch content from the Netherlands Research Portal: https://netherlands.openaire.eu

Assistance in code development and debugging was provided by the code
interpretation functionality of OpenAI's GPT-4 model.

Authors: Maurice Vanderfeesten and OpenAI's GPT-4 model
License: CC0
'''
import requests
import openai
import pandas as pd
import json
import time
import openai.error
import os
import yaml
from datetime import datetime
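
# Note: this script targets the pre-1.0 `openai` Python client
# (openai.ChatCompletion, openai.error); with openai>=1.0 these calls
# would need migrating to the client-based interface.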
'''
# Load OpenAI API key and other configuration from a JSON configuration file
def load_config():
    with open('config.json', 'r') as f:
        config = json.load(f)
    return config
'''
# Load OpenAI API key and other configuration from a YAML configuration file
def load_config():
    with open('config.yaml', 'r') as f:
        config = yaml.safe_load(f)
    return config
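
# Expected structure of config.yaml (a sketch; the key names below are the
# ones main() reads, but all values are illustrative placeholders):
#
#   openai_api_key: "sk-..."
#   aimodel: "gpt-4"
#   max_tokens: 300
#   data_url: "https://api.openaire.eu/search/publications?format=json&..."
#   audiences:
#     - name: "Teenager"
#       message: "You explain science in simple words for a 15-year-old."
#     - name: "Journalist"
#       message: "You explain science for a general newspaper audience."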
# Fetch data from the OpenAIRE API
def fetch_data(data_url):
    response = requests.get(data_url)
    response.raise_for_status()  # fail early on HTTP errors
    return response.json()
# Extract necessary information from each "result" in the data
def extract_info(result):
    identifier = result['header']['dri:objIdentifier']['$']
    originalId = extract_key_info(result, 'originalId')
    title = extract_key_info(result, 'title')
    description = extract_key_info(result, 'description')
    return identifier, originalId, title, description
# Extract information for the given key if it exists
def extract_key_info(result, key):
    if key in result['metadata']['oaf:entity']['oaf:result']:
        try:
            info = result['metadata']['oaf:entity']['oaf:result'][key][0]['$']
        except (KeyError, TypeError):  # the field is a single object, not a list
            info = result['metadata']['oaf:entity']['oaf:result'][key]['$']
    else:
        info = None  # Or any default value of your choice
    return info
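
# For reference, extract_key_info assumes OpenAIRE's JSON shape, where each
# field under result['metadata']['oaf:entity']['oaf:result'] is either a list
# of objects or a single object, with the value stored under the '$' key,
# e.g. (illustrative): 'title': [{'$': 'A title'}] or 'title': {'$': 'A title'}.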
# Define a function to handle OpenAI API calls with retry logic
def create_chat_completion_with_retry(prompt, aimodel, max_tokens, retries=3, delay=5):
    for i in range(retries):
        try:
            response = openai.ChatCompletion.create(model=aimodel, messages=prompt, max_tokens=max_tokens)
            return response
        except openai.error.APIError as e:
            # Pre-1.0 openai errors expose the HTTP status as `http_status`
            if e.http_status == 502 and i < retries - 1:  # Retry on 502 (Bad Gateway) errors
                time.sleep(delay)  # Wait a bit before retrying
                continue
            else:
                raise  # Not a 502, or retries exhausted: re-raise the exception
# Generate a layman summary using OpenAI
def generate_summary(description, aimodel, max_tokens, audience_name, audience_message):
    if description is None:
        return None
    # Build a different system message for each audience
    system_message = "You are a helpful assistant. " + audience_message
    # Instruct the model to summarize the abstract for a specific audience
    user_message = f'Summarize this text for a {audience_name}: {description}'
    prompt = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ]
    # Use the retry wrapper instead of calling `openai.ChatCompletion.create` directly
    response = create_chat_completion_with_retry(prompt, aimodel, max_tokens)
    # Get the generated text from the response
    generated_text = response['choices'][0]['message']['content'].strip()
    # Truncate the text so it ends with a complete sentence
    truncated_text = truncate_text(generated_text, max_tokens - 20)
    return truncated_text
# Truncate the text to the last complete sentence that fits within max_tokens.
def truncate_text(text, max_tokens):
    tokenized_text = text.split()  # split the text into words (a rough proxy for tokens)
    if len(tokenized_text) > max_tokens:
        truncated = ' '.join(tokenized_text[:max_tokens])
        # cut back to the last period so the text ends with a complete sentence
        last_period = truncated.rfind('.')
        if last_period != -1:
            truncated = truncated[:last_period + 1]
        return truncated
    return text
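
# Example (illustrative): with max_tokens=5,
#   truncate_text("One. Two three four. Five six", 5)
# keeps the first five words and cuts back to the last period, returning
#   "One. Two three four."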
# Save DataFrame to CSV and Excel files
def save_df(df, aimodel):
    num_records = len(df)
    if not os.path.exists('results'):
        os.makedirs('results')
    now = datetime.now()
    formatted_date = now.strftime("%Y-%m-%d_%H%M")
    csv_file_name = f"results_{num_records}_{aimodel}_{formatted_date}.csv"
    xlsx_file_name = f"results_{num_records}_{aimodel}_{formatted_date}.xlsx"
    csv_file_path = os.path.join('results', csv_file_name)
    xlsx_file_path = os.path.join('results', xlsx_file_name)
    df.to_csv(csv_file_path, index=False)
    df.to_excel(xlsx_file_path, index=False)
# Main function to orchestrate the operations
def main(config):
    # Get the variables from the config
    openai.api_key = config['openai_api_key']
    aimodel = config['aimodel']
    max_tokens = config['max_tokens']
    data_url = config['data_url']
    audiences = config['audiences']
    # Get the input data
    data = fetch_data(data_url)
    results = []
    # Extract metadata fields for each of the records in the input data
    for result in data['response']['results']['result']:
        identifier, originalId, title, description = extract_info(result)
        result_dict = {
            'objIdentifier': identifier,
            'originalId': originalId,
            'title': title,
            'description': description
        }
        # Generate a layman summary of the paper's abstract for each of the defined audiences
        for audience in audiences:
            layman_summary = generate_summary(description, aimodel, max_tokens, audience['name'], audience['message'])
            result_dict[f'layman_summary_{audience["name"].lower()}'] = layman_summary
        results.append(result_dict)
    # Create a DataFrame from the results
    df = pd.DataFrame(results)
    # Save the data of this session as Excel and CSV files in the results folder
    save_df(df, aimodel)
    # Return the DataFrame
    return df
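
# Entry point (assumed: the excerpt shown ends at main(), so this guard is a
# minimal sketch of how the script would be run from the command line).
if __name__ == '__main__':
    config = load_config()
    main(config)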