-
Notifications
You must be signed in to change notification settings - Fork 0
/
title_summary_generation.py
152 lines (116 loc) · 5.54 KB
/
title_summary_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import openai
import pandas as pd
import numpy as np
import os
import regex
import re
##########################################################################
##########################################################################
# USAGE:
# enter the filepath and filename as prompted, and then
# enter the number of utterances per question you'd like to generate.
# INPUT
# Input file should be a csv file list of questions where it indicates
# that at least two columns each named "Question" and "Answer", respecively
# This script will remove all other columns
# OUTPUT:
# This will output a csv file with a generated "Summary" and "Title" for
# each entry, using an LLM, in this case, ChatGPT4.0
##########################################################################
##########################################################################
#api_key=
os.environ['OPENAI_API_KEY'];
# User inputs:
filepath=input("Please enter the path and filename of the csv that contains the questions and answers that need titles and summaries:\n")
title_summary_var=input("Do you need:\n 1. Title generation \n 2. Summary generation \n 3. Both title and summary generation\n")
def remove_number_period(text):
return re.sub(r'^\d{1,3}\.', '', text)
def remove_character(s):
return s.replace('Â', '')
# function to generate utterances from a dataframe entry:
def generate_title(dataframe, column_name, column_name2):
"""
Generate alternate versions of sentences in a specified column of a DataFrame.
:param api_key: OpenAI API key
:param dataframe: DataFrame containing the sentences
:param column_name: Name of the column with sentences
:param num_variations: Number of alternate versions to generate
:return: DataFrame with original sentences and their alternate versions
"""
results = []
#for sentence,index in dataframe[column_name]:
for i, row in dataframe.iterrows():
#for sentence2 in dataframe[column_name2]:
sentence = row[column_name]
sentence2 = row[column_name2]
try:
prompt='Write a 5 to 6 word title for an entry with this question ' + sentence + 'and this answer ' + sentence2
response = openai.chat.completions.create(
model="gpt-4",
#"gpt-3.5-turbo", # or another model of your choice
messages = [ # Change the prompt parameter to messages parameter
{'role': 'user', 'content': prompt}
]
)
choices = response.choices
chat_completion = choices[0]
alternates = chat_completion.message.content
results.append({'Original': sentence, 'Alternates': alternates})
except Exception as e:
print(f"An error occurred: {e}")
results.append({'Original': sentence, 'Alternates': []})
return pd.DataFrame(results)
def generate_summary(dataframe, column_name, column_name2):
"""
Generate alternate versions of sentences in a specified column of a DataFrame.
:param api_key: OpenAI API key
:param dataframe: DataFrame containing the sentences
:param column_name: Name of the column with sentences
:param num_variations: Number of alternate versions to generate
:return: DataFrame with original sentences and their alternate versions
"""
results = []
#for sentence,index in dataframe[column_name]:
for i, row in dataframe.iterrows():
#for sentence2 in dataframe[column_name2]:
sentence = row[column_name]
sentence2 = row[column_name2]
try:
prompt='Write a short summary for an entry with this question ' + sentence + 'and this answer ' + sentence2
response = openai.chat.completions.create(
model="gpt-4",
#"gpt-3.5-turbo", # or another model of your choice
messages = [ # Change the prompt parameter to messages parameter
{'role': 'user', 'content': prompt}
]
)
choices = response.choices
chat_completion = choices[0]
alternates = chat_completion.message.content
results.append({'Original': sentence, 'Alternates': alternates})
except Exception as e:
print(f"An error occurred: {e}")
results.append({'Original': sentence, 'Alternates': []})
return pd.DataFrame(results)
if __name__ == "__main__":
test_df = pd.read_csv(filepath, encoding = "ISO-8859-1")
test_df.dropna(subset=['Question','Answer'], inplace=True)
test_df = test_df.loc[:, ['Question','Answer']]
test_df['Question']=test_df['Question'].astype('string')
test_df['Answer']=test_df['Answer'].astype('string')
#test_df['Question'] = test_df['Question'].apply(remove_number_period)
if title_summary_var==1 | title_summary_var==3:
new_df1 = generate_title(test_df, 'Question', 'Answer')
if title_summary_var==2 | title_summary_var==3:
new_df2 = generate_summary(test_df, 'Question', 'Answer')
if title_summary_var==3:
new_df = pd.concat([new_df1, new_df2])
elif title_summary_var==1:
new_df = new_df1
else:
new_df = new_df2
# extract filename from filepath variable:
file=os.path.basename(filepath)
# write to new csv file using original filename to identify:
new_df.to_csv('title_summary_' + "_" + file, sep=',', index=False, encoding='utf-8')
print("Title and Summary columns for ",file," have been generated in title_summary_","_",file)