"Various metadata extractors."
import sys
from datetime import date
from typing import Optional, Sequence
from langchain.llms.base import BaseLLM
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
    PromptTemplate,
)
from langchain_openai import OpenAI
from pydantic import BaseModel
_DEBUG = True
####################
# Period extractor #
####################
class Period(BaseModel):
    start_date: date
    end_date: date
# Set up a parser + inject instructions into the prompt template.
_PERIOD_PARSER = PydanticOutputParser(pydantic_object=Period)
# Prompt
_PERIOD_PROMPT = PromptTemplate(
    template="Extract the dates. When a month is specified, the period starts on the first day of the month and ends on the last day of the month. When a week is specified, it starts on Monday and ends on Sunday.\n{format_instructions}\nToday is {day_of_week} {today}.\nInput: {query}\n",
    input_variables=["query", "today", "day_of_week"],
    partial_variables={"format_instructions": _PERIOD_PARSER.get_format_instructions()},
)
def extract_period(
    query: str,
    today: Optional[str] = None,
    day_of_week: Optional[str] = None,
    model: Optional[BaseLLM] = None,
) -> Optional[Period]:
    """
    Extract a period from a query.
    """
    if today is None:
        today = date.today().strftime("%Y-%m-%d")
    if day_of_week is None:
        day_of_week = date.today().strftime("%A")
    _input = _PERIOD_PROMPT.format_prompt(
        query=query, today=today, day_of_week=day_of_week
    )
    if _DEBUG:
        print(f"Input: {_input.to_string()}", file=sys.stderr)
    if model is None:
        model = OpenAI(temperature=0)
    output = model.invoke(_input.to_string())
    if _DEBUG:
        print(f"Period output: {output}", file=sys.stderr)
    try:
        return _PERIOD_PARSER.parse(output)
    except Exception:
        return None
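# Example usage (illustrative sketch, not executed here; assumes an OpenAI API
# key is configured so the default OpenAI(temperature=0) model can be built):
#
#   period = extract_period("What did I work on last week?")
#   if period is not None:
#       print(period.start_date, period.end_date)  # Monday..Sunday of last week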
####################
# Intent extractor #
####################
class Intent(BaseModel):
    intent: str
# Set up a parser + inject instructions into the prompt template.
_INTENT_PARSER = PydanticOutputParser(pydantic_object=Intent)
# Prompt
_INTENT_PROMPT = PromptTemplate(
    template="Please analyze the following question and classify whether it is an activity report request, a summary request, or a regular question.\n{format_instructions}\nQuestion: {query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": _INTENT_PARSER.get_format_instructions()},
)
def extract_intent(query: str, model: Optional[BaseLLM] = None) -> Optional[Intent]:
    """
    Extract the intent (ignoring any dates) from a query.
    """
    _input = _INTENT_PROMPT.format_prompt(query=query)
    if _DEBUG:
        print(f"Input: {_input.to_string()}", file=sys.stderr)
    if model is None:
        model = OpenAI(temperature=0)
    output = model.invoke(_input.to_string())
    if _DEBUG:
        print(f"Intent output: {output}", file=sys.stderr)
    try:
        return _INTENT_PARSER.parse(output)
    except Exception:
        return None
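# Example usage (illustrative sketch; the exact label depends on the model):
#
#   intent = extract_intent("Give me a summary of yesterday's meetings")
#   if intent is not None:
#       print(intent.intent)  # e.g. "summary request"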
#######################
# Documents extractor #
#######################
class Documents(BaseModel):
    document_names: Sequence[str]
# Set up a parser + inject instructions into the prompt template.
_DOC_PARSER = PydanticOutputParser(pydantic_object=Documents)
# Prompt
_DOC_PROMPT = PromptTemplate(
    template="Based on the question, please choose the most relevant document(s) to provide a well-informed answer. Here is the list of documents to choose from:\n{documents_desc}\n{format_instructions}\nQuestion: {query}\n",
    input_variables=["query", "documents_desc"],
    partial_variables={"format_instructions": _DOC_PARSER.get_format_instructions()},
)
def extract_documents(
    query: str, documents_desc: str, model: Optional[BaseLLM] = None
) -> Optional[Documents]:
    """
    Extract the relevant document(s) from a query.
    """
    _input = _DOC_PROMPT.format_prompt(query=query, documents_desc=documents_desc)
    if _DEBUG:
        print(f"Input: {_input.to_string()}", file=sys.stderr)
    if model is None:
        model = OpenAI(temperature=0)
    output = model.invoke(_input.to_string())
    if _DEBUG:
        print(f"Output documents: {output}", file=sys.stderr)
    try:
        return _DOC_PARSER.parse(output)
    except Exception as excp:
        print(excp, file=sys.stderr)
        return None
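# Example usage (illustrative sketch; `documents_desc` is whatever listing the
# caller provides, e.g. one "name: description" entry per line):
#
#   docs = extract_documents(
#       "How do I deploy the service?",
#       documents_desc="deployment_guide: deployment steps\nuser_manual: end-user help",
#   )
#   if docs is not None:
#       print(docs.document_names)  # e.g. ["deployment_guide"]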
######################
# Sentence extractor #
######################
class Sentence(BaseModel):
    sentence: str
# Set up a parser + inject instructions into the prompt template.
_SENTENCE_PARSER = PydanticOutputParser(pydantic_object=Sentence)
# Prompt
_SENTENCE_PROMPT = PromptTemplate(
    template="Please rephrase the following sentence to remove any notion of time.\n{format_instructions}\nSentence: {query}\n",
    input_variables=["query"],
    partial_variables={
        "format_instructions": _SENTENCE_PARSER.get_format_instructions()
    },
)
def extract_sentence_no_time(
    query: str, model: Optional[BaseLLM] = None
) -> Optional[Sentence]:
    """
    Extract the sentence, rephrased without any notion of time, from a query.
    """
    _input = _SENTENCE_PROMPT.format_prompt(query=query)
    if _DEBUG:
        print(f"Input: {_input.to_string()}", file=sys.stderr)
    if model is None:
        model = OpenAI(temperature=0)
    output = model.invoke(_input.to_string())
    if _DEBUG:
        print(f"Output without time: {output}", file=sys.stderr)
    try:
        return _SENTENCE_PARSER.parse(output)
    except Exception:
        return None
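# Example usage (illustrative sketch; the exact rephrasing depends on the model):
#
#   sentence = extract_sentence_no_time("What did I write about GPUs yesterday?")
#   if sentence is not None:
#       print(sentence.sentence)  # e.g. "What did I write about GPUs?"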
#######################
# Step Back extractor #
#######################
# Prompt
# Few Shot Examples
_EXAMPLES = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel’s was born in what country?",
        "output": "what is Jan Sindel’s personal history?",
    },
]
# We now transform these into example chat messages
_EXAMPLE_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)
_FEW_SHOT_PROMPT = FewShotChatMessagePromptTemplate(
    example_prompt=_EXAMPLE_PROMPT,
    examples=_EXAMPLES,
)
_STEP_BACK_PROMPT = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer.\nHere are a few examples:""",
        ),
        # Few shot examples
        _FEW_SHOT_PROMPT,
        # New question
        ("user", "{question}"),
    ]
)
def extract_step_back(query: str, model: Optional[BaseLLM] = None) -> Optional[str]:
    """
    Extract a step-back question from a query.
    """
    _input = _STEP_BACK_PROMPT.format(question=query)
    if _DEBUG:
        print(f"Input: {_input}", file=sys.stderr)
    if model is None:
        model = OpenAI(temperature=0)
    output = model.invoke(_input)
    if _DEBUG:
        print(f"Step back output: {output}", file=sys.stderr)
    try:
        # The completion model usually prefixes its answer with "AI: ";
        # keep only the text after that marker.
        return output.split("AI: ")[1].strip()
    except Exception:
        return None
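# Example usage (illustrative sketch; the step-back question varies by model):
#
#   question = extract_step_back("In which repository did the team fix the login bug?")
#   if question is not None:
#       print(question)  # e.g. "what repositories does the team work on?"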
# extractors.py ends here