Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added filter_range parameter that allows to filter answers with similar start/end indices #680

Merged
merged 2 commits into from
Jan 12, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions examples/question_answering_filtering_similar_answers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Example: filtering near-duplicate answers in extractive question answering.
from pprint import pprint

from farm.infer import QAInferencer

QA_input = [
    {
        "questions": ["“In what country lies the Normandy?”"],
        "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. Weird things happen in Normandy, France."""
    }]

infer = QAInferencer.load("deepset/roberta-base-squad2", task_type="question_answering", gpu=True)

# Ask for several candidates so that near-duplicate answers can actually show up.
infer.model.prediction_heads[0].n_best = 5
infer.model.prediction_heads[0].n_best_per_sample = 5

# Duplicate filtering is controlled by filter_range:
#   5  -> two answers are handled as duplicates when their start indices or their end indices
#         differ by 5 or less.
#   0  -> only exact duplicates are filtered, i.e. answers that share the same start index or
#         the same end index.
#   -1 -> duplicate removal is turned off (default).
infer.model.prediction_heads[0].filter_range = 5

result = infer.inference_from_dicts(dicts=QA_input, return_json=False)

# return_json=False was requested above, so convert each result explicitly before printing.
for r in result:
    pprint(r.to_json())
17 changes: 17 additions & 0 deletions farm/modeling/prediction_head.py
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,7 @@ def __init__(self, layer_dims=[768,2],
context_window_size=100,
n_best=5,
n_best_per_sample=1,
filter_range=-1,
**kwargs):
"""
:param layer_dims: dimensions of Feed Forward block, e.g. [768,2], for adjusting to BERT embedding. Output should be always 2
Expand All @@ -947,6 +948,10 @@ def __init__(self, layer_dims=[768,2],
This is decoupled from n_best on document level, since predictions on passage level are very similar.
It should have a low value
:type n_best_per_sample: int
:param filter_range: Maximum distance between the start indices or between the end indices of two answers for them to be handled as duplicates.
Timoeller marked this conversation as resolved.
Show resolved Hide resolved
0 corresponds to exact duplicates.
-1 turns off duplicate removal.
:type filter_range: int
"""
super(QuestionAnsweringHead, self).__init__()
if len(kwargs) > 0:
Expand All @@ -964,6 +969,7 @@ def __init__(self, layer_dims=[768,2],
self.context_window_size = context_window_size
self.n_best = n_best
self.n_best_per_sample = n_best_per_sample
self.filter_range = filter_range
self.generate_config()


Expand Down Expand Up @@ -1125,6 +1131,8 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx):
# Initialize some variables
top_candidates = []
n_candidates = sorted_candidates.shape[0]
start_idx_candidates = set()
end_idx_candidates = set()

# Iterate over all candidates and break when we have all our n_best candidates
for candidate_idx in range(n_candidates):
Expand All @@ -1137,6 +1145,8 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx):
# Ignore no_answer scores which will be extracted later in this method
if start_idx == 0 and end_idx == 0:
continue
if self.filter_range > -1 and (start_idx in start_idx_candidates or end_idx in end_idx_candidates):
continue
score = start_end_matrix[start_idx, end_idx].item()
top_candidates.append(QACandidate(offset_answer_start=start_idx,
offset_answer_end=end_idx,
Expand All @@ -1145,6 +1155,13 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx):
offset_unit="token",
aggregation_level="passage",
passage_id=sample_idx))
if self.filter_range > -1:
for i in range(0, self.filter_range + 1):
start_idx_candidates.add(start_idx + i)
start_idx_candidates.add(start_idx - i)
end_idx_candidates.add(end_idx + i)
end_idx_candidates.add(end_idx - i)


no_answer_score = start_end_matrix[0, 0].item()
top_candidates.append(QACandidate(offset_answer_start=0,
Expand Down
97 changes: 97 additions & 0 deletions test/test_question_answering.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,106 @@ def test_id(span_inference_result, no_answer_inference_result):
assert no_answer_inference_result.id == "best_id_ever"


def test_duplicate_answer_filtering():
    """Check that ``filter_range = 0`` leaves no two answers with the same start or end offset.

    A filter range of 0 corresponds to exact-duplicate removal, so every start offset and
    every end offset among the returned answers must be unique.
    """
    QA_input = [
        {
            "questions": ["“In what country lies the Normandy?”"],
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. Weird things happen in Normandy, France."""
        }]

    base_LM_model = "deepset/bert-base-cased-squad2"

    inferencer = QAInferencer.load(base_LM_model, batch_size=2, gpu=False, task_type="question_answering",
                                   num_processes=0)

    # Request several candidates so that duplicates would appear if filtering did not work.
    head = inferencer.model.prediction_heads[0]
    head.n_best = 5
    head.n_best_per_sample = 5
    head.filter_range = 0

    result = inferencer.inference_from_dicts(dicts=QA_input)
    answers = result[0]["predictions"][0]["answers"]
    offset_answer_starts = [answer["offset_answer_start"] for answer in answers]
    offset_answer_ends = [answer["offset_answer_end"] for answer in answers]

    # A set drops repeated values, so equal lengths mean every offset is unique.
    assert len(offset_answer_starts) == len(set(offset_answer_starts))
    assert len(offset_answer_ends) == len(set(offset_answer_ends))


def test_no_duplicate_answer_filtering():
    """Check that duplicates remain in the output when ``filter_range = -1``.

    With duplicate removal turned off, the test expects at least one repeated start offset
    and at least one repeated end offset among the returned answers.
    """
    QA_input = [
        {
            "questions": ["“In what country lies the Normandy?”"],
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. Weird things happen in Normandy, France."""
        }]

    base_LM_model = "deepset/bert-base-cased-squad2"

    inferencer = QAInferencer.load(
        base_LM_model,
        batch_size=2,
        gpu=False,
        task_type="question_answering",
        num_processes=0,
    )

    qa_head = inferencer.model.prediction_heads[0]
    qa_head.n_best = 5
    qa_head.n_best_per_sample = 5
    # -1 switches duplicate removal off.
    qa_head.filter_range = -1

    result = inferencer.inference_from_dicts(dicts=QA_input)
    predicted_answers = result[0]["predictions"][0]["answers"]
    starts = [entry["offset_answer_start"] for entry in predicted_answers]
    ends = [entry["offset_answer_end"] for entry in predicted_answers]

    # A set collapses repeated offsets, so differing lengths prove duplicates are present.
    assert len(starts) != len(set(starts))
    assert len(ends) != len(set(ends))


def test_range_duplicate_answer_filtering():
    """Check that ``filter_range = 5`` keeps answer offsets more than 5 apart.

    After sorting, every pair of neighbouring start offsets and every pair of neighbouring
    end offsets of the returned answers must differ by more than the filter range.
    """
    QA_input = [
        {
            "questions": ["“In what country lies the Normandy?”"],
            "text": """The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\")
raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia.
The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries. Weird things happen in Normandy, France."""
        }]

    base_LM_model = "deepset/bert-base-cased-squad2"

    inferencer = QAInferencer.load(base_LM_model, batch_size=2, gpu=False, task_type="question_answering",
                                   num_processes=0)

    head = inferencer.model.prediction_heads[0]
    head.n_best = 5
    head.n_best_per_sample = 5
    head.filter_range = 5

    result = inferencer.inference_from_dicts(dicts=QA_input)

    # Leave out the no-answer prediction, which is presumably reported with start and end
    # offset 0 (TODO confirm against the inferencer output format). Filtering it here, rather
    # than calling list.remove(0), avoids a ValueError when no such entry is among the
    # returned answers.
    answers = [
        answer for answer in result[0]["predictions"][0]["answers"]
        if not (answer["offset_answer_start"] == 0 and answer["offset_answer_end"] == 0)
    ]

    offset_answer_starts = sorted(answer["offset_answer_start"] for answer in answers)
    distances_answer_starts = [j - i for i, j in zip(offset_answer_starts[:-1], offset_answer_starts[1:])]
    assert all(distance > head.filter_range for distance in distances_answer_starts)

    offset_answer_ends = sorted(answer["offset_answer_end"] for answer in answers)
    distances_answer_ends = [j - i for i, j in zip(offset_answer_ends[:-1], offset_answer_ends[1:])]
    assert all(distance > head.filter_range for distance in distances_answer_ends)


# Run the tests one after another when this file is executed as a script.
if __name__ == "__main__":
    test_training()
    test_save_load()
    test_inference_different_inputs()
    test_inference_objs()
    test_duplicate_answer_filtering()
    test_no_duplicate_answer_filtering()
    test_range_duplicate_answer_filtering()