
Question Answering improvements - NQ3 (#419)
* unify squad and nq baskets

* Clean id handling

* Add QAInference type hints

* add input_features test
brandenchan authored Jul 3, 2020
1 parent 68cbe01 commit f04c230
Showing 19 changed files with 378 additions and 241 deletions.
6 changes: 3 additions & 3 deletions examples/natural_questions.py
@@ -6,7 +6,7 @@
 from farm.data_handler.data_silo import DataSilo
 from farm.data_handler.processor import NaturalQuestionsProcessor
 from farm.file_utils import fetch_archive_from_http
-from farm.infer import Inferencer
+from farm.infer import QAInferencer
 from farm.modeling.adaptive_model import AdaptiveModel
 from farm.modeling.language_model import LanguageModel
 from farm.modeling.optimization import initialize_optimizer
@@ -68,7 +68,7 @@ def question_answering():
         max_seq_len=384,
         train_filename=train_filename,
         dev_filename=dev_filename,
-        keep_is_impossible=keep_is_impossible,
+        keep_no_answer=keep_is_impossible,
         downsample_context_size=downsample_context_size,
         data_dir=Path("../data/natural_questions"),
     )
@@ -131,7 +131,7 @@ def question_answering():
         }
     ]
 
-    model = Inferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq", batch_size=batch_size, gpu=True)
+    model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq", batch_size=batch_size, gpu=True)
     result = model.inference_from_dicts(dicts=QA_input, return_json=False) # result is a list of QAPred objects
 
     print(f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
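For orientation, here is a minimal sketch of how the renamed pieces fit together after this commit. The tokenizer checkpoint and the flag value are illustrative assumptions, not taken from the diff:

from pathlib import Path

from farm.data_handler.processor import NaturalQuestionsProcessor
from farm.infer import QAInferencer
from farm.modeling.tokenization import Tokenizer

# The processor flag is now keep_no_answer (previously keep_is_impossible)
tokenizer = Tokenizer.load(pretrained_model_name_or_path="deepset/roberta-base-squad2")  # assumed checkpoint
processor = NaturalQuestionsProcessor(
    tokenizer=tokenizer,
    max_seq_len=384,
    keep_no_answer=False,
    data_dir=Path("../data/natural_questions"),
)

# QA inference now goes through the dedicated QAInferencer instead of the generic Inferencer
model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
                          batch_size=40, gpu=True)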
4 changes: 2 additions & 2 deletions examples/question_answering.py
@@ -7,7 +7,7 @@
 from farm.data_handler.data_silo import DataSilo
 from farm.data_handler.processor import SquadProcessor
 from farm.data_handler.utils import write_squad_predictions
-from farm.infer import Inferencer
+from farm.infer import QAInferencer
 from farm.modeling.adaptive_model import AdaptiveModel
 from farm.modeling.language_model import LanguageModel
 from farm.modeling.optimization import initialize_optimizer
@@ -110,7 +110,7 @@ def question_answering():
"context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
}]

model = Inferencer.load(save_dir, batch_size=40, gpu=True)
model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
result = model.inference_from_dicts(dicts=QA_input)[0]

pprint.pprint(result)
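The two example scripts now consume the same QAInferencer differently: question_answering.py keeps the default return_json=True and receives plain dicts, while natural_questions.py passes return_json=False and receives QAPred objects. A hedged sketch of both call styles, with an illustrative model path:

from farm.infer import QAInferencer

model = QAInferencer.load("../saved_models/farm/roberta-base-squad2", batch_size=40, gpu=True)

QA_input = [{
    "qas": ["Who counted the game among the best ever made?"],
    "context": "GameTrailers in their review called it one of the greatest games ever created."
}]

# Default (return_json=True): JSON-style dicts, as in examples/question_answering.py
result = model.inference_from_dicts(dicts=QA_input)[0]

# return_json=False: a list of QAPred objects, as in examples/natural_questions.py
preds = model.inference_from_dicts(dicts=QA_input, return_json=False)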
120 changes: 51 additions & 69 deletions farm/data_handler/input_features.py
@@ -70,7 +70,6 @@ def sample_to_features_text(
     input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left)
     padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left)
 
-
     assert len(input_ids) == max_seq_len
     assert len(padding_mask) == max_seq_len
     assert len(segment_ids) == max_seq_len
@@ -307,10 +306,28 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T
     return [feature_dict]
 
 
-def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None, max_answers=6):
+def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks_mid,
+                          answer_type_list=None, max_answers=6):
     """ Prepares data for processing by the model. Supports cases where there are
     multiple answers for the one question/document pair. max_answers is by default set to 6 since
-    that is the most number of answers in the squad2.0 dev set."""
+    that is the most number of answers in the squad2.0 dev set.
+    :param sample: A Sample object that contains one question / passage pair
+    :type sample: Sample
+    :param tokenizer: A Tokenizer object
+    :type tokenizer: Tokenizer
+    :param max_seq_len: The maximum sequence length
+    :type max_seq_len: int
+    :param sp_toks_start: The number of special tokens that come before the question tokens
+    :type sp_toks_start: int
+    :param sp_toks_mid: The number of special tokens that come between the question and passage tokens
+    :type sp_toks_mid: int
+    :param answer_type_list: A list of all the answer types that can be expected, e.g. ["no_answer", "span", "yes", "no"] for Natural Questions
+    :type answer_type_list: List[str]
+    :param max_answers: The maximum number of answer annotations for a sample (in SQuAD, this is 6, hence the default)
+    :type max_answers: int
+    :return: dict (keys: [input_ids, padding_mask, segment_ids, answer_type_ids, passage_start_t, start_of_word, labels, id, seq_2_start_2])
+    """
 
     # Initialize some basic variables
     question_tokens = sample.tokenized["question_tokens"]
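To make the two new parameters concrete: in a BERT-style layout, [CLS] question [SEP] passage [SEP], one special token precedes the question and one separates question from passage, so sp_toks_start=1 and sp_toks_mid=1; RoBERTa's <s> question </s></s> passage </s> layout gives sp_toks_start=1 and sp_toks_mid=2. A small illustration of the offset arithmetic (the lookup table is hypothetical, not FARM code; the values follow from the layouts above):

# Hypothetical illustration: special-token counts for two common layouts
SPECIAL_TOKEN_COUNTS = {
    "BertTokenizer": (1, 1),     # [CLS] q [SEP] p [SEP]
    "RobertaTokenizer": (1, 2),  # <s> q </s></s> p </s>
}
sp_toks_start, sp_toks_mid = SPECIAL_TOKEN_COUNTS["BertTokenizer"]

# A passage-relative token index start_t then lands at sequence position
# sp_toks_start + question_len_t + sp_toks_mid + start_t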
@@ -329,9 +346,10 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None,
     labels, answer_types = generate_labels(answers,
                                            passage_len_t,
                                            question_len_t,
-                                           tokenizer,
-                                           answer_type_list=answer_type_list,
-                                           max_answers=max_answers)
+                                           max_answers,
+                                           sp_toks_start,
+                                           sp_toks_mid,
+                                           answer_type_list)
 
     # Generate a start of word vector for the full sequence (i.e. question + answer + special tokens).
     # This will allow us to perform evaluation during training without clear text.
@@ -382,10 +400,9 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None,
     if tokenizer.__class__.__name__ in ["XLMRobertaTokenizer", "RobertaTokenizer"]:
         segment_ids = np.zeros_like(segment_ids)
 
-    # Todo: explain how only the first of labels will be used in train, and the full array will be used in eval
-    # TODO Offset, start of word and spec_tok_mask are not actually needed by model.forward() but are needed for model.formatted_preds()
-    # TODO passage_start_t is index of passage's first token relative to document
-    # I don't think we actually need offsets anymore
+    # The first of the labels will be used in train, and the full array will be used in eval.
+    # start of word and spec_tok_mask are not actually needed by model.forward() but are needed for model.formatted_preds()
+    # passage_start_t is index of passage's first token relative to document
     feature_dict = {"input_ids": input_ids,
                     "padding_mask": padding_mask,
                     "segment_ids": segment_ids,
@@ -398,85 +415,50 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None,
     return [feature_dict]
 
 
-def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answers, answer_type_list=None):
+def generate_labels(answers, passage_len_t, question_len_t, max_answers,
+                    sp_toks_start, sp_toks_mid, answer_type_list=None):
     """
-    Creates QA label for each answer in answers. The labels are the index of the start and end token
+    Creates a QA label vector for each answer in answers. The labels are the index of the start and end token
     relative to the passage. They are contained in an array of size (max_answers, 2).
-    -1 used to fill array since there the number of answers is often less than max_answers.
+    -1 is used to fill the array since the number of answers is often less than max_answers.
     The index values take into consideration the question tokens, and also special tokens such as [CLS].
     When the answer is not fully contained in the passage, or the question
     is impossible to answer, the start_idx and end_idx are 0 i.e. start and end are on the very first token
-    (in most models, this is the [CLS] token). Note that in our implementation NQ has 4 labels
+    (in most models, this is the [CLS] token). Note that in our implementation NQ has 4 answer types
     ["no_answer", "yes", "no", "span"] and this is what answer_type_list should look like"""
 
+    # Note here that label_idxs get passed to the QuestionAnsweringHead and answer_types get passed to the text
+    # classification head. label_idxs may contain multiple start, end labels since SQuAD dev and test sets
+    # can have multiple annotations. By contrast, Natural Questions only has one annotation per sample,
+    # hence answer_types is only of length 1
     label_idxs = np.full((max_answers, 2), fill_value=-1)
-    answer_types = np.full((max_answers), fill_value=-1)
+    answer_types = np.asarray([-1])
+    answer_str = ""
 
     # If there are no answers
     if len(answers) == 0:
         label_idxs[0, :] = 0
         answer_types[:] = 0
         return label_idxs, answer_types
 
     # Iterate over the answers for the one sample
     for i, answer in enumerate(answers):
-        answer_type = answer["answer_type"]
         start_idx = answer["start_t"]
         end_idx = answer["end_t"]
 
-        # We are going to operate on one-hot label vectors which will later be converted back to label indices.
-        # This is to take advantage of tokenizer.encode_plus() which applies model dependent special token conventions.
-        # The two label vectors (start and end) are composed of sections that correspond to the question and
-        # passage tokens. These are initialized here. The section corresponding to the question
-        # will always be composed of 0s.
-        start_vec_question = [0] * question_len_t
-        end_vec_question = [0] * question_len_t
-        start_vec_passage = [0] * passage_len_t
-        end_vec_passage = [0] * passage_len_t
-
-        # If the answer is in the current passage, populate the label vector with 1s for start and end
+        # Check that the start and end are contained within this passage
         if answer_in_passage(start_idx, end_idx, passage_len_t):
-            start_vec_passage[start_idx] = 1
-            end_vec_passage[end_idx] = 1
-
-        # Combine the sections of the label vectors. The length of each of these will be:
-        # question_len_t + passage_len_t + n_special_tokens
-        start_vec = combine_vecs(start_vec_question,
-                                 start_vec_passage,
-                                 tokenizer,
-                                 spec_tok_val=0)
-        end_vec = combine_vecs(end_vec_question,
-                               end_vec_passage,
-                               tokenizer,
-                               spec_tok_val=0)
-
-        start_label_present = 1 in start_vec
-        end_label_present = 1 in end_vec
-
-        # This is triggered if the answer is not in the passage or the question warrants a no_answer
-        # In both cases, the token at idx=0 (in BERT, this is the [CLS] token) is given both the start and end label
-        if start_label_present is False and end_label_present is False:
-            start_vec[0] = 1
-            end_vec[0] = 1
-            answer_type = "no_answer"
-        elif start_label_present is False or end_label_present is False:
-            raise Exception("The label vectors are lacking either a start or end label")
-
-        # Ensure label vectors are one-hot
-        assert sum(start_vec) == 1
-        assert sum(end_vec) == 1
-
-        start_idx = start_vec.index(1)
-        end_idx = end_vec.index(1)
-
-        label_idxs[i, 0] = start_idx
-        label_idxs[i, 1] = end_idx
-
-        # Only Natural Questions trains a classification head on answer_type, SQuAD only has the QA head. answer_type_list
-        # will be None for SQuAD but something like ["no_answer", "span", "yes", "no"] for Natural Questions
-        if answer_type_list:
-            answer_types[i] = answer_type_list.index(answer_type)
-
-    assert np.max(label_idxs) > -1
+            label_idxs[i][0] = sp_toks_start + question_len_t + sp_toks_mid + start_idx
+            label_idxs[i][1] = sp_toks_start + question_len_t + sp_toks_mid + end_idx
+            answer_str = answer["answer_type"]
+        # If the start or end of the span answer is outside the passage, treat passage as no_answer
+        else:
+            label_idxs[i][0] = 0
+            label_idxs[i][1] = 0
+            answer_str = "no_answer"
+
+    if answer_type_list:
+        answer_types[0] = answer_type_list.index(answer_str)
 
     return label_idxs, answer_types

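As a sanity check on the new labeling scheme in generate_labels, here is a worked example under assumed values (BERT layout; the numbers are invented for illustration):

import numpy as np

# Assumed setup: [CLS] + 5 question tokens + [SEP]; answer spans passage tokens 3..7
sp_toks_start, sp_toks_mid = 1, 1
question_len_t = 5
max_answers = 6

label_idxs = np.full((max_answers, 2), fill_value=-1)
start_t, end_t = 3, 7

# Same arithmetic as the new generate_labels: shift the passage-relative indices
# past the leading special token, the question tokens and the separator
label_idxs[0][0] = sp_toks_start + question_len_t + sp_toks_mid + start_t  # == 10
label_idxs[0][1] = sp_toks_start + question_len_t + sp_toks_mid + end_t   # == 14

# Natural Questions additionally encodes the single annotated answer type as an index
answer_type_list = ["no_answer", "span", "yes", "no"]
answer_types = np.asarray([answer_type_list.index("span")])  # array([1])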

