From b8e553d075d647973e2d3f579e2fcc2bd81e4a4e Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 16 Jun 2020 17:17:02 +0200 Subject: [PATCH 01/40] unify squad and nq baskets --- farm/data_handler/processor.py | 9 ++-- farm/data_handler/samples.py | 73 -------------------------------- farm/modeling/prediction_head.py | 41 +++++------------- farm/modeling/predictions.py | 2 +- 4 files changed, 16 insertions(+), 109 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 6c3a8a403..2e7bac6c3 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1087,7 +1087,8 @@ def _dicts_to_baskets(self, dicts, indices): for index, document in zip(indices, dicts_tokenized): for q_idx, raw in enumerate(document): # In case of Question Answering the external ID is used for document IDs - basket = SampleBasket(raw=raw, id=f"{index}-{q_idx}", external_id=raw.get("document_id",None)) + id_str = str(raw.get("document_id", index)) + f"-{q_idx}" + basket = SampleBasket(raw=raw, id=id_str) baskets.append(basket) return baskets @@ -1099,7 +1100,7 @@ def apply_tokenization(self, dictionary): raw_baskets = [] dictionary = convert_qa_input_dict(dictionary) document_text = dictionary["context"] - document_id = dictionary.get("document_id",None) + document_id = dictionary.get("document_id", None) document_tokenized = tokenize_with_metadata(document_text, self.tokenizer) document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]] @@ -1146,7 +1147,7 @@ def apply_tokenization(self, dictionary): "question_start_of_word": question_start_of_word, "answers": answers, "answer_type": answer_type, - "squad_id": squad_id} + "external_id": squad_id} raw_baskets.append(raw) return raw_baskets @@ -1510,7 +1511,7 @@ def apply_tokenization(self, dictionary): "question_start_of_word": question_start_of_word, "answers": answers, "answer_type": answer_type, - "nq_id": nq_id} + "external_id": nq_id} raw_baskets.append(raw) return raw_baskets diff --git a/farm/data_handler/samples.py b/farm/data_handler/samples.py index 95b22ce3b..4d98ff0a7 100644 --- a/farm/data_handler/samples.py +++ b/farm/data_handler/samples.py @@ -23,7 +23,6 @@ def __init__(self, id: str, raw: dict, external_id=None, samples=None): :type samples: Sample """ self.id = id - self.external_id = external_id self.raw = raw self.samples = samples @@ -129,78 +128,6 @@ def create_sample_ner(split_text, label, basket_id): return [Sample(id=basket_id + "-0", clear_text={"text": text, "label": label})] -# TODO Remove - This has been superceded by create_samples_qa which can handle both Squad and Natural Questions -# def create_samples_squad(dictionary, max_query_len, max_seq_len, doc_stride, n_special_tokens): -# """ -# This method will split question-document pairs from the SampleBasket into question-passage pairs which will -# each form one sample. The "t" and "c" in variables stand for token and character respectively. -# """ -# -# # Initialize some basic variables -# # is_training = check_if_training(dictionary) -# question_tokens = dictionary["question_tokens"][:max_query_len] -# question_len_t = len(question_tokens) -# question_offsets = dictionary["question_offsets"] -# doc_tokens = dictionary["document_tokens"] -# doc_offsets = dictionary["document_offsets"] -# doc_text = dictionary["document_text"] -# doc_start_of_word = dictionary["document_start_of_word"] -# samples = [] -# -# # Calculate the number of tokens that can be reserved for the passage. 
This is calculated by considering -# # the max_seq_len, the number of tokens in the question and the number of special tokens that will be added -# # when the question and passage are joined (e.g. [CLS] and [SEP]) -# passage_len_t = max_seq_len - question_len_t - n_special_tokens -# -# # Perform chunking of document into passages. The sliding window moves in steps of doc_stride. -# # passage_spans is a list of dictionaries where each defines the start and end of each passage -# # on both token and character level -# passage_spans = chunk_into_passages(doc_offsets, -# doc_stride, -# passage_len_t, -# doc_text) -# for passage_span in passage_spans: -# # Unpack each variable in the dictionary. The "_t" and "_c" indicate -# # whether the index is on the token or character level -# passage_start_t = passage_span["passage_start_t"] -# passage_end_t = passage_span["passage_end_t"] -# passage_start_c = passage_span["passage_start_c"] -# passage_end_c = passage_span["passage_end_c"] -# passage_id = passage_span["passage_id"] -# -# # passage_offsets will be relative to the start of the passage (i.e. they will start at 0) -# # TODO: Is passage offsets actually needed? At this point, maybe we only care about token level -# passage_offsets = doc_offsets[passage_start_t: passage_end_t] -# passage_start_of_word = doc_start_of_word[passage_start_t: passage_end_t] -# passage_offsets = [x - passage_offsets[0] for x in passage_offsets] -# passage_tokens = doc_tokens[passage_start_t: passage_end_t] -# passage_text = dictionary["document_text"][passage_start_c: passage_end_c] -# -# # Deal with the potentially many answers (e.g. Squad dev set) -# answers_clear, answers_tokenized = process_answers(dictionary["answers"], -# doc_offsets, -# passage_start_c, -# passage_start_t) -# -# clear_text = {"passage_text": passage_text, -# "question_text": dictionary["question_text"], -# "passage_id": passage_id, -# "answers": answers_clear, -# "answer_type": dictionary["answer_type"]} -# tokenized = {"passage_start_t": passage_start_t, -# "passage_tokens": passage_tokens, -# "passage_offsets": passage_offsets, -# "passage_start_of_word": passage_start_of_word, -# "question_tokens": question_tokens, -# "question_offsets": question_offsets, -# "question_start_of_word": dictionary["question_start_of_word"][:max_query_len], -# "answers": answers_tokenized} -# samples.append(Sample(id=passage_id, -# clear_text=clear_text, -# tokenized=tokenized)) -# return samples - - def process_answers(answers, doc_offsets, passage_start_c, passage_start_t): """TODO Write Comment""" answers_clear = [] diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 1e9d1d87e..8cd03dd22 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1233,37 +1233,16 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): # Iterate over each set of document level prediction for pred_d, no_ans_gap, basket in zip(top_preds, no_ans_gaps, baskets): - # TODO the follow try catch is because of difference in Basket structure between NQ and SQuAD - resolve this!!! 
- # TODO This code is horrible - will be cleaned soon # Unpack document offsets, clear text and squad_id - try: - token_offsets = basket.samples[0].tokenized["document_offsets"] # NQ style - except KeyError: - token_offsets = basket.raw["document_offsets"] # SQuAD style - - try: - document_text = basket.raw["context"] # SQuAD style - except KeyError: - try: - document_text = basket.raw["text"] # NQ style - except KeyError: - document_text = basket.raw["document_text"] - - try: - question = basket.raw["questions"][0] # SQuAD style - except KeyError: - try: - question = basket.raw["qas"][0] # NQ style - except KeyError: - question = basket.raw["question_text"] - - try: - question_id = basket.raw["squad_id"] - except KeyError: - question_id = None # TODO add NQ id here - - basket_id = basket.id + token_offsets = basket.samples[0].tokenized["document_offsets"] + document_text = basket.raw.get("document_text", None) + question = basket.raw.get("question_text", None) + external_id = basket.id + if not document_text: + document_text = basket.raw.get("context", None) + if not question: + question = basket.raw.get("qas")[0] # Iterate over each prediction on the one document full_preds = [] @@ -1276,11 +1255,11 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): qa_answer.add_answer(pred_str) full_preds.append(qa_answer) n_samples = full_preds[0].n_samples_in_doc - curr_doc_pred = QAPred(id=basket_id, + curr_doc_pred = QAPred(id=external_id, prediction=full_preds, context=document_text, question=question, - question_id=question_id, + question_id=external_id, token_offsets=token_offsets, context_window_size=self.context_window_size, aggregation_level="document", diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index 7fedf7e02..ee52285b4 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -86,7 +86,7 @@ class QAPred(Pred): token_offsets: List[int] context_window_size: int #TODO only needed for to_json() - can we get rid context_window_size, TODO Do we really need this? aggregation_level: str - question_id: Optional[str] + question_id: Optional[str] # TODO Is this needed when we already have PRed.id? 
answer_types: Optional[List[str]] = [] ground_truth_answer: Optional[str] = None no_answer_gap: Optional[float] = None From 39bebc2f9182374de8cfe32d34e755898ae7626a Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 17 Jun 2020 15:03:29 +0200 Subject: [PATCH 02/40] attempt at simplifying ids --- farm/data_handler/input_features.py | 140 +--------------------------- farm/data_handler/processor.py | 55 ++++++----- farm/data_handler/samples.py | 5 +- farm/data_handler/utils.py | 20 +--- farm/eval.py | 4 +- farm/modeling/prediction_head.py | 15 +-- farm/modeling/predictions.py | 1 - 7 files changed, 49 insertions(+), 191 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index ca0850a77..092b9e6f8 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -12,9 +12,7 @@ from farm.data_handler.utils import ( expand_labels, pad, - mask_random_words, - convert_id -) + mask_random_words) from farm.modeling.tokenization import insert_at_special_tokens_pos logger = logging.getLogger(__name__) @@ -323,7 +321,7 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None, passage_start_of_word = sample.tokenized["passage_start_of_word"] passage_len_t = len(passage_tokens) answers = sample.tokenized["answers"] - sample_id = convert_id(sample.id) + sample_id = [int(x) for x in sample.id.split("-")] # Generates a numpy array of shape (max_answers, 2) where (i, 2) indexes into the start and end indices # of the ith answer. The array is filled with -1 since the number of answers is often less than max_answers @@ -392,10 +390,10 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None, "padding_mask": padding_mask, "segment_ids": segment_ids, "answer_type_ids": answer_types, - "id": sample_id, "passage_start_t": passage_start_t, "start_of_word": start_of_word, "labels": labels, + "id": sample_id, "seq_2_start_t": seq_2_start_t} return [feature_dict] @@ -527,138 +525,6 @@ def get_camembert_seq_2_start(input_ids): second_backslash_s = input_ids.index(6, first_backslash_s + 1) return second_backslash_s + 1 -def sample_to_features_squadOLD( - sample, tokenizer, max_seq_len, doc_stride, max_query_length, tasks, -): - sample.clear_text = DotMap(sample.clear_text, _dynamic=False) - is_training = sample.clear_text.is_training - - unique_id = 1000000000 - features = [] - - query_tokens = tokenizer.tokenize(sample.clear_text.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(sample.clear_text.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and sample.clear_text.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not sample.clear_text.is_impossible: - tok_start_position = orig_to_tok_index[sample.clear_text.start_position] - if sample.clear_text.end_position < len(sample.clear_text.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[sample.clear_text.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _SQUAD_improve_answer_span( - all_doc_tokens, - tok_start_position, - tok_end_position, - tokenizer, - 
sample.clear_text.orig_answer_text, - ) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_len - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"] - ) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - padding_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_len: - input_ids.append(0) - padding_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_len - assert len(padding_mask) == max_seq_len - assert len(segment_ids) == max_seq_len - - start_position = 0 - end_position = 0 - if is_training and not sample.clear_text.is_impossible: - # For training, if our document chunk does not contain an annotation - # we keep it but set the start and end position to unanswerable - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - if is_training and sample.clear_text.is_impossible: - start_position = 0 - end_position = 0 - - inp_feat = {} - inp_feat["input_ids"] = input_ids - inp_feat["padding_mask"] = padding_mask # attention_mask - inp_feat["segment_ids"] = segment_ids # token_type_ids - inp_feat["start_position"] = start_position - inp_feat["end_position"] = end_position - inp_feat["is_impossible"] = sample.clear_text.is_impossible - inp_feat["sample_id"] = sample.id - inp_feat["passage_shift"] = doc_span.start - features.append(inp_feat) - unique_id += 1 - - return features - def _SQUAD_improve_answer_span( doc_tokens, input_start, input_end, tokenizer, orig_answer_text diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 2e7bac6c3..381ad9e30 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -283,7 +283,7 @@ def _init_samples_in_baskets(self): try: basket.samples = self._dict_to_samples(dictionary=basket.raw, all_dicts=all_dicts) for num, sample in enumerate(basket.samples): - sample.id = f"{basket.id}-{num}" + sample.id = f"{basket.id_internal}-{num}" except: 
logger.error(f"Could not create sample(s) from this dict: \n {basket.raw}") raise @@ -308,7 +308,7 @@ def _create_dataset(self, keep_baskets=False): dataset, tensor_names = convert_features_to_dataset(features=features_flat) return dataset, tensor_names - def dataset_from_dicts(self, dicts, indices=None, rest_api_schema=False, return_baskets = False): + def dataset_from_dicts(self, dicts, indices=None, return_baskets = False): """ Contains all the functionality to turn a list of dict objects into a PyTorch Dataset and a list of tensor names. This can be used for inference mode. @@ -317,21 +317,15 @@ def dataset_from_dicts(self, dicts, indices=None, rest_api_schema=False, return_ :type dicts: list of dicts :return: a Pytorch dataset and a list of tensor names. """ - if rest_api_schema: - id_prefix = "infer" - else: - id_prefix = "train" # We need to add the index (coming from multiprocessing chunks) to have a unique basket ID - if indices: - self.baskets = [ - SampleBasket(raw=tr, id=f"{id_prefix}-{index}") - for (tr, index) in zip(dicts, indices) - ] - else: - self.baskets = [ - SampleBasket(raw=tr, id=f"{id_prefix}-{i}") - for (i, tr) in enumerate(dicts) - ] + + self.baskets = [] + for id_internal, d in enumerate(dicts): + id_external = self._id_from_dict(d) + if indices: + id_internal = indices[id_internal] + self.baskets.append(SampleBasket(raw=d, id_external=id_external, id_internal=id_internal)) + self._init_samples_in_baskets() self._featurize_samples() if indices: @@ -364,6 +358,21 @@ def _log_params(self): params.update({name: str(value)}) MlLogger.log_params(params) + @staticmethod + def _id_from_dict(d): + candidates = [] + candidates.append(d.get("example_id", None)) + candidates.append(d.get("external_id", None)) + candidates = [x for x in candidates if x] + if len(candidates) == 0: + return None + elif len(candidates) == 1: + return candidates[0] + else: + raise Exception + + + ######################################### # Processors for Text Classification #### @@ -1084,11 +1093,13 @@ def _dicts_to_baskets(self, dicts, indices): dicts_tokenized = [self.apply_tokenization(d) for d in dicts] baskets = [] + for index, document in zip(indices, dicts_tokenized): for q_idx, raw in enumerate(document): # In case of Question Answering the external ID is used for document IDs - id_str = str(raw.get("document_id", index)) + f"-{q_idx}" - basket = SampleBasket(raw=raw, id=id_str) + id_external = self._id_from_dict(raw) + id_internal = f"{index}-{q_idx}" + basket = SampleBasket(raw=raw, id_internal=id_internal, id_external=id_external) baskets.append(basket) return baskets @@ -1100,7 +1111,6 @@ def apply_tokenization(self, dictionary): raw_baskets = [] dictionary = convert_qa_input_dict(dictionary) document_text = dictionary["context"] - document_id = dictionary.get("document_id", None) document_tokenized = tokenize_with_metadata(document_text, self.tokenizer) document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]] @@ -1109,7 +1119,7 @@ def apply_tokenization(self, dictionary): answers = [] # For training and dev where labelled samples are read in from a SQuAD style file try: - squad_id = question["id"] + external_id = question["id"] question_text = question["question"] for answer in question["answers"]: if answer["text"] == "": @@ -1122,7 +1132,7 @@ def apply_tokenization(self, dictionary): answers.append(a) # For inference where samples are read in as dicts without an id or answers except TypeError: - squad_id = None + external_id = None question_text = question 
question_tokenized = tokenize_with_metadata(question_text, self.tokenizer) question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]] @@ -1140,14 +1150,13 @@ def apply_tokenization(self, dictionary): "document_tokens": document_tokenized["tokens"], "document_offsets": document_tokenized["offsets"], "document_start_of_word": document_start_of_word, - "document_id": document_id, "question_text": question_text, "question_tokens": question_tokenized["tokens"], "question_offsets": question_tokenized["offsets"], "question_start_of_word": question_start_of_word, "answers": answers, "answer_type": answer_type, - "external_id": squad_id} + "external_id": external_id} raw_baskets.append(raw) return raw_baskets diff --git a/farm/data_handler/samples.py b/farm/data_handler/samples.py index 4d98ff0a7..2be71d127 100644 --- a/farm/data_handler/samples.py +++ b/farm/data_handler/samples.py @@ -11,7 +11,7 @@ class SampleBasket: is needed for tasks like question answering where the source text can generate multiple input - label pairs.""" - def __init__(self, id: str, raw: dict, external_id=None, samples=None): + def __init__(self, id_internal: str, raw: dict, id_external=None, samples=None): """ :param id: A unique identifying id. Used for identification within FARM. :type id: str @@ -22,7 +22,8 @@ def __init__(self, id: str, raw: dict, external_id=None, samples=None): :param samples: An optional list of Samples used to populate the basket at initialization. :type samples: Sample """ - self.id = id + self.id_internal = id_internal + self.id_external = id_external self.raw = raw self.samples = samples diff --git a/farm/data_handler/utils.py b/farm/data_handler/utils.py index a50871e82..f099de4ad 100644 --- a/farm/data_handler/utils.py +++ b/farm/data_handler/utils.py @@ -786,25 +786,6 @@ def split_with_metadata(text): assert len(split_text) == len(indexes) return split_text, indexes - -def convert_id(id_string): - """ - Splits a string id into parts. If it is an id generated in the SQuAD pipeline it simple splits the id by the dashes - and converts the parts to ints. If it is generated by the non-SQuAD pipeline, it splits the id by the dashes and - converts references to "train" or "infer" into ints. - :param id_string: - :return: - """ - ret = [] - datasets = ["train", "infer"] - id_list = id_string.split("-") - for x in id_list: - if x in datasets: - ret.append(datasets.index(x)) - else: - ret.append(int(x)) - return ret - def convert_qa_input_dict(infer_dict): """ Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or ["text", "questions"] (api format). 
This function converts the latter into the former""" @@ -831,3 +812,4 @@ def convert_qa_input_dict(infer_dict): + diff --git a/farm/eval.py b/farm/eval.py index ce7240734..8985d0795 100644 --- a/farm/eval.py +++ b/farm/eval.py @@ -75,7 +75,6 @@ def eval(self, model, return_preds_and_labels=False): preds_all[head_num] += list(to_numpy(preds[head_num])) label_all[head_num] += list(to_numpy(labels[head_num])) if head.model_type == "span_classification": - ids_all[head_num] += list(to_numpy(batch["id"])) passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"])) # Evaluate per prediction head @@ -91,8 +90,7 @@ def eval(self, model, return_preds_and_labels=False): if hasattr(head, 'aggregate_preds'): preds_all[head_num], label_all[head_num] = head.aggregate_preds(preds=preds_all[head_num], labels=label_all[head_num], - passage_start_t=passage_start_t_all[head_num], - ids=ids_all[head_num]) + passage_start_t=passage_start_t_all[head_num]) result = {"loss": loss_all[head_num] / len(self.data_loader.dataset), "task_name": head.task_name} diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 8cd03dd22..3f788fe97 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1207,7 +1207,7 @@ def formatted_preds(self, logits=None, preds_p=None, baskets=None, **kwargs): assert logits is None, "Logits are not None, something is passed wrongly into formatted_preds() in infer.py" assert preds_p is not None, "No preds_p passed to formatted_preds()" samples = [s for b in baskets for s in b.samples] - ids = [s.id.split("-") for s in samples] + ids = [s.id for s in samples] passage_start_t = [s.features[0]["passage_start_t"] for s in samples] seq_2_start_t = [s.features[0]["seq_2_start_t"] for s in samples] @@ -1238,7 +1238,6 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): token_offsets = basket.samples[0].tokenized["document_offsets"] document_text = basket.raw.get("document_text", None) question = basket.raw.get("question_text", None) - external_id = basket.id if not document_text: document_text = basket.raw.get("context", None) if not question: @@ -1255,11 +1254,13 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): qa_answer.add_answer(pred_str) full_preds.append(qa_answer) n_samples = full_preds[0].n_samples_in_doc - curr_doc_pred = QAPred(id=external_id, + + pred_id = basket.id_external if basket.id_external else basket.id_internal + + curr_doc_pred = QAPred(id=pred_id, prediction=full_preds, context=document_text, question=question, - question_id=external_id, token_offsets=token_offsets, context_window_size=self.context_window_size, aggregation_level="document", @@ -1289,8 +1290,10 @@ def aggregate_preds(self, preds, passage_start_t, ids, seq_2_start_t=None, label # Iterate over the preds of each sample for sample_idx in range(n_samples): - id_1, id_2, _ = ids[sample_idx] - basket_id = f"{id_1}-{id_2}" + + # Remove the final number in id which corresponds to sample's id + basket_id = ids[sample_idx] + basket_id = "-".join(basket_id.split("-")[:-1]) # curr_passage_start_t is the token offset of the current passage # It will always be a multiple of doc_stride diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index ee52285b4..3adb66f52 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -86,7 +86,6 @@ class QAPred(Pred): token_offsets: List[int] context_window_size: int #TODO only needed for to_json() - can we get rid context_window_size, TODO Do we really need 
this? aggregation_level: str - question_id: Optional[str] # TODO Is this needed when we already have PRed.id? answer_types: Optional[List[str]] = [] ground_truth_answer: Optional[str] = None no_answer_gap: Optional[float] = None From eb6834a0c4204ac32f136611fec414911bd3b928 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 17 Jun 2020 16:49:30 +0200 Subject: [PATCH 03/40] clean id handling --- farm/data_handler/processor.py | 2 +- farm/eval.py | 10 ++++++++-- farm/modeling/prediction_head.py | 7 +++---- farm/modeling/predictions.py | 2 +- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 381ad9e30..89ee2c536 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1132,7 +1132,7 @@ def apply_tokenization(self, dictionary): answers.append(a) # For inference where samples are read in as dicts without an id or answers except TypeError: - external_id = None + external_id = self._id_from_dict(dictionary) question_text = question question_tokenized = tokenize_with_metadata(question_text, self.tokenizer) question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]] diff --git a/farm/eval.py b/farm/eval.py index 8985d0795..755ccdcfd 100644 --- a/farm/eval.py +++ b/farm/eval.py @@ -75,6 +75,7 @@ def eval(self, model, return_preds_and_labels=False): preds_all[head_num] += list(to_numpy(preds[head_num])) label_all[head_num] += list(to_numpy(labels[head_num])) if head.model_type == "span_classification": + ids_all[head_num] += list(to_numpy(batch["id"])) passage_start_t_all[head_num] += list(to_numpy(batch["passage_start_t"])) # Evaluate per prediction head @@ -88,9 +89,14 @@ def eval(self, model, return_preds_and_labels=False): preds_all[head_num] = mlb.fit_transform(preds_all[head_num]) label_all[head_num] = mlb.transform(label_all[head_num]) if hasattr(head, 'aggregate_preds'): + # Needed to convert NQ ids from np arrays to strings + ids_all_str = [x.astype(str) for x in ids_all[head_num]] + ids_all_list = [list(x) for x in ids_all_str] + head_ids = ["-".join(x) for x in ids_all_list] preds_all[head_num], label_all[head_num] = head.aggregate_preds(preds=preds_all[head_num], - labels=label_all[head_num], - passage_start_t=passage_start_t_all[head_num]) + labels=label_all[head_num], + passage_start_t=passage_start_t_all[head_num], + ids=head_ids) result = {"loss": loss_all[head_num] / len(self.data_loader.dataset), "task_name": head.task_name} diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 3f788fe97..4fe062b7c 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1288,12 +1288,11 @@ def aggregate_preds(self, preds, passage_start_t, ids, seq_2_start_t=None, label all_basket_preds = {} all_basket_labels = {} - # Iterate over the preds of each sample + # Iterate over the preds of each sample - remove final number which is the sample id and not needed for aggregation for sample_idx in range(n_samples): - - # Remove the final number in id which corresponds to sample's id basket_id = ids[sample_idx] - basket_id = "-".join(basket_id.split("-")[:-1]) + basket_id = basket_id.split("-")[:-1] + basket_id = "-".join(basket_id) # curr_passage_start_t is the token offset of the current passage # It will always be a multiple of doc_stride diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index 3adb66f52..b911160b4 100644 --- a/farm/modeling/predictions.py +++ 
b/farm/modeling/predictions.py @@ -98,7 +98,7 @@ def to_json(self, squad=False): "predictions": [ { "question": self.question, - "question_id": self.question_id, + "question_id": self.id, "ground_truth": None, "answers": answers, "no_ans_gap": self.no_answer_gap # Add no_ans_gap to current no_ans_boost for switching top prediction From 24b5747cac620464e5e13c47962e49d743e13f02 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 18 Jun 2020 11:01:08 +0200 Subject: [PATCH 04/40] Better handling of different input dicts --- farm/modeling/prediction_head.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 4fe062b7c..34e9f0149 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1236,12 +1236,23 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): # Unpack document offsets, clear text and squad_id token_offsets = basket.samples[0].tokenized["document_offsets"] - document_text = basket.raw.get("document_text", None) - question = basket.raw.get("question_text", None) - if not document_text: - document_text = basket.raw.get("context", None) - if not question: - question = basket.raw.get("qas")[0] + + # These options reflect the different input dicts that can be assigned to the basket + # before any kind of normalization or preprocessing can happen + question_names = ["question_text", "qas", "questions"] + doc_names = ["document_text", "context", "text"] + + def try_get(keys, dictionary): + for key in keys: + if key in dictionary: + ret = dictionary[key] + if type(ret) == list: + ret = ret[0] + return ret + return None + + document_text = try_get(doc_names, basket.raw) + question = try_get(question_names, basket.raw) # Iterate over each prediction on the one document full_preds = [] From c41ed347e836037c8d789a24885629ad0cbf73b3 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 18 Jun 2020 17:02:52 +0200 Subject: [PATCH 05/40] apply_tokenization merged --- farm/data_handler/processor.py | 182 ++++++++++++------------------- farm/data_handler/utils.py | 12 +- farm/modeling/prediction_head.py | 11 +- farm/utils.py | 10 ++ 4 files changed, 87 insertions(+), 128 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 9225ae628..9993d2a79 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -42,6 +42,7 @@ ) from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences from farm.utils import MLFlowLogger as MlLogger +from farm.utils import try_get logger = logging.getLogger(__name__) @@ -1103,62 +1104,8 @@ def _dicts_to_baskets(self, dicts, indices): baskets.append(basket) return baskets - def apply_tokenization(self, dictionary): - """ This performs tokenization on all documents and questions. The result is a list (unnested) - where each entry is a dictionary for one document-question pair (potentially mutliple answers). 
""" - - raw_baskets = [] - dictionary = convert_qa_input_dict(dictionary) - document_text = dictionary["context"] - - document_tokenized = tokenize_with_metadata(document_text, self.tokenizer) - document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]] - questions = dictionary["qas"] - for question in questions: - answers = [] - # For training and dev where labelled samples are read in from a SQuAD style file - try: - external_id = question["id"] - question_text = question["question"] - for answer in question["answers"]: - if answer["text"] == "": - answer_type = "is_impossible" - else: - answer_type = "span" - a = {"text": answer["text"], - "offset": answer["answer_start"], - "answer_type": answer_type} - answers.append(a) - # For inference where samples are read in as dicts without an id or answers - except TypeError: - external_id = self._id_from_dict(dictionary) - question_text = question - question_tokenized = tokenize_with_metadata(question_text, self.tokenizer) - question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]] - - # TODO for Squad and NQ, answer_type should be paired with the question and not the passage - # TODO make this change for both processors - if "is_impossible" not in question: - answer_type = "span" - else: - if question["is_impossible"]: - answer_type = "is_impossible" - else: - answer_type = "span" - raw = {"document_text": document_text, - "document_tokens": document_tokenized["tokens"], - "document_offsets": document_tokenized["offsets"], - "document_start_of_word": document_start_of_word, - "question_text": question_text, - "question_tokens": question_tokenized["tokens"], - "question_offsets": question_tokenized["offsets"], - "question_start_of_word": question_start_of_word, - "answers": answers, - "answer_type": answer_type, - "external_id": external_id} - raw_baskets.append(raw) - return raw_baskets + return apply_tokenization(dictionary, self.tokenizer) def file_to_dicts(self, file: str) -> [dict]: nested_dicts = read_squad_file(filename=file) @@ -1468,61 +1415,7 @@ def convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text): return start_c, end_c def apply_tokenization(self, dictionary): - """ This performs tokenization on all documents and questions. The result is a list - where each entry is a dictionary for one document-question pair (potentially mutliple answers). This is based on - the apply_tokenization method of SquadProcessor but slightly modified. - - TODO: See if this can be merged with SquadProcessor.apply_tokenization()""" - - raw_baskets = [] - # Input dictionaries can have ["context", "qas"] (SQuAD format) as keys or - # ["text", "questions"] (FARM format). 
Both are supported - dictionary = convert_qa_input_dict(dictionary) - document_text = dictionary["context"] - document_id = dictionary.get("document_id", None) - - document_tokenized = tokenize_with_metadata(document_text, self.tokenizer) - document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]] - questions = dictionary["qas"] - for question in questions: - answers = [] - # For training and dev with labelled examples - try: - nq_id = question["id"] - question_text = question["question"] - for answer in question["answers"]: - a = {"text": answer["text"], - "offset": answer["answer_start"], - "answer_type": question["answer_type"]} - answers.append(a) - # For inference where samples are read in without an id or answers - except TypeError: - nq_id = None - question_text = question - question_tokenized = tokenize_with_metadata(question_text, self.tokenizer) - question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]] - - # TODO compare data format with Squad to explain what this section is doing exactly - # TODO suspect that this might not be right for NQ - if "is_impossible" not in question: - answer_type = "span" - else: - answer_type = question["is_impossible"] - - raw = {"document_text": document_text, - "document_tokens": document_tokenized["tokens"], - "document_offsets": document_tokenized["offsets"], - "document_start_of_word": document_start_of_word, - "document_id": document_id, - "question_text": question_text, - "question_tokens": question_tokenized["tokens"], - "question_offsets": question_tokenized["offsets"], - "question_start_of_word": question_start_of_word, - "answers": answers, - "answer_type": answer_type, - "external_id": nq_id} - raw_baskets.append(raw) - return raw_baskets + return apply_tokenization(dictionary, self.tokenizer) def _sample_to_features(self, sample: Sample) -> dict: features = sample_to_features_qa(sample=sample, @@ -1681,3 +1574,72 @@ def _sample_to_features(self, sample) -> dict: tokenizer=self.tokenizer ) return features + + +def apply_tokenization(dictionary, tokenizer): + raw_baskets = [] + dictionary = convert_qa_input_dict(dictionary) + dictionary["qas"] = is_impossible_to_answer_type(dictionary["qas"]) + document_text = dictionary["context"] + + document_tokenized = tokenize_with_metadata(document_text, tokenizer) + document_start_of_word = [int(x) for x in document_tokenized["start_of_word"]] + questions = dictionary["qas"] + for question in questions: + answers = [] + # For training and dev with labelled examples + try: + external_id = question["id"] + question_text = question["question"] + for answer in question["answers"]: + if answer["text"] == "": + answer_type = "is_impossible" + else: + answer_type = "span" + a = {"text": answer["text"], + "offset": answer["answer_start"], + "answer_type": answer_type} + answers.append(a) + # For inference where samples are read in as dicts without an id or answers + except TypeError: + external_id = try_get(["example_id", "external_id"], dictionary) + question_text = question + + question_tokenized = tokenize_with_metadata(question_text, tokenizer) + question_start_of_word = [int(x) for x in question_tokenized["start_of_word"]] + + # During inference, there is no_answer type. 
Also, question might be a str instead of a dict + if type(question) == str: + answer_type = None + elif type(question) == dict: + answer_type = question.get("answer_type", None) + else: + raise Exception("Question was neither in str nor dict format") + + raw = {"document_text": document_text, + "document_tokens": document_tokenized["tokens"], + "document_offsets": document_tokenized["offsets"], + "document_start_of_word": document_start_of_word, + "question_text": question_text, + "question_tokens": question_tokenized["tokens"], + "question_offsets": question_tokenized["offsets"], + "question_start_of_word": question_start_of_word, + "answers": answers, + "answer_type": answer_type, + "external_id": external_id} + raw_baskets.append(raw) + return raw_baskets + + +def is_impossible_to_answer_type(qas): + """ Converts questions from having an is_impossible field to having an answer_type field""" + new_qas = [] + for q in qas: + answer_type = "span" + if "is_impossible" in q: + if q["is_impossible"] == True: + answer_type = "is_impossible" + del q["is_impossible"] + q["answer_type"] = answer_type + new_qas.append(q) + return new_qas diff --git a/farm/data_handler/utils.py b/farm/data_handler/utils.py index 7e00a4f75..59b60acbe 100644 --- a/farm/data_handler/utils.py +++ b/farm/data_handler/utils.py @@ -790,7 +790,9 @@ def split_with_metadata(text): def convert_qa_input_dict(infer_dict): """ Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or - ["text", "questions"] (api format). This function converts the latter into the former""" + ["text", "questions"] (api format). This function converts the latter into the former. It also converts the + is_impossible field to answer_type so that NQ and SQuAD dicts have the same format. 
+ """ try: # Check if infer_dict is already in internal json format if "context" in infer_dict and "qas" in infer_dict: @@ -802,16 +804,10 @@ def convert_qa_input_dict(infer_dict): qas = [{"question": q, "id": None, "answers": [], - "is_impossible": False} for i, q in enumerate(questions)] + "answer_type": None} for i, q in enumerate(questions)] converted = {"qas": qas, "context": text, "document_id":document_id} return converted except KeyError: raise Exception("Input does not have the expected format") - - - - - - diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 34e9f0149..a692cf44d 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -16,7 +16,7 @@ from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss from farm.data_handler.utils import is_json -from farm.utils import convert_iob_to_simple_tags, span_to_string +from farm.utils import convert_iob_to_simple_tags, span_to_string, try_get from farm.modeling.predictions import QACandidate, QAPred logger = logging.getLogger(__name__) @@ -1242,15 +1242,6 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): question_names = ["question_text", "qas", "questions"] doc_names = ["document_text", "context", "text"] - def try_get(keys, dictionary): - for key in keys: - if key in dictionary: - ret = dictionary[key] - if type(ret) == list: - ret = ret[0] - return ret - return None - document_text = try_get(doc_names, basket.raw) question = try_get(question_names, basket.raw) diff --git a/farm/utils.py b/farm/utils.py index 56f77581f..0183184f3 100644 --- a/farm/utils.py +++ b/farm/utils.py @@ -424,3 +424,13 @@ def span_to_string(start_t, end_t, token_offsets, clear_text): else: end_ch = token_offsets[end_t] return clear_text[start_ch: end_ch].strip(), start_ch, end_ch + + +def try_get(keys, dictionary): + for key in keys: + if key in dictionary: + ret = dictionary[key] + if type(ret) == list: + ret = ret[0] + return ret + return None \ No newline at end of file From 2508f8ca4126b16452fcd32bd0a779c81b017f48 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 18 Jun 2020 17:06:11 +0200 Subject: [PATCH 06/40] clean apply_tokenization --- farm/data_handler/processor.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 9993d2a79..ddf5af459 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1091,7 +1091,7 @@ def dataset_from_dicts(self, dicts, indices=None, return_baskets=False): def _dicts_to_baskets(self, dicts, indices): # Perform tokenization on documents and questions resulting in an unnested list of doc-question pairs - dicts_tokenized = [self.apply_tokenization(d) for d in dicts] + dicts_tokenized = [apply_tokenization(d, self.tokenizer) for d in dicts] baskets = [] @@ -1104,9 +1104,6 @@ def _dicts_to_baskets(self, dicts, indices): baskets.append(basket) return baskets - def apply_tokenization(self, dictionary): - return apply_tokenization(dictionary, self.tokenizer) - def file_to_dicts(self, file: str) -> [dict]: nested_dicts = read_squad_file(filename=file) dicts = [y for x in nested_dicts for y in x["paragraphs"]] @@ -1235,7 +1232,7 @@ def _dict_to_samples(self, dictionary: dict, all_dicts=None) -> [Sample]: if not self.inference: dictionary = self.prepare_dict(dictionary=dictionary) - dictionary_tokenized = self.apply_tokenization(dictionary)[0] + dictionary_tokenized = apply_tokenization(dictionary, self.tokenizer)[0] 
n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) samples = create_samples_qa(dictionary_tokenized, self.max_query_length, @@ -1414,9 +1411,6 @@ def convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text): end_c = len(span) return start_c, end_c - def apply_tokenization(self, dictionary): - return apply_tokenization(dictionary, self.tokenizer) - def _sample_to_features(self, sample: Sample) -> dict: features = sample_to_features_qa(sample=sample, tokenizer=self.tokenizer, From e7b4e412d796aa5201ea47af2ed5acf22f81737b Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Mon, 22 Jun 2020 11:31:35 +0200 Subject: [PATCH 07/40] Rename samples to passages --- farm/modeling/prediction_head.py | 28 ++++++++++++++-------------- farm/modeling/predictions.py | 22 ++++++++-------------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index a692cf44d..4da4885cb 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1143,7 +1143,7 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, answer_type="span", offset_unit="token", aggregation_level="passage", - sample_idx=sample_idx)) + passage_id=sample_idx)) no_answer_score = start_end_matrix[0, 0].item() top_candidates.append(QACandidate(offset_answer_start=0, @@ -1152,7 +1152,7 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, answer_type="is_impossible", offset_unit="token", aggregation_level="passage", - sample_idx=None)) + passage_id=None)) return top_candidates @@ -1255,7 +1255,7 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): document_text) qa_answer.add_answer(pred_str) full_preds.append(qa_answer) - n_samples = full_preds[0].n_samples_in_doc + n_samples = full_preds[0].n_passages_in_doc pred_id = basket.id_external if basket.id_external else basket.id_internal @@ -1268,8 +1268,8 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): aggregation_level="document", answer_types=[], # TODO no_answer_gap=no_ans_gap, - n_samples=n_samples - ) + n_passages=n_samples + ) ret.append(curr_doc_pred) return ret @@ -1377,8 +1377,8 @@ def reduce_preds(self, preds): answer_type=qa_answer.answer_type, offset_unit="token", aggregation_level="passage", - sample_idx=sample_idx, - n_samples_in_doc=n_samples) + passage_id=sample_idx, + n_passages_in_doc=n_samples) ) # TODO add switch for more variation in answers, e.g. if varied_ans then never return overlapping answers @@ -1400,8 +1400,8 @@ def reduce_preds(self, preds): answer_type="is_impossible", offset_unit="token", aggregation_level="document", - sample_idx=None, - n_samples_in_doc=n_samples) + passage_id=None, + n_passages_in_doc=n_samples) # Add no answer to positive answers, sort the order and return the n_best n_preds = [no_answer_pred] + pos_answer_dedup @@ -1500,7 +1500,7 @@ def prepare_labels(self, labels, start_of_word, **kwargs): @staticmethod def merge_formatted_preds(preds_all): """ Merges results from the two prediction heads used for NQ style QA. Takes the prediction from QA head and - assigns it the appropriate classification label. This mapping is achieved through sample_idx. + assigns it the appropriate classification label. This mapping is achieved through passage_id. preds_all should contain [QuestionAnsweringHead.formatted_preds(), TextClassificationHead()]. 
The first item of this list should be of len=n_documents while the second item should be of len=n_passages""" @@ -1517,16 +1517,16 @@ def chunk(iterable, lengths): cls_preds = preds_all[1][0]["predictions"] qa_preds = preds_all[0][0] - samples_per_doc = [doc_pred.n_samples for doc_pred in preds_all[0][0]] + samples_per_doc = [doc_pred.n_passages for doc_pred in preds_all[0][0]] cls_preds_grouped = chunk(cls_preds, samples_per_doc) for qa_doc_pred, cls_preds in zip(qa_preds, cls_preds_grouped): pred_qa_answers = qa_doc_pred.prediction pred_qa_answers_new = [] for pred_qa_answer in pred_qa_answers: - sample_idx = pred_qa_answer.sample_idx - if sample_idx is not None: - cls_pred = cls_preds[sample_idx]["label"] + passage_id = pred_qa_answer.passage_id + if passage_id is not None: + cls_pred = cls_preds[passage_id]["label"] # i.e. if is_impossible else: cls_pred = "is_impossible" diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index 752dc54de..9a24e5ae8 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -1,7 +1,5 @@ from farm.utils import span_to_string -from abc import ABC -from typing import List, Optional, Any -from pydantic import BaseModel +from typing import List, Any class Pred: """ @@ -36,13 +34,11 @@ def __init__(self, answer_support: str=None, offset_answer_support_start: int=None, offset_answer_support_end: int=None, - sample_idx: int=None, context: str=None, offset_context_start: int=None, offset_context_end: int=None, - n_samples_in_doc: int=None, - document_id: str=None, - passage_id: str=None + n_passages_in_doc: int=None, + passage_id: str=None, ): # self.answer_type can be "is_impossible", "yes", "no" or "span" self.answer_type = answer_type @@ -57,11 +53,10 @@ def __init__(self, # If self.answer_type is in ["yes", "no"] then self.answer_support is a text string # If self.answer is a string answer span or self.answer_type is "is_impossible", answer_support is None - # TODO sample_idx can probably be removed since we have passage_id self.answer_support = answer_support self.offset_answer_support_start = offset_answer_support_start self.offset_answer_support_end = offset_answer_support_end - self.sample_idx = sample_idx + self.passage_id = passage_id # self.context is the document or passage where the answer is found self.context = context @@ -73,8 +68,7 @@ def __init__(self, self.offset_unit = offset_unit self.aggregation_level = aggregation_level - self.n_samples_in_doc = n_samples_in_doc - self.document_id = document_id + self.n_passages_in_doc = n_passages_in_doc self.passage_id = passage_id @@ -94,7 +88,7 @@ def add_answer(self, string): assert self.offset_answer_start >= 0 def to_list(self): - return [self.answer, self.offset_answer_start, self.offset_answer_end, self.score, self.sample_idx] + return [self.answer, self.offset_answer_start, self.offset_answer_end, self.score, self.passage_id] class QAPred(Pred): @@ -115,7 +109,7 @@ def __init__(self, answer_types: List[str]=None, ground_truth_answer: str =None, no_answer_gap: float =None, - n_samples: int=None + n_passages: int=None ): super().__init__(id, prediction, context) self.question = question @@ -125,7 +119,7 @@ def __init__(self, self.answer_types = answer_types self.ground_truth_answer = ground_truth_answer self.no_answer_gap = no_answer_gap - self.n_samples = n_samples + self.n_passages = n_passages def to_json(self, squad=False): answers = self.answers_to_json(squad) From 56a71271d87356e691408053c55be663425e7ad1 Mon Sep 17 00:00:00 2001 From: Branden Chan 
Date: Mon, 22 Jun 2020 12:04:01 +0200 Subject: [PATCH 08/40] Clean id handling --- farm/modeling/predictions.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index 024dc3972..c6bbf1102 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -81,9 +81,9 @@ def add_cls(self, predicted_class: str): """ if predicted_class in ["yes", "no"] and self.answer != "is_impossible": + self.answer_support = self.answer self.answer = predicted_class self.answer_type = predicted_class - self.answer_support = self.answer self.offset_answer_support_start = self.offset_answer_start self.offset_answer_support_end = self.offset_answer_end @@ -137,7 +137,7 @@ def __init__(self, self.n_passages = n_passages def to_json(self, squad=False): - answers = self.answers_to_json(squad) + answers = self.answers_to_json(self.id, squad) ret = { "task": "qa", "predictions": [ @@ -152,21 +152,21 @@ def to_json(self, squad=False): } return ret - def answers_to_json(self, squad=False): + def answers_to_json(self, id, squad=False): ret = [] # iterate over the top_n predictions of the one document - for qa_answer in self.prediction: - string = qa_answer.answer - start_t = qa_answer.offset_answer_start - end_t = qa_answer.offset_answer_end + for qa_candidate in self.prediction: + string = qa_candidate.answer + start_t = qa_candidate.offset_answer_start + end_t = qa_candidate.offset_answer_end _, ans_start_ch, ans_end_ch = span_to_string(start_t, end_t, self.token_offsets, self.context) context_string, context_start_ch, context_end_ch = self.create_context(ans_start_ch, ans_end_ch, self.context) if squad: if string == "is_impossible": string = "" - curr = {"score": qa_answer.score, + curr = {"score": qa_candidate.score, "probability": None, "answer": string, "offset_answer_start": ans_start_ch, @@ -174,7 +174,7 @@ def answers_to_json(self, squad=False): "context": context_string, "offset_context_start": context_start_ch, "offset_context_end": context_end_ch, - "document_id": qa_answer.document_id} + "document_id": id} ret.append(curr) return ret From c33ce4dba56bd6e732ad764afa79e9c7aa59e2d2 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Mon, 22 Jun 2020 15:03:04 +0200 Subject: [PATCH 09/40] rename is_impossible to no_answer --- farm/data_handler/input_features.py | 8 ++-- farm/data_handler/processor.py | 64 ++++++++++++++--------------- farm/data_handler/samples.py | 28 ------------- farm/modeling/prediction_head.py | 8 ++-- farm/modeling/predictions.py | 10 ++--- 5 files changed, 45 insertions(+), 73 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 092b9e6f8..eaf09a507 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -407,7 +407,7 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe When the answer is not fully contained in the passage, or the question is impossible to answer, the start_idx and end_idx are 0 i.e. start and end are on the very first token (in most models, this is the [CLS] token). 
Note that in our implementation NQ has 4 labels - ["is_impossible", "yes", "no", "span"] and this is what answer_type_list should look like""" + ["no_answer", "yes", "no", "span"] and this is what answer_type_list should look like""" label_idxs = np.full((max_answers, 2), fill_value=-1) answer_types = np.full((max_answers), fill_value=-1) @@ -452,12 +452,12 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe start_label_present = 1 in start_vec end_label_present = 1 in end_vec - # This is triggered if the answer is not in the passage or the question is_impossible + # This is triggered if the answer is not in the passage or the question warrants a no_answer # In both cases, the token at idx=0 (in BERT, this is the [CLS] token) is given both the start and end label if start_label_present is False and end_label_present is False: start_vec[0] = 1 end_vec[0] = 1 - answer_type = "is_impossible" + answer_type = "no_answer" elif start_label_present is False or end_label_present is False: raise Exception("The label vectors are lacking either a start or end label") @@ -472,7 +472,7 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe label_idxs[i, 1] = end_idx # Only Natural Questions trains a classification head on answer_type, SQuAD only has the QA head. answer_type_list - # will be None for SQuAD but something like ["is_impossible", "span", "yes", "no"] for Natural Questions + # will be None for SQuAD but something like ["no_answer", "span", "yes", "no"] for Natural Questions if answer_type_list: answer_types[i] = answer_type_list.index(answer_type) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index c543a7b8b..64ae7a0dc 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1180,14 +1180,14 @@ def __init__( doc_stride=128, max_query_length=64, proxies=None, - keep_is_impossible=0.02, + keep_no_answer=0.02, downsample_context_size=None, inference=False, **kwargs): """ Deals with all the preprocessing steps needed for Natural Questions. Follows Alberti 2019 et al. (https://arxiv.org/abs/1901.08634) in merging multiple disjoint short answers into the one longer label span and also by downsampling - samples of is_impossible during training + samples of no_answer during training :param tokenizer: Used to split a sentence (str) into tokens. :param max_seq_len: Samples are truncated after this many tokens. @@ -1211,14 +1211,14 @@ def __init__( :type doc_stride: int :param max_query_length: Maximum length of the question (in number of subword tokens) :type max_query_length: int - :param keep_is_impossible: The probability that a sample with an is_impossible label is kept - (0.0 < keep_is_impossible <= 1.0). Only works if inference is False - :type keep_is_impossible: float + :param keep_no_answer: The probability that a sample with an no_answer label is kept + (0.0 < keep_no_answer <= 1.0). Only works if inference is False + :type keep_no_answer: float :param downsample_context_size: Downsampling before any data conversion by taking a short text window of size downsample_context_size around the long answer span. To disable set to None :type downsample_context_size: int :param inference: Whether we are currently using the Processsor for model inference. 
If True, the - keep_is_impossible will be overridden and set to 1 + keep_no_answer will be overridden and set to 1 :type inference: bool :param kwargs: placeholder for passing generic parameters :type kwargs: object @@ -1228,11 +1228,11 @@ def __init__( # These are classification labels from Natural Questions. Note that in this implementation, we are merging # the "long_answer" and "short_answer" labels into the one "span" label - self.answer_type_list = ["is_impossible", "span", "yes", "no"] + self.answer_type_list = ["no_answer", "span", "yes", "no"] self.doc_stride = doc_stride self.max_query_length = max_query_length - self.keep_is_impossible = keep_is_impossible + self.keep_no_answer = keep_no_answer self.downsample_context_size = downsample_context_size self.inference = inference @@ -1279,19 +1279,19 @@ def _dict_to_samples(self, dictionary: dict, all_dicts=None) -> [Sample]: self.max_seq_len, self.doc_stride, n_special_tokens) - # Downsample the number of samples with an is_impossible label. This fn will always return at least one sample + # Downsample the number of samples with an no_answer label. This fn will always return at least one sample # so that we don't end up with a basket with 0 samples if not self.inference: - samples = self.downsample(samples, self.keep_is_impossible) + samples = self.downsample(samples, self.keep_no_answer) return samples def downsample(self, samples, keep_prob): - # Downsamples samples with a is_impossible label (since there is an overrepresentation of these in NQ) + # Downsamples samples with a no_answer label (since there is an overrepresentation of these in NQ) # This method will always return at least one sample. This is done so that we don't end up with SampleBaskets # with 0 samples ret = [] for s in samples: - if self.check_is_impossible(s): + if self.check_no_answer_sample(s): if random_float() > 1 - keep_prob: ret.append(s) else: @@ -1300,21 +1300,6 @@ def downsample(self, samples, keep_prob): ret = [random.choice(samples)] return ret - @staticmethod - def check_is_impossible(sample): - sample_tok = sample.tokenized - if len(sample_tok["answers"]) == 0: - return True - first_answer = sample_tok["answers"][0] - if first_answer["start_t"] < sample_tok["passage_start_t"]: - return True - if first_answer["end_t"] > sample_tok["passage_start_t"] + len(sample_tok["passage_tokens"]): - return True - if first_answer["answer_type"] == "is_impossible": - return True - else: - return False - def downsample_unprocessed(self, dictionary): doc_text = dictionary["document_text"] doc_tokens = doc_text.split(" ") @@ -1359,7 +1344,7 @@ def downsample_unprocessed(self, dictionary): def prepare_dict(self, dictionary): """ Casts a Natural Questions dictionary that is loaded from a jsonl file into SQuAD format so that the same featurization functions can be called for both tasks. 
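A small, self-contained sketch (not part of the patch) of the no_answer downsampling behaviour described above, using plain dicts in place of Sample objects; the function name and dict fields are invented for illustration:

import random

def downsample_no_answer(samples, keep_prob):
    # Keep every positive sample; keep a no_answer sample only with probability keep_prob.
    kept = [s for s in samples if not s["is_no_answer"] or random.random() < keep_prob]
    # Never return an empty list, so the surrounding SampleBasket always has at least one sample.
    if not kept:
        kept = [random.choice(samples)]
    return kept

random.seed(0)
samples = [{"id": i, "is_no_answer": i % 10 != 0} for i in range(100)]
print(len(downsample_no_answer(samples, keep_prob=0.02)))  # roughly the 10 positives plus a couple of no_answer samples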
Each annotation can be one of four answer types, - ["yes", "no", "span", "is_impossible"]""" + ["yes", "no", "span", "no_answer"]""" if self.downsample_context_size is not None: dictionary = self.downsample_unprocessed(dictionary) @@ -1376,12 +1361,12 @@ def prepare_dict(self, dictionary): annotation["long_answer"]["end_token"], tok_to_ch, doc_text) - # Picks the span to be considered as annotation by choosing between short answer, long answer and is_impossible + # Picks the span to be considered as annotation by choosing between short answer, long answer and no_answer text, start_c = self.choose_span(sa_text, sa_start_c, la_text, la_start_c) converted_answers.append({"text": text, "answer_start": start_c}) if len(converted_answers) == 0: - answer_type = "is_impossible" + answer_type = "no_answer" else: answer_type = dictionary["annotations"][0]["yes_no_answer"].lower() if answer_type == "none": @@ -1404,6 +1389,21 @@ def check_no_answer(annotation): else: return True + @staticmethod + def check_no_answer_sample(sample): + sample_tok = sample.tokenized + if len(sample_tok["answers"]) == 0: + return True + first_answer = sample_tok["answers"][0] + if first_answer["start_t"] < sample_tok["passage_start_t"]: + return True + if first_answer["end_t"] > sample_tok["passage_start_t"] + len(sample_tok["passage_tokens"]): + return True + if first_answer["answer_type"] == "no_answer": + return True + else: + return False + def retrieve_long_answer(self, start_t, end_t, tok_to_ch, doc_text): """ Retrieves the string long answer and also its starting character index""" start_c, end_c = self.convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text) @@ -1627,7 +1627,7 @@ def apply_tokenization(dictionary, tokenizer): question_text = question["question"] for answer in question["answers"]: if answer["text"] == "": - answer_type = "is_impossible" + answer_type = "no_answer" else: answer_type = "span" a = {"text": answer["text"], @@ -1672,7 +1672,7 @@ def is_impossible_to_answer_type(qas): answer_type = "span" if "is_impossible" in q: if q["is_impossible"] == True: - answer_type = "is_impossible" + answer_type = "no_answer" del q["is_impossible"] q["answer_type"] = answer_type new_qas.append(q) diff --git a/farm/data_handler/samples.py b/farm/data_handler/samples.py index 2be71d127..5f3e3af12 100644 --- a/farm/data_handler/samples.py +++ b/farm/data_handler/samples.py @@ -91,27 +91,6 @@ def __str__(self): return s -class Squad_cleartext: - def __init__( - self, - qas_id, - question_text, - doc_tokens, - orig_answer_text, - start_position, - end_position, - is_impossible, - ): - - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def create_sample_one_label_one_text(raw_data, text_index, label_index, basket_id): # text = " ".join(raw_data[text_index:]) @@ -278,10 +257,3 @@ def offset_to_token_idx(token_offsets, ch_idx): for i in range(n_tokens): if (i + 1 == n_tokens) or (token_offsets[i] <= ch_idx < token_offsets[i + 1]): return i - - -def check_if_training(dictionary): - if "is_impossible" in dictionary: - return True - return False - diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index e19660914..cd47e4a81 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1149,7 +1149,7 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, 
top_candidates.append(QACandidate(offset_answer_start=0, offset_answer_end=0, score=no_answer_score, - answer_type="is_impossible", + answer_type="no_answer", offset_unit="token", aggregation_level="passage", passage_id=None)) @@ -1397,7 +1397,7 @@ def reduce_preds(self, preds): no_answer_pred = QACandidate(offset_answer_start=-1, offset_answer_end=-1, score=best_overall_positive_score - no_ans_gap, - answer_type="is_impossible", + answer_type="no_answer", offset_unit="token", aggregation_level="document", passage_id=None, @@ -1527,9 +1527,9 @@ def chunk(iterable, lengths): passage_id = pred_qa_answer.passage_id if passage_id is not None: cls_pred = cls_preds[passage_id]["label"] - # i.e. if is_impossible + # i.e. if no_answer else: - cls_pred = "is_impossible" + cls_pred = "no_answer" pred_qa_answer.add_cls(cls_pred) pred_qa_answers_new.append(pred_qa_answer) qa_doc_pred.prediction = pred_qa_answers_new diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index c6bbf1102..98af5ee78 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -40,7 +40,7 @@ def __init__(self, n_passages_in_doc: int=None, passage_id: str=None, ): - # self.answer_type can be "is_impossible", "yes", "no" or "span" + # self.answer_type can be "no_answer", "yes", "no" or "span" self.answer_type = answer_type self.score = score self.probability = probability @@ -52,7 +52,7 @@ def __init__(self, self.offset_answer_end = offset_answer_end # If self.answer_type is in ["yes", "no"] then self.answer_support is a text string - # If self.answer is a string answer span or self.answer_type is "is_impossible", answer_support is None + # If self.answer is a string answer span or self.answer_type is "no_answer", answer_support is None self.answer_support = answer_support self.offset_answer_support_start = offset_answer_support_start self.offset_answer_support_end = offset_answer_support_end @@ -80,7 +80,7 @@ def add_cls(self, predicted_class: str): :return: None """ - if predicted_class in ["yes", "no"] and self.answer != "is_impossible": + if predicted_class in ["yes", "no"] and self.answer != "no_answer": self.answer_support = self.answer self.answer = predicted_class self.answer_type = predicted_class @@ -94,7 +94,7 @@ def to_doc_level(self, start, end): def add_answer(self, string): if string == "": - self.answer = "is_impossible" + self.answer = "no_answer" assert self.offset_answer_end == -1 assert self.offset_answer_start == -1 else: @@ -164,7 +164,7 @@ def answers_to_json(self, id, squad=False): _, ans_start_ch, ans_end_ch = span_to_string(start_t, end_t, self.token_offsets, self.context) context_string, context_start_ch, context_end_ch = self.create_context(ans_start_ch, ans_end_ch, self.context) if squad: - if string == "is_impossible": + if string == "no_answer": string = "" curr = {"score": qa_candidate.score, "probability": None, From 55e0ad2feb9e2bdf9de8a51e4789f4ea29591d09 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Mon, 22 Jun 2020 16:18:20 +0200 Subject: [PATCH 10/40] Rename preds_p to preds --- farm/modeling/adaptive_model.py | 22 ++++++++++++---------- farm/modeling/prediction_head.py | 32 +++++++++++++++----------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/farm/modeling/adaptive_model.py b/farm/modeling/adaptive_model.py index aada3c358..87d39921d 100644 --- a/farm/modeling/adaptive_model.py +++ b/farm/modeling/adaptive_model.py @@ -89,14 +89,16 @@ def formatted_preds(self, logits, **kwargs): elif n_heads == 1: preds_final = [] - # 
TODO This is very specific to QA, make more general + # This try catch is to deal with the fact that sometimes we collect preds before passing it to + # formatted_preds (see Inferencer._get_predictions_and_aggregate()) and sometimes we don't + # (see Inferencer._get_predictions()) try: - preds_p = kwargs["preds_p"] - temp = [y[0] for y in preds_p] - preds_p_flat = [item for sublist in temp for item in sublist] - kwargs["preds_p"] = preds_p_flat + preds = kwargs["preds"] + temp = [y[0] for y in preds] + preds_flat = [item for sublist in temp for item in sublist] + kwargs["preds"] = preds_flat except KeyError: - kwargs["preds_p"] = None + kwargs["preds"] = None head = self.prediction_heads[0] logits_for_head = logits[0] preds = head.formatted_preds(logits=logits_for_head, **kwargs) @@ -109,17 +111,17 @@ def formatted_preds(self, logits, **kwargs): # This case is triggered by Natural Questions else: preds_final = [list() for _ in range(n_heads)] - preds = kwargs["preds_p"] + preds = kwargs["preds"] preds_for_heads = stack(preds) logits_for_heads = [None] * n_heads samples = [s for b in kwargs["baskets"] for s in b.samples] kwargs["samples"] = samples - del kwargs["preds_p"] + del kwargs["preds"] - for i, (head, preds_p_for_head, logits_for_head) in enumerate(zip(self.prediction_heads, preds_for_heads, logits_for_heads)): - preds = head.formatted_preds(logits=logits_for_head, preds_p=preds_p_for_head, **kwargs) + for i, (head, preds_for_head, logits_for_head) in enumerate(zip(self.prediction_heads, preds_for_heads, logits_for_heads)): + preds = head.formatted_preds(logits=logits_for_head, preds=preds_for_head, **kwargs) preds_final[i].append(preds) # Look for a merge() function amongst the heads and if a single one exists, apply it to preds_final diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index cd47e4a81..bd062cae4 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -367,22 +367,20 @@ def prepare_labels(self, **kwargs): labels = [self.label_list[int(x[0])] for x in label_ids] return labels - def formatted_preds(self, logits=None, preds_p=None, samples=None, return_class_probs=False, **kwargs): - """ Like QuestionAnsweringHead.formatted_preds(), this fn can operate on either logits or preds_p. This + def formatted_preds(self, logits=None, preds=None, samples=None, return_class_probs=False, **kwargs): + """ Like QuestionAnsweringHead.formatted_preds(), this fn can operate on either logits or preds. This is needed since at inference, the order of operations is very different depending on whether we are performing - aggregation or not (compare Inferencer._get_predictions() vs Inferencer._get_predictions_and_aggregate()) + aggregation or not (compare Inferencer._get_predictions() vs Inferencer._get_predictions_and_aggregate())""" - TODO: Preds_p should be renamed to preds""" + assert (logits is not None) or (preds is not None) - assert (logits is not None) or (preds_p is not None) - - # When this method is used along side a QAHead at inference (e.g. Natural Questions), preds_p is the input and + # When this method is used along side a QAHead at inference (e.g. 
Natural Questions), preds is the input and # there is currently no good way of generating probs if logits is not None: - preds_p = self.logits_to_preds(logits) + preds = self.logits_to_preds(logits) probs = self.logits_to_probs(logits, return_class_probs) else: - probs = [None] * len(preds_p) + probs = [None] * len(preds) # TODO this block has to do with the difference in Basket and Sample structure between SQuAD and NQ try: @@ -395,10 +393,10 @@ def formatted_preds(self, logits=None, preds_p=None, samples=None, return_class_ if len(contexts_b) != 0: contexts = ["|".join([a, b]) for a,b in zip(contexts, contexts_b)] - assert len(preds_p) == len(probs) == len(contexts) + assert len(preds) == len(probs) == len(contexts) res = {"task": "text_classification", "predictions": []} - for pred, prob, context in zip(preds_p, probs, contexts): + for pred, prob, context in zip(preds, probs, contexts): if not return_class_probs: pred_dict = { "start": None, @@ -1192,11 +1190,11 @@ def valid_answer_idxs(start_idx, end_idx, n_non_padding, max_answer_length, seq_ return False return True - def formatted_preds(self, logits=None, preds_p=None, baskets=None, **kwargs): - """ Takes a list of predictions, each corresponding to one sample, and converts them into document level + def formatted_preds(self, logits=None, preds=None, baskets=None, **kwargs): + """ Takes a list of passage level predictions, each corresponding to one sample, and converts them into document level predictions. Leverages information in the SampleBaskets. Assumes that we are being passed predictions from ALL samples in the one SampleBasket i.e. all passages of a document. Logits should be None, because we have - already converted the logits to predictions before calling formatted_preds + already converted the logits to predictions before calling formatted_preds. (see Inferencer._get_predictions_and_aggregate()). """ @@ -1205,17 +1203,17 @@ def formatted_preds(self, logits=None, preds_p=None, baskets=None, **kwargs): # seq_2_start_t is the token index of the first token in passage relative to the input sequence (i.e. number of # special tokens and question tokens that come before the passage tokens) assert logits is None, "Logits are not None, something is passed wrongly into formatted_preds() in infer.py" - assert preds_p is not None, "No preds_p passed to formatted_preds()" + assert preds is not None, "No preds passed to formatted_preds()" samples = [s for b in baskets for s in b.samples] ids = [s.id for s in samples] passage_start_t = [s.features[0]["passage_start_t"] for s in samples] seq_2_start_t = [s.features[0]["seq_2_start_t"] for s in samples] # Aggregate passage level predictions to create document level predictions. - # This method assumes that all passages of each document are contained in preds_p + # This method assumes that all passages of each document are contained in preds # i.e. that there are no incomplete documents. 
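A rough sketch (not part of the patch, and the exact shift rule is stated here as an assumption) of what aggregating passage-level predictions to document level means in terms of token indices: the question and special-token offset (seq_2_start_t) is removed and the passage's position in the document (passage_start_t) is added.

def to_doc_level(start_t, end_t, passage_start_t, seq_2_start_t):
    # Passage-level indices count from the start of the model input (special tokens + question + passage);
    # document-level indices count from the start of the full document text.
    doc_start = start_t - seq_2_start_t + passage_start_t
    doc_end = end_t - seq_2_start_t + passage_start_t
    return doc_start, doc_end

# An answer starting at input token 12, where the passage begins at input token 8
# and the passage itself starts at document token 100:
print(to_doc_level(12, 15, passage_start_t=100, seq_2_start_t=8))  # (104, 107)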
The output of this step # are prediction spans - preds_d = self.aggregate_preds(preds_p, passage_start_t, ids, seq_2_start_t) + preds_d = self.aggregate_preds(preds, passage_start_t, ids, seq_2_start_t) assert len(preds_d) == len(baskets) From 13087128a912a76617096f3b14b8b93ebc607706 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Mon, 22 Jun 2020 16:21:39 +0200 Subject: [PATCH 11/40] Add QAInference type hints --- farm/infer.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/farm/infer.py b/farm/infer.py index 890259b18..7c20d71c8 100644 --- a/farm/infer.py +++ b/farm/infer.py @@ -7,6 +7,7 @@ from torch.utils.data.sampler import SequentialSampler from tqdm import tqdm from transformers.configuration_auto import AutoConfig +from typing import Generator, List, Union from farm.data_handler.dataloader import NamedDataLoader from farm.data_handler.processor import Processor, InferenceProcessor, SquadProcessor, NERProcessor, TextClassificationProcessor @@ -15,7 +16,7 @@ from farm.modeling.adaptive_model import AdaptiveModel, BaseAdaptiveModel from farm.utils import initialize_device_settings from farm.utils import set_all_seeds, calc_chunksize, log_ascii_workers - +from farm.modeling.predictions import QAPred logger = logging.getLogger(__name__) @@ -563,7 +564,7 @@ def _get_predictions_and_aggregate(self, dataset, tensor_names, baskets): # can assume that we have only complete docs i.e. all the samples of one doc are in the current chunk logits = [None] preds_all = self.model.formatted_preds(logits=logits, # For QA we collected preds per batch and do not want to pass logits - preds_p=unaggregated_preds_all, + preds=unaggregated_preds_all, baskets=baskets) return preds_all @@ -593,6 +594,23 @@ def extract_vectors(self, dicts, extraction_strategy="cls_token", extraction_lay return self.inference_from_dicts(dicts) +class QAInferencer(Inferencer): + + def inference_from_dicts(self, + dicts, + return_json=True, + multiprocessing_chunksize=None, + streaming=False) -> Union[List[QAPred], Generator[QAPred]]: + return Inferencer.inference_from_dicts(dicts, return_json=True, multiprocessing_chunksize=None, streaming=False) + + def inference_from_file(self, + file, + multiprocessing_chunksize=None, + streaming=False, + return_json=True) -> Union[List[QAPred], Generator[QAPred]]: + return Inferencer.inference_from_file(file, return_json=True, multiprocessing_chunksize=None, streaming=False) + + class FasttextInferencer: def __init__(self, model, name=None): self.model = model From 61c5d7b95f5aff5dc9a66b074c1e7152f93914d9 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Mon, 22 Jun 2020 16:23:52 +0200 Subject: [PATCH 12/40] Adjust examples to new changes --- examples/natural_questions.py | 6 +++--- examples/question_answering.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/natural_questions.py b/examples/natural_questions.py index 1a8495351..c6e70b33d 100644 --- a/examples/natural_questions.py +++ b/examples/natural_questions.py @@ -6,7 +6,7 @@ from farm.data_handler.data_silo import DataSilo from farm.data_handler.processor import NaturalQuestionsProcessor from farm.file_utils import fetch_archive_from_http -from farm.infer import Inferencer +from farm.infer import QAInferencer from farm.modeling.adaptive_model import AdaptiveModel from farm.modeling.language_model import LanguageModel from farm.modeling.optimization import initialize_optimizer @@ -68,7 +68,7 @@ def question_answering(): max_seq_len=384, 
train_filename=train_filename, dev_filename=dev_filename, - keep_is_impossible=keep_is_impossible, + keep_no_answer=keep_is_impossible, downsample_context_size=downsample_context_size, data_dir=Path("../data/natural_questions"), ) @@ -131,7 +131,7 @@ def question_answering(): } ] - model = Inferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq", batch_size=batch_size, gpu=True) + model = QAInferencer.load(model_name_or_path="../saved_models/farm/roberta-base-squad2-nq", batch_size=batch_size, gpu=True) result = model.inference_from_dicts(dicts=QA_input, return_json=False) # result is a list of QAPred objects print(f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?" diff --git a/examples/question_answering.py b/examples/question_answering.py index bad2ba73c..0317780c8 100644 --- a/examples/question_answering.py +++ b/examples/question_answering.py @@ -7,7 +7,7 @@ from farm.data_handler.data_silo import DataSilo from farm.data_handler.processor import SquadProcessor from farm.data_handler.utils import write_squad_predictions -from farm.infer import Inferencer +from farm.infer import QAInferencer from farm.modeling.adaptive_model import AdaptiveModel from farm.modeling.language_model import LanguageModel from farm.modeling.optimization import initialize_optimizer @@ -110,7 +110,7 @@ def question_answering(): "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created." 
}] - model = Inferencer.load(save_dir, batch_size=40, gpu=True) + model = QAInferencer.load(save_dir, batch_size=40, gpu=True) result = model.inference_from_dicts(dicts=QA_input)[0] pprint.pprint(result) From 335b087cf5a872f9516005282476f84506caed18 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Mon, 22 Jun 2020 16:54:13 +0200 Subject: [PATCH 13/40] Fix type hint error --- farm/infer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/farm/infer.py b/farm/infer.py index 7c20d71c8..bba0b8bdb 100644 --- a/farm/infer.py +++ b/farm/infer.py @@ -600,14 +600,14 @@ def inference_from_dicts(self, dicts, return_json=True, multiprocessing_chunksize=None, - streaming=False) -> Union[List[QAPred], Generator[QAPred]]: + streaming=False) -> Union[List[QAPred], Generator[QAPred, None, None]]: return Inferencer.inference_from_dicts(dicts, return_json=True, multiprocessing_chunksize=None, streaming=False) def inference_from_file(self, file, multiprocessing_chunksize=None, streaming=False, - return_json=True) -> Union[List[QAPred], Generator[QAPred]]: + return_json=True) -> Union[List[QAPred], Generator[QAPred, None, None]]: return Inferencer.inference_from_file(file, return_json=True, multiprocessing_chunksize=None, streaming=False) From abb41308537c717725c52cf3387bac6b37161418 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 23 Jun 2020 16:23:06 +0200 Subject: [PATCH 14/40] Check that label character index matches label str --- farm/data_handler/processor.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 64ae7a0dc..237345189 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1147,6 +1147,8 @@ def _dicts_to_baskets(self, dicts, indices): def file_to_dicts(self, file: str) -> [dict]: nested_dicts = read_squad_file(filename=file) dicts = [y for x in nested_dicts for y in x["paragraphs"]] + for d in dicts: + assert valid_answer(d) return dicts def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: @@ -1677,3 +1679,15 @@ def is_impossible_to_answer_type(qas): q["answer_type"] = answer_type new_qas.append(q) return new_qas + +def valid_answer(dictionary): + context = dictionary["context"] + for qa in dictionary["qas"]: + for answer in qa["answers"]: + len_answer = len(answer["text"]) + start = answer["answer_start"] + end = answer["answer_start"] + len_answer + if context[start: end] != answer["text"]: + raise Exception(f"The answer extracted by start character index does not match the answer string: " + f"\t {context[start: end]} vs {answer['text']}") + return True From cb893b72493ed710c51da59ff2f44e7e933c8fa7 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 23 Jun 2020 16:47:03 +0200 Subject: [PATCH 15/40] Minor improvements --- farm/modeling/predictions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index 98af5ee78..beed83992 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -1,7 +1,9 @@ from farm.utils import span_to_string from typing import List, Any +from abc import ABC -class Pred: + +class Pred(ABC): """ Base class for predictions of every task. Note that it inherits from pydantic.BaseModel which creates an __init__() with the attributes defined in this class (i.e. 
id, prediction, context) @@ -115,7 +117,7 @@ class QAPred(Pred): def __init__(self, id: str, - prediction: List[Any], + prediction: List[QACandidate], context: str, question: str, token_offsets: List[int], From 1b9e6413d3fbc9c1321ca0a5608d029d4e21fd22 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 24 Jun 2020 12:32:51 +0200 Subject: [PATCH 16/40] Enforce single label doc cls in preprocessing --- farm/data_handler/input_features.py | 11 +++++------ farm/modeling/prediction_head.py | 4 +--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index eaf09a507..7f7f80743 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -410,7 +410,7 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe ["no_answer", "yes", "no", "span"] and this is what answer_type_list should look like""" label_idxs = np.full((max_answers, 2), fill_value=-1) - answer_types = np.full((max_answers), fill_value=-1) + answer_types = np.full((1), fill_value=-1) # If there are no answers if len(answers) == 0: @@ -419,7 +419,6 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe return label_idxs, answer_types for i, answer in enumerate(answers): - answer_type = answer["answer_type"] start_idx = answer["start_t"] end_idx = answer["end_t"] @@ -471,10 +470,10 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe label_idxs[i, 0] = start_idx label_idxs[i, 1] = end_idx - # Only Natural Questions trains a classification head on answer_type, SQuAD only has the QA head. answer_type_list - # will be None for SQuAD but something like ["no_answer", "span", "yes", "no"] for Natural Questions - if answer_type_list: - answer_types[i] = answer_type_list.index(answer_type) + # Only Natural Questions trains a classification head on answer_type, SQuAD only has the QA head. 
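A short illustration (not part of the patch) of the single answer_type label encoding that this comment describes, using the Natural Questions label list shown earlier; the helper name is invented:

import numpy as np

def encode_answer_type(answer_type, answer_type_list):
    # One classification label per sample; -1 when no classification head is trained (SQuAD).
    answer_types = np.full((1,), fill_value=-1)
    if answer_type_list:
        answer_types[0] = answer_type_list.index(answer_type)
    return answer_types

nq_labels = ["no_answer", "span", "yes", "no"]
print(encode_answer_type("span", nq_labels))  # [1]
print(encode_answer_type("span", None))       # [-1]  (SQuAD: only the QA head is trained)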
answer_type_list + # will be None for SQuAD but something like ["no_answer", "span", "yes", "no"] for Natural Questions + if answer_type_list: + answer_types[0] = answer_type_list.index(answers[0]["answer_type"]) assert np.max(label_idxs) > -1 diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index bd062cae4..7dc00ae53 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -335,9 +335,7 @@ def forward(self, X): def logits_to_loss(self, logits, **kwargs): label_ids = kwargs.get(self.label_tensor_name) - # In Natural Questions, each dev sample can have multiple labels - # For loss calculation we only use the first label - label_ids = label_ids.narrow(1,0,1) + label_ids = label_ids return self.loss_fct(logits, label_ids.view(-1)) def logits_to_probs(self, logits, return_class_probs, **kwargs): From 27e12ee172356e08a489aeafa8aa00272449c53e Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 24 Jun 2020 15:06:24 +0200 Subject: [PATCH 17/40] Refactor span_to_string, clean predictions objects --- farm/modeling/prediction_head.py | 38 ++++++++++++--------------- farm/modeling/predictions.py | 45 ++++++++++++++++++++++++-------- farm/utils.py | 25 ------------------ 3 files changed, 50 insertions(+), 58 deletions(-) diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 7dc00ae53..b284cc749 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1,22 +1,19 @@ -import itertools import json import logging import os import numpy as np -import pandas as pd -from scipy.special import expit, softmax -import tqdm + from pathlib import Path import torch from transformers.modeling_bert import BertForPreTraining, BertLayerNorm, ACT2FN from transformers.modeling_auto import AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoModelForSequenceClassification -from transformers.configuration_auto import AutoConfig +from typing import List from torch import nn from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss from farm.data_handler.utils import is_json -from farm.utils import convert_iob_to_simple_tags, span_to_string, try_get +from farm.utils import convert_iob_to_simple_tags, try_get from farm.modeling.predictions import QACandidate, QAPred logger = logging.getLogger(__name__) @@ -1243,14 +1240,11 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): # Iterate over each prediction on the one document full_preds = [] - for qa_answer, basket in zip(pred_d, baskets): - # This should be a method of Span - pred_str, _, _ = span_to_string(qa_answer.offset_answer_start, - qa_answer.offset_answer_end, - token_offsets, - document_text) - qa_answer.add_answer(pred_str) - full_preds.append(qa_answer) + for qa_candidate, basket in zip(pred_d, baskets): + pred_str, _, _ = qa_candidate.span_to_string(token_offsets, + document_text) + qa_candidate.add_answer(pred_str) + full_preds.append(qa_candidate) n_samples = full_preds[0].n_passages_in_doc pred_id = basket.id_external if basket.id_external else basket.id_internal @@ -1365,15 +1359,15 @@ def reduce_preds(self, preds): # Get all predictions in flattened list and sort by score pos_answers_flat = [] for sample_idx, passage_preds in enumerate(preds): - for qa_answer in passage_preds: - if not (qa_answer.offset_answer_start == -1 and qa_answer.offset_answer_end == -1): - pos_answers_flat.append(QACandidate(offset_answer_start=qa_answer.offset_answer_start, - offset_answer_end=qa_answer.offset_answer_end, - 
score=qa_answer.score, - answer_type=qa_answer.answer_type, + for qa_candidate in passage_preds: + if not (qa_candidate.offset_answer_start == -1 and qa_candidate.offset_answer_end == -1): + pos_answers_flat.append(QACandidate(offset_answer_start=qa_candidate.offset_answer_start, + offset_answer_end=qa_candidate.offset_answer_end, + score=qa_candidate.score, + answer_type=qa_candidate.answer_type, offset_unit="token", aggregation_level="passage", - passage_id=sample_idx, + passage_id=str(sample_idx), n_passages_in_doc=n_samples) ) @@ -1384,7 +1378,7 @@ def reduce_preds(self, preds): no_ans_gap = -min([nas - pbs for nas, pbs in zip(no_answer_scores, passage_best_score)]) # "no answer" scores and positive answers scores are difficult to compare, because - # + a positive answer score is related to a specific text qa_answer + # + a positive answer score is related to a specific text qa_candidate # - a "no answer" score is related to all input texts # Thus we compute the "no answer" score relative to the best possible answer and adjust it by # the most significant difference between scores. diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index beed83992..74c2ec515 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -1,12 +1,10 @@ -from farm.utils import span_to_string from typing import List, Any from abc import ABC class Pred(ABC): """ - Base class for predictions of every task. Note that it inherits from pydantic.BaseModel which creates an - __init__() with the attributes defined in this class (i.e. id, prediction, context) + Abstract base class for predictions of every task """ def __init__(self, id: str, @@ -21,8 +19,7 @@ def to_json(self): class QACandidate: """ - A single QA candidate answer. Note that it inherits from pydantic.BaseModel which builds the __init__() method. - See class definition to find list of compulsory and optional arguments and also comments on how they are used. + A single QA candidate answer. """ def __init__(self, answer_type: str, @@ -42,6 +39,7 @@ def __init__(self, n_passages_in_doc: int=None, passage_id: str=None, ): + # self.answer_type can be "no_answer", "yes", "no" or "span" self.answer_type = answer_type self.score = score @@ -73,6 +71,34 @@ def __init__(self, self.n_passages_in_doc = n_passages_in_doc self.passage_id = passage_id + def span_to_string(self, token_offsets, clear_text): + + start_t = self.offset_answer_start + end_t = self.offset_answer_end + + # If it is a no_answer prediction + if start_t == -1 and end_t == -1: + return "", 0, 0 + + n_tokens = len(token_offsets) + + # We do this to point to the beginning of the first token after the span instead of + # the beginning of the last token in the span + end_t += 1 + + # Predictions sometimes land on the very final special token of the passage. But there are no + # special tokens on the document level. We will just interpret this as a span that stretches + # to the end of the document + end_t = min(end_t, n_tokens) + + start_ch = token_offsets[start_t] + # i.e. pointing at the END of the last token + if end_t == n_tokens: + end_ch = len(clear_text) + else: + end_ch = token_offsets[end_t] + return clear_text[start_ch: end_ch].strip(), start_ch, end_ch + def add_cls(self, predicted_class: str): """ Adjust the final QA prediction depending on the prediction of the classification head (e.g. for binary answers in NQ) @@ -110,9 +136,7 @@ def to_list(self): class QAPred(Pred): """Question Answering predictions for a passage or a document. 
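The token-span-to-string conversion that QACandidate.span_to_string() performs above, written as a standalone sketch over invented example data (not part of the patch):

def span_to_string(start_t, end_t, token_offsets, clear_text):
    if start_t == -1 and end_t == -1:      # no_answer prediction
        return "", 0, 0
    n_tokens = len(token_offsets)
    end_t = min(end_t + 1, n_tokens)       # point just past the last token of the span
    start_ch = token_offsets[start_t]
    end_ch = len(clear_text) if end_t == n_tokens else token_offsets[end_t]
    return clear_text[start_ch:end_ch].strip(), start_ch, end_ch

text = "Normandy is a region in France ."
offsets = [0, 9, 12, 14, 21, 24, 31]        # start character of each whitespace token
print(span_to_string(5, 5, offsets, text))  # ('France', 24, 31)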
The self.prediction attribute is populated by a - list of QACandidate objects. Note that this object inherits from the Pred class which is why some of - the attributes are found in the Pred class and not here. Pred in turn inherits from pydantic.BaseModel - which creates an __init__() method. See class definition for required and optional arguments. + list of QACandidate objects. """ def __init__(self, @@ -160,10 +184,8 @@ def answers_to_json(self, id, squad=False): # iterate over the top_n predictions of the one document for qa_candidate in self.prediction: string = qa_candidate.answer - start_t = qa_candidate.offset_answer_start - end_t = qa_candidate.offset_answer_end - _, ans_start_ch, ans_end_ch = span_to_string(start_t, end_t, self.token_offsets, self.context) + _, ans_start_ch, ans_end_ch = qa_candidate.span_to_string(self.token_offsets, self.context) context_string, context_start_ch, context_end_ch = self.create_context(ans_start_ch, ans_end_ch, self.context) if squad: if string == "no_answer": @@ -180,6 +202,7 @@ def answers_to_json(self, id, squad=False): ret.append(curr) return ret + def create_context(self, ans_start_ch, ans_end_ch, clear_text): if ans_start_ch == 0 and ans_end_ch == 0: return "", 0, 0 diff --git a/farm/utils.py b/farm/utils.py index 8270245f7..4756f81c7 100644 --- a/farm/utils.py +++ b/farm/utils.py @@ -424,31 +424,6 @@ def stack(list_of_lists): ret[i] += (x) return ret -def span_to_string(start_t, end_t, token_offsets, clear_text): - - # If it is a no_answer prediction - if start_t == -1 and end_t == -1: - return "", 0, 0 - - n_tokens = len(token_offsets) - - # We do this to point to the beginning of the first token after the span instead of - # the beginning of the last token in the span - end_t += 1 - - # Predictions sometimes land on the very final special token of the passage. But there are no - # special tokens on the document level. We will just interpret this as a span that stretches - # to the end of the document - end_t = min(end_t, n_tokens) - - start_ch = token_offsets[start_t] - # i.e. 
pointing at the END of the last token - if end_t == n_tokens: - end_ch = len(clear_text) - else: - end_ch = token_offsets[end_t] - return clear_text[start_ch: end_ch].strip(), start_ch, end_ch - def try_get(keys, dictionary): for key in keys: From 61bc193ad1d363f497a3b2a1a2d650ad75e81fe8 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 24 Jun 2020 15:08:26 +0200 Subject: [PATCH 18/40] Remove unneccessary iteration --- farm/modeling/prediction_head.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index b284cc749..12de84ade 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1227,8 +1227,9 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): # Iterate over each set of document level prediction for pred_d, no_ans_gap, basket in zip(top_preds, no_ans_gaps, baskets): - # Unpack document offsets, clear text and squad_id + # Unpack document offsets, clear text and id token_offsets = basket.samples[0].tokenized["document_offsets"] + pred_id = basket.id_external if basket.id_external else basket.id_internal # These options reflect the different input dicts that can be assigned to the basket # before any kind of normalization or preprocessing can happen @@ -1240,14 +1241,12 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): # Iterate over each prediction on the one document full_preds = [] - for qa_candidate, basket in zip(pred_d, baskets): - pred_str, _, _ = qa_candidate.span_to_string(token_offsets, - document_text) + for qa_candidate in pred_d: + pred_str, _, _ = qa_candidate.span_to_string(token_offsets, document_text) qa_candidate.add_answer(pred_str) full_preds.append(qa_candidate) n_samples = full_preds[0].n_passages_in_doc - pred_id = basket.id_external if basket.id_external else basket.id_internal curr_doc_pred = QAPred(id=pred_id, prediction=full_preds, From acf83583f93f3e8612ee5dfd075d64f3f02ada4c Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 24 Jun 2020 17:38:59 +0200 Subject: [PATCH 19/40] WIP clean and document predictions.py --- farm/modeling/prediction_head.py | 5 +-- farm/modeling/predictions.py | 60 +++++++++++++++++++------------- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 12de84ade..3c82bba1b 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1247,7 +1247,6 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): full_preds.append(qa_candidate) n_samples = full_preds[0].n_passages_in_doc - curr_doc_pred = QAPred(id=pred_id, prediction=full_preds, context=document_text, @@ -1255,10 +1254,8 @@ def to_qa_preds(self, top_preds, no_ans_gaps, baskets): token_offsets=token_offsets, context_window_size=self.context_window_size, aggregation_level="document", - answer_types=[], # TODO no_answer_gap=no_ans_gap, - n_passages=n_samples - ) + n_passages=n_samples) ret.append(curr_doc_pred) return ret diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index 74c2ec515..e2fab0857 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -29,39 +29,42 @@ def __init__(self, offset_unit: str, aggregation_level: str, probability: float=None, - answer: str=None, - answer_support: str=None, - offset_answer_support_start: int=None, - offset_answer_support_end: int=None, - context: str=None, - offset_context_start: int=None, - offset_context_end: int=None, 
n_passages_in_doc: int=None, passage_id: str=None, ): + """ + :param answer_type: The category that this answer falls into e.g. "no_answer", "yes", "no" or "span" + :param score: The score representing the model's confidence of this answer + :param offset_answer_start: The index of the start of the answer span (whether it is char or tok is stated in self.offset_unit) + :param offset_answer_end: The index of the start of the answer span (whether it is char or tok is stated in self.offset_unit) + :param offset_unit: States whether the offsets refer to character or token indices + :param aggregation_level: States whether this candidate and its indices are on a passage level (pre aggregation) or on a document level (post aggregation) + :param probability: The probability the model assigns to the answer + :param n_passages_in_doc: Number of passages that make up the document + :param passage_id: The id of the passage which contains this candidate answer + """ # self.answer_type can be "no_answer", "yes", "no" or "span" self.answer_type = answer_type self.score = score self.probability = probability - # If self.answer_type is "span", self.answer is a string answer span + # If self.answer_type is "span", self.answer is a string answer (generated by self.span_to_string()) # Otherwise, it is None - self.answer = answer + self.answer = None self.offset_answer_start = offset_answer_start self.offset_answer_end = offset_answer_end # If self.answer_type is in ["yes", "no"] then self.answer_support is a text string # If self.answer is a string answer span or self.answer_type is "no_answer", answer_support is None - self.answer_support = answer_support - self.offset_answer_support_start = offset_answer_support_start - self.offset_answer_support_end = offset_answer_support_end - self.passage_id = passage_id + self.answer_support = None + self.offset_answer_support_start = None + self.offset_answer_support_end = None # self.context is the document or passage where the answer is found - self.context = context - self.offset_context_start = offset_context_start - self.offset_context_end = offset_context_end + self.context = None + self.offset_context_start = None + self.offset_context_end = None # Offset unit is either "token" or "char" # Aggregation level is either "doc" or "passage" @@ -71,7 +74,17 @@ def __init__(self, self.n_passages_in_doc = n_passages_in_doc self.passage_id = passage_id - def span_to_string(self, token_offsets, clear_text): + def span_to_string(self, token_offsets: List[int], clear_text: str): + """ + Generates a string answer span using self.offset_answer_start and self.offset_answer_end. If the candidate + is a no answer, an empty string is returned + + :param token_offsets: A list of ints which give the start character index of the corresponding token + :param clear_text: The text from which the answer span is to be extracted + :return: The string answer span, followed by the start and end character indices + """ + + assert self.offset_unit == "token" start_t = self.offset_answer_start end_t = self.offset_answer_end @@ -104,8 +117,7 @@ def add_cls(self, predicted_class: str): Adjust the final QA prediction depending on the prediction of the classification head (e.g. for binary answers in NQ) Currently designed so that the QA head's prediction will always be preferred over the Classification head - :param predicted_class: the predicted class value - :return: None + :param predicted_class: The predicted class e.g. 
"yes", "no", "no_answer", "span" """ if predicted_class in ["yes", "no"] and self.answer != "no_answer": @@ -147,15 +159,15 @@ def __init__(self, token_offsets: List[int], context_window_size: int, aggregation_level: str, - answer_types: List[str]=None, - ground_truth_answer: str =None, - no_answer_gap: float =None, - n_passages: int=None + no_answer_gap: float, + n_passages: int, + ground_truth_answer: str = None, + answer_types: List[str] = [], ): super().__init__(id, prediction, context) self.question = question self.token_offsets = token_offsets - self.context_window_size = context_window_size #TODO only needed for to_json() - can we get rid context_window_size, TODO Do we really need this? + self.context_window_size = context_window_size self.aggregation_level = aggregation_level self.answer_types = answer_types self.ground_truth_answer = ground_truth_answer From 102763ff3a25b1591bcd685a2bd3c05874376c25 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 25 Jun 2020 11:10:17 +0200 Subject: [PATCH 20/40] Add documentation of Pred objects --- farm/modeling/predictions.py | 47 +++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index e2fab0857..9cefa6c05 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -128,11 +128,14 @@ def add_cls(self, predicted_class: str): self.offset_answer_support_end = self.offset_answer_end def to_doc_level(self, start, end): + """ Populate the start and end indices with document level indices. Changes aggregation level to 'document'""" self.offset_answer_start = start self.offset_answer_end = end self.aggregation_level = "document" def add_answer(self, string): + """ Set the answer string. This method will check that the answer given is valid given the start + and end indices that are stored in the object. """ if string == "": self.answer = "no_answer" assert self.offset_answer_end == -1 @@ -147,8 +150,9 @@ def to_list(self): class QAPred(Pred): - """Question Answering predictions for a passage or a document. The self.prediction attribute is populated by a - list of QACandidate objects. + """ A set of QA predictions for a passage or a document. The candidates are stored in QAPred.prediction which is a + list of QACandidate objects. Also contains all attributes needed to convert the object into json format and also + to create a context window for a UI """ def __init__(self, @@ -164,6 +168,19 @@ def __init__(self, ground_truth_answer: str = None, answer_types: List[str] = [], ): + """ + :param id: The id of the passage or document + :param prediction: A list of QACandidate objects for the given question and document + :param context: The text passage from which the answer can be extracted + :param question: The question being posed + :param token_offsets: A list of ints indicating the start char index of each token + :param context_window_size: The number of chars on each side of the answer span that should be included in the context window + :param aggregation_level: States whether this candidate and its indices are on a passage level (pre aggregation) or on a document level (post aggregation) + :param no_answer_gap: How much the QuestionAnsweringHead.no_ans_boost needs to change to turn a no_answer to a positive answer + :param n_passages: Number of passages in the context document + :param ground_truth_answer: Ground truth answers + :param answer_types: List of answer_types supported by this task e.g. 
["span", "yes_no", "no_answer"] + """ super().__init__(id, prediction, context) self.question = question self.token_offsets = token_offsets @@ -175,6 +192,12 @@ def __init__(self, self.n_passages = n_passages def to_json(self, squad=False): + """ + Converts the information stored in the object into a json format. + + :param squad: If True, no_answers are represented by the empty string instead of "no_answer" + :return: + """ answers = self.answers_to_json(self.id, squad) ret = { "task": "qa", @@ -191,6 +214,13 @@ def to_json(self, squad=False): return ret def answers_to_json(self, id, squad=False): + """ + Convert all answers into a json format + + :param id: ID of the question document pair + :param squad: If True, no_answers are represented by the empty string instead of "no_answer" + :return: + """ ret = [] # iterate over the top_n predictions of the one document @@ -199,8 +229,7 @@ def answers_to_json(self, id, squad=False): _, ans_start_ch, ans_end_ch = qa_candidate.span_to_string(self.token_offsets, self.context) context_string, context_start_ch, context_end_ch = self.create_context(ans_start_ch, ans_end_ch, self.context) - if squad: - if string == "no_answer": + if squad and string == "no_answer": string = "" curr = {"score": qa_candidate.score, "probability": None, @@ -216,6 +245,16 @@ def answers_to_json(self, id, squad=False): def create_context(self, ans_start_ch, ans_end_ch, clear_text): + """ + Extract from the clear_text a window that contains the answer and some amount of text on either + side of the answer. Useful for cases where the answer and its surrounding context needs to be + displayed in a UI. + + :param ans_start_ch: Start character index of the answer + :param ans_end_ch: End character index of the answer + :param clear_text: The text from which the answer is extracted + :return: + """ if ans_start_ch == 0 and ans_end_ch == 0: return "", 0, 0 else: From e3d4bb6dd24b739468db0044d77ae0c6ddfd1df5 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 25 Jun 2020 11:48:57 +0200 Subject: [PATCH 21/40] Fix list index bug --- farm/modeling/prediction_head.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py index 3c82bba1b..f8442c602 100644 --- a/farm/modeling/prediction_head.py +++ b/farm/modeling/prediction_head.py @@ -1506,20 +1506,20 @@ def chunk(iterable, lengths): samples_per_doc = [doc_pred.n_passages for doc_pred in preds_all[0][0]] cls_preds_grouped = chunk(cls_preds, samples_per_doc) - for qa_doc_pred, cls_preds in zip(qa_preds, cls_preds_grouped): - pred_qa_answers = qa_doc_pred.prediction - pred_qa_answers_new = [] - for pred_qa_answer in pred_qa_answers: - passage_id = pred_qa_answer.passage_id + for qa_pred, cls_preds in zip(qa_preds, cls_preds_grouped): + qa_candidates = qa_pred.prediction + qa_candidates_new = [] + for qa_candidate in qa_candidates: + passage_id = qa_candidate.passage_id if passage_id is not None: - cls_pred = cls_preds[passage_id]["label"] + cls_pred = cls_preds[int(passage_id)]["label"] # i.e. 
if no_answer else: cls_pred = "no_answer" - pred_qa_answer.add_cls(cls_pred) - pred_qa_answers_new.append(pred_qa_answer) - qa_doc_pred.prediction = pred_qa_answers_new - ret.append(qa_doc_pred) + qa_candidate.add_cls(cls_pred) + qa_candidates_new.append(qa_candidate) + qa_pred.prediction = qa_candidates_new + ret.append(qa_pred) return ret From aa3333c4b71f7b04c5154172fb1781773a9b60cd Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 25 Jun 2020 12:25:20 +0200 Subject: [PATCH 22/40] Fix index in test sample --- test/samples/qa/train-sample.json | 2 +- test/test_question_answering.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/test/samples/qa/train-sample.json b/test/samples/qa/train-sample.json index 71610b81a..0a1d2b9ff 100644 --- a/test/samples/qa/train-sample.json +++ b/test/samples/qa/train-sample.json @@ -1 +1 @@ -{"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 53}], "is_impossible": false}], "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia."}]}]} \ No newline at end of file +{"data": [{"paragraphs": [{"qas": [{"question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", "answers": [{"text": "France", "answer_start": 159}], "is_impossible": false}], "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia."}]}]} \ No newline at end of file diff --git a/test/test_question_answering.py b/test/test_question_answering.py index d04ce0671..397c60e86 100644 --- a/test/test_question_answering.py +++ b/test/test_question_answering.py @@ -86,7 +86,6 @@ def test_qa(caplog=None): "context": "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. 
GameTrailers in their review called it one of the greatest games ever created.", }] - result1 = inferencer.inference_from_dicts(dicts=qa_format_1) result2 = inferencer.inference_from_dicts(dicts=qa_format_2) assert result1 == result2 From 1af739f048175098c2f614b538cfda591d17b27a Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 25 Jun 2020 12:27:10 +0200 Subject: [PATCH 23/40] Refactor data check --- farm/data_handler/processor.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 450af1bb6..d25f4f019 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1148,7 +1148,7 @@ def file_to_dicts(self, file: str) -> [dict]: nested_dicts = read_squad_file(filename=file) dicts = [y for x in nested_dicts for y in x["paragraphs"]] for d in dicts: - assert valid_answer(d) + check_valid_answer(d) return dicts def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: @@ -1681,7 +1681,7 @@ def is_impossible_to_answer_type(qas): return new_qas -def valid_answer(dictionary): +def check_valid_answer(dictionary): context = dictionary["context"] for qa in dictionary["qas"]: for answer in qa["answers"]: @@ -1691,5 +1691,4 @@ def valid_answer(dictionary): if context[start: end] != answer["text"]: raise Exception(f"The answer extracted by start character index does not match the answer string: " f"\t {context[start: end]} vs {answer['text']}") - return True From 0d096986850f7a4ee06abee71e0d3c15d6216668 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 25 Jun 2020 14:25:20 +0200 Subject: [PATCH 24/40] Fix docstring --- farm/modeling/predictions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/modeling/predictions.py b/farm/modeling/predictions.py index 0aec4de22..7994e2de8 100644 --- a/farm/modeling/predictions.py +++ b/farm/modeling/predictions.py @@ -185,7 +185,7 @@ def __init__(self, :param context: The text passage from which the answer can be extracted :param question: The question being posed :param token_offsets: A list of ints indicating the start char index of each token - :param context_window_size: The number of chars on each side of the answer span that should be included in the context window + :param context_window_size: The number of chars in the text window around the answer :param aggregation_level: States whether this candidate and its indices are on a passage level (pre aggregation) or on a document level (post aggregation) :param no_answer_gap: How much the QuestionAnsweringHead.no_ans_boost needs to change to turn a no_answer to a positive answer :param n_passages: Number of passages in the context document From 0725e37fc959a574d3558f0bd597af96c5c06a20 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 30 Jun 2020 17:23:03 +0200 Subject: [PATCH 25/40] Simplify QA generate_labels() --- farm/data_handler/input_features.py | 84 +++++++++-------------------- farm/data_handler/processor.py | 25 +++++++-- 2 files changed, 47 insertions(+), 62 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 7f7f80743..c2a021533 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -307,7 +307,8 @@ def samples_to_features_bert_lm(sample, max_seq_len, tokenizer, next_sent_pred=T return [feature_dict] -def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None, max_answers=6): +def sample_to_features_qa(sample, tokenizer, max_seq_len, 
sp_toks_start, sp_toks_mid, + answer_type_list=None, max_answers=6): """ Prepares data for processing by the model. Supports cases where there are multiple answers for the one question/document pair. max_answers is by default set to 6 since that is the most number of answers in the squad2.0 dev set.""" @@ -329,9 +330,10 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None, labels, answer_types = generate_labels(answers, passage_len_t, question_len_t, - tokenizer, - answer_type_list=answer_type_list, - max_answers=max_answers) + max_answers, + sp_toks_start, + sp_toks_mid, + answer_type_list) # Generate a start of word vector for the full sequence (i.e. question + answer + special tokens). # This will allow us to perform evaluation during training without clear text. @@ -398,7 +400,8 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None, return [feature_dict] -def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answers, answer_type_list=None): +def generate_labels(answers, passage_len_t, question_len_t, max_answers, + sp_toks_start, sp_toks_mid, answer_type_list=None): """ Creates QA label for each answer in answers. The labels are the index of the start and end token relative to the passage. They are contained in an array of size (max_answers, 2). @@ -409,8 +412,13 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe (in most models, this is the [CLS] token). Note that in our implementation NQ has 4 labels ["no_answer", "yes", "no", "span"] and this is what answer_type_list should look like""" + # Note here that label_idxs get passed to the QuestionAnsweringHead and answer_types get passed to the text + # classification head. label_idxs may contain multiple start, end labels since SQuAD dev and test sets + # can have multiple annotations. By contrast, Natural Questions only has one annotation per sample hence + # why answer_types is only of length 1 label_idxs = np.full((max_answers, 2), fill_value=-1) - answer_types = np.full((1), fill_value=-1) + answer_types = np.asarray([-1]) + answer_str = "" # If there are no answers if len(answers) == 0: @@ -418,64 +426,24 @@ def generate_labels(answers, passage_len_t, question_len_t, tokenizer, max_answe answer_types[:] = 0 return label_idxs, answer_types + # Iterate over the answers for the one sample for i, answer in enumerate(answers): start_idx = answer["start_t"] end_idx = answer["end_t"] - # We are going to operate on one-hot label vectors which will later be converted back to label indices. - # This is to take advantage of tokenizer.encode_plus() which applies model dependent special token conventions. - # The two label vectors (start and end) are composed of sections that correspond to the question and - # passage tokens. These are initialized here. The section corresponding to the question - # will always be composed of 0s. - start_vec_question = [0] * question_len_t - end_vec_question = [0] * question_len_t - start_vec_passage = [0] * passage_len_t - end_vec_passage = [0] * passage_len_t - - # If the answer is in the current passage, populate the label vector with 1s for start and end + # Check that the start and end are contained within this passage if answer_in_passage(start_idx, end_idx, passage_len_t): - start_vec_passage[start_idx] = 1 - end_vec_passage[end_idx] = 1 - - # Combine the sections of the label vectors. 
The length of each of these will be: - # question_len_t + passage_len_t + n_special_tokens - start_vec = combine_vecs(start_vec_question, - start_vec_passage, - tokenizer, - spec_tok_val=0) - end_vec = combine_vecs(end_vec_question, - end_vec_passage, - tokenizer, - spec_tok_val=0) - - start_label_present = 1 in start_vec - end_label_present = 1 in end_vec - - # This is triggered if the answer is not in the passage or the question warrants a no_answer - # In both cases, the token at idx=0 (in BERT, this is the [CLS] token) is given both the start and end label - if start_label_present is False and end_label_present is False: - start_vec[0] = 1 - end_vec[0] = 1 - answer_type = "no_answer" - elif start_label_present is False or end_label_present is False: - raise Exception("The label vectors are lacking either a start or end label") - - # Ensure label vectors are one-hot - assert sum(start_vec) == 1 - assert sum(end_vec) == 1 - - start_idx = start_vec.index(1) - end_idx = end_vec.index(1) - - label_idxs[i, 0] = start_idx - label_idxs[i, 1] = end_idx - - # Only Natural Questions trains a classification head on answer_type, SQuAD only has the QA head. answer_type_list - # will be None for SQuAD but something like ["no_answer", "span", "yes", "no"] for Natural Questions - if answer_type_list: - answer_types[0] = answer_type_list.index(answers[0]["answer_type"]) + label_idxs[i][0] = sp_toks_start + question_len_t + sp_toks_mid + start_idx + label_idxs[i][1] = sp_toks_start + question_len_t + sp_toks_mid + end_idx + answer_str = answer["answer_type"] + # If the start or end of the span answer is outside the passage, treat passage as no_answer + else: + label_idxs[i][0] = 0 + label_idxs[i][1] = 0 + answer_str = "no_answer" - assert np.max(label_idxs) > -1 + if answer_type_list: + answer_types[0] = answer_type_list.index(answer_str) return label_idxs, answer_types diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index d25f4f019..3f158653f 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -348,6 +348,7 @@ def _log_samples(self, n_samples): random_sample = random.choice(random_basket.samples) logger.info(random_sample) + def _log_params(self): params = { "processor": self.__class__.__name__, @@ -1035,7 +1036,20 @@ def estimate_n_samples(self, filepath, max_docs=500): # QA Processors #### ######################################### -class SquadProcessor(Processor): +class QAProcessor(Processor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.initialize_special_tokens_count() + + def initialize_special_tokens_count(self): + vec = self.tokenizer.build_inputs_with_special_tokens(token_ids_0=["a"], + token_ids_1=["b"]) + self.sp_toks_start = vec.index("a") + self.sp_toks_mid = vec.index("b") - self.sp_toks_start - 1 + self.sp_toks_end = len(vec) - vec.index("b") - 1 + + +class SquadProcessor(QAProcessor): """ Used to handle the SQuAD dataset""" def __init__( @@ -1164,10 +1178,12 @@ def _sample_to_features(self, sample) -> dict: # TODO, make this function return one set of features per sample features = sample_to_features_qa(sample=sample, tokenizer=self.tokenizer, - max_seq_len=self.max_seq_len) + max_seq_len=self.max_seq_len, + sp_toks_start=self.sp_toks_start, + sp_toks_mid=self.sp_toks_end) return features -class NaturalQuestionsProcessor(Processor): +class NaturalQuestionsProcessor(QAProcessor): """ Used to handle the Natural Question QA dataset""" def __init__( @@ -1254,7 +1270,6 @@ def __init__( 
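The simplified label computation above replaces the old one-hot vectors with plain index arithmetic: the passage-relative answer indices are shifted by the special tokens that open the sequence, the question tokens, and the special tokens between question and passage. A minimal sketch with example values, assuming a RoBERTa-style layout where sp_toks_start=1 and sp_toks_mid=2 (other tokenizers may differ):

# Illustrative sketch only: the index shift applied in generate_labels(),
# with made-up example values rather than real sample data.
sp_toks_start, sp_toks_mid = 1, 2       # special tokens before / between segments
question_len_t = 8                      # question length in tokens (example value)
answer_start_t, answer_end_t = 74, 78   # answer span relative to the passage

label_start = sp_toks_start + question_len_t + sp_toks_mid + answer_start_t  # 85
label_end = sp_toks_start + question_len_t + sp_toks_mid + answer_end_t      # 89
# 85 and 89 index into the full [special tokens + question + passage] sequence
print(label_start, label_end)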
self.add_task("question_answering", "squad", ["start_token", "end_token"]) self.add_task("text_classification", "f1_macro", self.answer_type_list, label_name="answer_type") - def file_to_dicts(self, file: str) -> [dict]: dicts = read_jsonl(file, proxies=self.proxies) return dicts @@ -1457,6 +1472,8 @@ def _sample_to_features(self, sample: Sample) -> dict: features = sample_to_features_qa(sample=sample, tokenizer=self.tokenizer, max_seq_len=self.max_seq_len, + sp_toks_start=self.sp_toks_start, + sp_toks_mid=self.sp_toks_mid, answer_type_list=self.answer_type_list) return features From 35c67f0f89c87b2c28153b91066c87f276a23109 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 30 Jun 2020 17:31:03 +0200 Subject: [PATCH 26/40] Rename internal methods --- farm/data_handler/processor.py | 46 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 3f158653f..83572347f 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1287,7 +1287,7 @@ def _dict_to_samples(self, dictionary: dict, all_dicts=None) -> [Sample]: """ # Turns a NQ dictionaries into a SQuAD style dictionaries if not self.inference: - dictionary = self.prepare_dict(dictionary=dictionary) + dictionary = self._prepare_dict(dictionary=dictionary) dictionary_tokenized = apply_tokenization(dictionary, self.tokenizer)[0] n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) @@ -1299,16 +1299,16 @@ def _dict_to_samples(self, dictionary: dict, all_dicts=None) -> [Sample]: # Downsample the number of samples with an no_answer label. This fn will always return at least one sample # so that we don't end up with a basket with 0 samples if not self.inference: - samples = self.downsample(samples, self.keep_no_answer) + samples = self._downsample(samples, self.keep_no_answer) return samples - def downsample(self, samples, keep_prob): + def _downsample(self, samples, keep_prob): # Downsamples samples with a no_answer label (since there is an overrepresentation of these in NQ) # This method will always return at least one sample. This is done so that we don't end up with SampleBaskets # with 0 samples ret = [] for s in samples: - if self.check_no_answer_sample(s): + if self._check_no_answer_sample(s): if random_float() > 1 - keep_prob: ret.append(s) else: @@ -1317,7 +1317,7 @@ def downsample(self, samples, keep_prob): ret = [random.choice(samples)] return ret - def downsample_unprocessed(self, dictionary): + def _downsample_unprocessed(self, dictionary): doc_text = dictionary["document_text"] doc_tokens = doc_text.split(" ") annotations = dictionary.get("annotations",[]) @@ -1325,7 +1325,7 @@ def downsample_unprocessed(self, dictionary): if len(annotations) == 1: annotation = annotations[0] # There seem to be cases where there is no answer but an annotation is given as a (-1, -1) long answer - if self.check_no_answer(annotation): + if self._check_no_answer(annotation): dictionary["document_text"] = " ".join(doc_tokens[:self.max_seq_len+randint(1,self.downsample_context_size)]) else: # finding earliest start and latest end labels @@ -1358,28 +1358,28 @@ def downsample_unprocessed(self, dictionary): return dictionary - def prepare_dict(self, dictionary): + def _prepare_dict(self, dictionary): """ Casts a Natural Questions dictionary that is loaded from a jsonl file into SQuAD format so that the same featurization functions can be called for both tasks. 
Each annotation can be one of four answer types, ["yes", "no", "span", "no_answer"]""" if self.downsample_context_size is not None: - dictionary = self.downsample_unprocessed(dictionary) + dictionary = self._downsample_unprocessed(dictionary) converted_answers = [] doc_text = dictionary["document_text"] _, tok_to_ch = split_with_metadata(doc_text) for annotation in dictionary["annotations"]: # There seem to be cases where there is no answer but an annotation is given as a (-1, -1) long answer - if self.check_no_answer(annotation): + if self._check_no_answer(annotation): continue - sa_text, sa_start_c = self.unify_short_answers(annotation["short_answers"], doc_text, tok_to_ch) - la_text, la_start_c = self.retrieve_long_answer(annotation["long_answer"]["start_token"], - annotation["long_answer"]["end_token"], - tok_to_ch, - doc_text) + sa_text, sa_start_c = self._unify_short_answers(annotation["short_answers"], doc_text, tok_to_ch) + la_text, la_start_c = self._retrieve_long_answer(annotation["long_answer"]["start_token"], + annotation["long_answer"]["end_token"], + tok_to_ch, + doc_text) # Picks the span to be considered as annotation by choosing between short answer, long answer and no_answer - text, start_c = self.choose_span(sa_text, sa_start_c, la_text, la_start_c) + text, start_c = self._choose_span(sa_text, sa_start_c, la_text, la_start_c) converted_answers.append({"text": text, "answer_start": start_c}) if len(converted_answers) == 0: @@ -1397,7 +1397,7 @@ def prepare_dict(self, dictionary): return converted @staticmethod - def check_no_answer(annotation): + def _check_no_answer(annotation): if annotation["long_answer"]["start_token"] > -1 or annotation["long_answer"]["end_token"] > -1: return False for sa in annotation["short_answers"]: @@ -1407,7 +1407,7 @@ def check_no_answer(annotation): return True @staticmethod - def check_no_answer_sample(sample): + def _check_no_answer_sample(sample): sample_tok = sample.tokenized if len(sample_tok["answers"]) == 0: return True @@ -1421,14 +1421,14 @@ def check_no_answer_sample(sample): else: return False - def retrieve_long_answer(self, start_t, end_t, tok_to_ch, doc_text): + def _retrieve_long_answer(self, start_t, end_t, tok_to_ch, doc_text): """ Retrieves the string long answer and also its starting character index""" - start_c, end_c = self.convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text) + start_c, end_c = self._convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text) text = doc_text[start_c: end_c] return text, start_c @staticmethod - def choose_span(sa_text, sa_start_c, la_text, la_start_c): + def _choose_span(sa_text, sa_start_c, la_text, la_start_c): if sa_text: return sa_text, sa_start_c elif la_text: @@ -1436,7 +1436,7 @@ def choose_span(sa_text, sa_start_c, la_text, la_start_c): else: return "", -1 - def unify_short_answers(self, short_answers, doc_text, tok_to_ch): + def _unify_short_answers(self, short_answers, doc_text, tok_to_ch): """ In cases where an NQ sample has multiple disjoint short answers, this fn generates the single shortest span that contains all the answers""" if not short_answers: @@ -1448,13 +1448,13 @@ def unify_short_answers(self, short_answers, doc_text, tok_to_ch): short_answer_idxs.append(short_answer["end_token"]) answer_start_t = min(short_answer_idxs) answer_end_t = max(short_answer_idxs) - answer_start_c, answer_end_c = self.convert_tok_to_ch(answer_start_t, answer_end_t, tok_to_ch, doc_text) + answer_start_c, answer_end_c = self._convert_tok_to_ch(answer_start_t, answer_end_t, tok_to_ch, doc_text) 
answer_text = doc_text[answer_start_c: answer_end_c] assert answer_text == " ".join(doc_text.split()[answer_start_t: answer_end_t]) return answer_text, answer_start_c @staticmethod - def convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text): + def _convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text): n_tokens = len(tok_to_ch) if start_t == -1 and end_t == -1: return -1, -1 From 28a66e7799763cf4506fb9ed17962e935d9db2fd Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 30 Jun 2020 17:40:39 +0200 Subject: [PATCH 27/40] update docstring --- farm/data_handler/input_features.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index c2a021533..9a425b360 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -311,7 +311,24 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks answer_type_list=None, max_answers=6): """ Prepares data for processing by the model. Supports cases where there are multiple answers for the one question/document pair. max_answers is by default set to 6 since - that is the most number of answers in the squad2.0 dev set.""" + that is the most number of answers in the squad2.0 dev set. + + :param sample: A Sample object that contains one question / passage pair + :type sample: Sample + :param tokenizer: A Tokenizer object + :type tokenizer: Tokenizer + :param max_seq_len: The maximum sequence length + :type max_seq_len: int + :param sp_toks_start: The number of special tokens that come before the question tokens + :type sp_toks_start: int + :param sp_toks_mid: The number of special tokens that come between the question and passage tokens + :type sp_toks_mid: int + :param answer_type_list: A list of all the answer types that can be expected e.g. ["no_answer", "span", "yes", "no"] for Natural Questions + :type answer_type_list: List[str] + :param max_answers: The maximum number of answer annotations for a sample (In SQuAD, this is 6 hence the default) + :type max_answers: int + :return: dict (keys: [input_ids, padding_mask, segment_ids, answer_type_ids, passage_start_t, start_of_word, labels, id, seq_2_start_2]) + """ # Initialize some basic variables question_tokens = sample.tokenized["question_tokens"] @@ -403,13 +420,13 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks def generate_labels(answers, passage_len_t, question_len_t, max_answers, sp_toks_start, sp_toks_mid, answer_type_list=None): """ - Creates QA label for each answer in answers. The labels are the index of the start and end token + Creates QA label vector for each answer in answers. The labels are the index of the start and end token relative to the passage. They are contained in an array of size (max_answers, 2). - -1 used to fill array since there the number of answers is often less than max_answers. + -1 is used to fill array since there the number of answers is often less than max_answers. The index values take in to consideration the question tokens, and also special tokens such as [CLS]. When the answer is not fully contained in the passage, or the question is impossible to answer, the start_idx and end_idx are 0 i.e. start and end are on the very first token - (in most models, this is the [CLS] token). Note that in our implementation NQ has 4 labels + (in most models, this is the [CLS] token). 
Note that in our implementation NQ has 4 answer types ["no_answer", "yes", "no", "span"] and this is what answer_type_list should look like""" # Note here that label_idxs get passed to the QuestionAnsweringHead and answer_types get passed to the text From 651f8af9f72d622f906da626e604e8ffa3fd3d94 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 1 Jul 2020 15:07:14 +0200 Subject: [PATCH 28/40] add input_features test --- test/samples/qa/no_answer/clear_text.json | 1 + test/samples/qa/no_answer/features.json | 1 + test/samples/qa/no_answer/tokenized.json | 1 + test/samples/qa/span/clear_text.json | 1 + test/samples/qa/span/features.json | 1 + test/samples/qa/span/tokenized.json | 1 + test/test_input_features.py | 45 +++++++++++++++++++++++ 7 files changed, 51 insertions(+) create mode 100644 test/samples/qa/no_answer/clear_text.json create mode 100644 test/samples/qa/no_answer/features.json create mode 100644 test/samples/qa/no_answer/tokenized.json create mode 100644 test/samples/qa/span/clear_text.json create mode 100644 test/samples/qa/span/features.json create mode 100644 test/samples/qa/span/tokenized.json create mode 100644 test/test_input_features.py diff --git a/test/samples/qa/no_answer/clear_text.json b/test/samples/qa/no_answer/clear_text.json new file mode 100644 index 000000000..58e508c38 --- /dev/null +++ b/test/samples/qa/no_answer/clear_text.json @@ -0,0 +1 @@ +{"passage_text": "Note: The green arrows (), red arrows (), and blue dashes () represent changes in rank when compared to the new 2012 data HDI for 2011 \u2013 published in the 2012 report.", "question_text": "What dashes do not represent changes in rank? ", "passage_id": 0, "answers": []} \ No newline at end of file diff --git a/test/samples/qa/no_answer/features.json b/test/samples/qa/no_answer/features.json new file mode 100644 index 000000000..f5c1a8534 --- /dev/null +++ b/test/samples/qa/no_answer/features.json @@ -0,0 +1 @@ +{"input_ids": [0, 2264, 385, 14829, 109, 45, 3594, 1022, 11, 7938, 116, 2, 2, 27728, 35, 20, 2272, 36486, 49038, 1275, 36486, 49038, 8, 2440, 385, 14829, 36418, 3594, 1022, 11, 7938, 77, 1118, 7, 5, 92, 1125, 414, 7951, 100, 13, 1466, 126, 1027, 11, 5, 1125, 266, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "padding_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "segment_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "answer_type_ids": [0], "passage_start_t": 0, "start_of_word": [0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "labels": [[0, 0], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]], "id": [1735, 0, 0], "seq_2_start_t": 13} \ No newline at end of file diff --git a/test/samples/qa/no_answer/tokenized.json b/test/samples/qa/no_answer/tokenized.json new file mode 100644 index 000000000..799aab2de --- /dev/null +++ 
b/test/samples/qa/no_answer/tokenized.json @@ -0,0 +1 @@ +{"passage_start_t": 0, "passage_tokens": ["Note", ":", "\u0120The", "\u0120green", "\u0120arrows", "\u0120(),", "\u0120red", "\u0120arrows", "\u0120(),", "\u0120and", "\u0120blue", "\u0120d", "ashes", "\u0120()", "\u0120represent", "\u0120changes", "\u0120in", "\u0120rank", "\u0120when", "\u0120compared", "\u0120to", "\u0120the", "\u0120new", "\u01202012", "\u0120data", "\u0120HD", "I", "\u0120for", "\u01202011", "\u0120\u00e2\u0122\u0135", "\u0120published", "\u0120in", "\u0120the", "\u01202012", "\u0120report", "."], "passage_offsets": [0, 4, 6, 10, 16, 23, 27, 31, 38, 42, 46, 51, 52, 58, 61, 71, 79, 82, 87, 92, 101, 104, 108, 112, 117, 122, 124, 126, 130, 135, 137, 147, 150, 154, 159, 165], "passage_start_of_word": [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0], "question_tokens": ["What", "\u0120d", "ashes", "\u0120do", "\u0120not", "\u0120represent", "\u0120changes", "\u0120in", "\u0120rank", "?"], "question_offsets": [0, 5, 6, 12, 15, 19, 29, 37, 40, 44], "question_start_of_word": [1, 1, 0, 1, 1, 1, 1, 1, 1, 0], "answers": [], "document_offsets": [0, 4, 6, 10, 16, 23, 27, 31, 38, 42, 46, 51, 52, 58, 61, 71, 79, 82, 87, 92, 101, 104, 108, 112, 117, 122, 124, 126, 130, 135, 137, 147, 150, 154, 159, 165]} \ No newline at end of file diff --git a/test/samples/qa/span/clear_text.json b/test/samples/qa/span/clear_text.json new file mode 100644 index 000000000..d07a1c903 --- /dev/null +++ b/test/samples/qa/span/clear_text.json @@ -0,0 +1 @@ +{"passage_text": "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. 
Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".", "question_text": "When did Beyonce start becoming popular?", "passage_id": 0, "answers": [{"text": "in the late 1990s", "start_c": 269, "end_c": 285}]} \ No newline at end of file diff --git a/test/samples/qa/span/features.json b/test/samples/qa/span/features.json new file mode 100644 index 000000000..eec1f38be --- /dev/null +++ b/test/samples/qa/span/features.json @@ -0,0 +1 @@ +{"input_ids": [0, 1779, 222, 12674, 1755, 386, 1959, 1406, 116, 2, 2, 40401, 261, 12695, 272, 354, 6591, 10690, 1634, 12, 43732, 48229, 5605, 43621, 16948, 49066, 267, 35423, 10659, 282, 1090, 35423, 10278, 73, 19417, 12, 975, 2191, 12, 28357, 43, 36, 5400, 772, 204, 6, 14130, 43, 16, 41, 470, 3250, 6, 2214, 9408, 6, 638, 3436, 8, 3390, 4, 8912, 8, 1179, 11, 2499, 6, 1184, 6, 79, 3744, 11, 1337, 6970, 8, 7950, 9150, 25, 10, 920, 6, 8, 1458, 7, 9444, 11, 5, 628, 4525, 29, 25, 483, 3250, 9, 248, 947, 387, 1816, 12, 13839, 23313, 18, 7442, 4, 1554, 4628, 30, 69, 1150, 6, 4101, 16152, 10690, 1634, 6, 5, 333, 1059, 65, 9, 5, 232, 18, 275, 12, 11393, 1816, 1134, 9, 70, 86, 4, 2667, 25224, 794, 5, 800, 9, 12674, 12695, 18, 2453, 2642, 6, 34880, 9412, 11, 3437, 36, 35153, 238, 61, 2885, 69, 25, 10, 5540, 3025, 3612, 6, 2208, 292, 12727, 4229, 8, 3520, 5, 18919, 6003, 727, 346, 12, 1264, 7695, 22, 347, 36616, 11, 3437, 113, 8, 22, 30047, 5637, 845, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "padding_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "segment_ids": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "answer_type_ids": [-1], "passage_start_t": 0, "start_of_word": [0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "labels": [[85, 89], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1]], "id": [0, 0, 0], "seq_2_start_t": 11} \ No newline at end of file diff --git a/test/samples/qa/span/tokenized.json b/test/samples/qa/span/tokenized.json new file mode 100644 index 000000000..9d6ddc609 --- /dev/null +++ b/test/samples/qa/span/tokenized.json @@ -0,0 +1 @@ +{"passage_start_t": 0, "passage_tokens": ["Bey", "on", "c\u00c3\u00a9", "\u0120G", "is", "elle", "\u0120Know", "les", "-", "Carter", "\u0120(/", "bi", "\u00cb", "\u0132", "\u00cb\u012a", "j", "\u00c9", "\u0134", "n", "se", "\u00c9", "\u00aa", "/", "\u0120bee", "-", "Y", "ON", "-", "say", ")", "\u0120(", "born", "\u0120September", "\u01204", ",", "\u01201981", ")", "\u0120is", "\u0120an", "\u0120American", "\u0120singer", ",", "\u0120song", "writer", ",", "\u0120record", "\u0120producer", "\u0120and", "\u0120actress", ".", "\u0120Born", "\u0120and", "\u0120raised", "\u0120in", "\u0120Houston", ",", "\u0120Texas", ",", "\u0120she", "\u0120performed", "\u0120in", "\u0120various", "\u0120singing", "\u0120and", "\u0120dancing", "\u0120competitions", "\u0120as", "\u0120a", "\u0120child", ",", "\u0120and", "\u0120rose", "\u0120to", "\u0120fame", "\u0120in", "\u0120the", "\u0120late", "\u01201990", "s", "\u0120as", "\u0120lead", "\u0120singer", "\u0120of", "\u0120R", "&", "B", "\u0120girl", "-", "group", 
"\u0120Destiny", "'s", "\u0120Child", ".", "\u0120Man", "aged", "\u0120by", "\u0120her", "\u0120father", ",", "\u0120Mat", "hew", "\u0120Know", "les", ",", "\u0120the", "\u0120group", "\u0120became", "\u0120one", "\u0120of", "\u0120the", "\u0120world", "'s", "\u0120best", "-", "selling", "\u0120girl", "\u0120groups", "\u0120of", "\u0120all", "\u0120time", ".", "\u0120Their", "\u0120hiatus", "\u0120saw", "\u0120the", "\u0120release", "\u0120of", "\u0120Beyon", "c\u00c3\u00a9", "'s", "\u0120debut", "\u0120album", ",", "\u0120Danger", "ously", "\u0120in", "\u0120Love", "\u0120(", "2003", "),", "\u0120which", "\u0120established", "\u0120her", "\u0120as", "\u0120a", "\u0120solo", "\u0120artist", "\u0120worldwide", ",", "\u0120earned", "\u0120five", "\u0120Grammy", "\u0120Awards", "\u0120and", "\u0120featured", "\u0120the", "\u0120Billboard", "\u0120Hot", "\u0120100", "\u0120number", "-", "one", "\u0120singles", "\u0120\"", "C", "razy", "\u0120in", "\u0120Love", "\"", "\u0120and", "\u0120\"", "Baby", "\u0120Boy", "\"."], "passage_offsets": [0, 3, 5, 8, 9, 11, 16, 20, 23, 24, 31, 33, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 45, 48, 49, 50, 52, 53, 56, 58, 59, 64, 74, 75, 77, 81, 83, 86, 89, 98, 104, 106, 110, 116, 118, 125, 134, 138, 145, 147, 152, 156, 163, 166, 173, 175, 180, 182, 186, 196, 199, 207, 215, 219, 227, 240, 243, 245, 250, 252, 256, 261, 264, 269, 272, 276, 281, 285, 287, 290, 295, 302, 305, 306, 307, 309, 313, 314, 320, 327, 330, 335, 337, 340, 345, 348, 352, 358, 360, 363, 367, 371, 374, 376, 380, 386, 393, 397, 400, 404, 409, 412, 416, 417, 425, 430, 437, 440, 444, 448, 450, 456, 463, 467, 471, 479, 482, 487, 490, 492, 498, 503, 505, 511, 517, 520, 525, 526, 530, 533, 539, 551, 555, 558, 560, 565, 572, 581, 583, 590, 595, 602, 609, 613, 622, 626, 636, 640, 644, 650, 651, 655, 663, 664, 665, 670, 673, 677, 679, 683, 684, 689, 692], "passage_start_of_word": [1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0], "question_tokens": ["When", "\u0120did", "\u0120Beyon", "ce", "\u0120start", "\u0120becoming", "\u0120popular", "?"], "question_offsets": [0, 5, 9, 14, 17, 23, 32, 39], "question_start_of_word": [1, 1, 1, 0, 1, 1, 1, 0], "answers": [{"start_t": 74, "end_t": 78, "answer_type": "span"}], "document_offsets": [0, 3, 5, 8, 9, 11, 16, 20, 23, 24, 31, 33, 35, 36, 37, 39, 40, 41, 42, 43, 45, 46, 47, 45, 48, 49, 50, 52, 53, 56, 58, 59, 64, 74, 75, 77, 81, 83, 86, 89, 98, 104, 106, 110, 116, 118, 125, 134, 138, 145, 147, 152, 156, 163, 166, 173, 175, 180, 182, 186, 196, 199, 207, 215, 219, 227, 240, 243, 245, 250, 252, 256, 261, 264, 269, 272, 276, 281, 285, 287, 290, 295, 302, 305, 306, 307, 309, 313, 314, 320, 327, 330, 335, 337, 340, 345, 348, 352, 358, 360, 363, 367, 371, 374, 376, 380, 386, 393, 397, 400, 404, 409, 412, 416, 417, 425, 430, 437, 440, 444, 448, 450, 456, 463, 467, 471, 479, 482, 487, 490, 492, 498, 503, 505, 511, 517, 520, 525, 526, 530, 533, 539, 551, 555, 558, 560, 565, 572, 581, 583, 590, 595, 602, 609, 613, 622, 626, 636, 640, 644, 650, 651, 655, 663, 664, 665, 670, 673, 677, 679, 683, 684, 689, 
692]} \ No newline at end of file diff --git a/test/test_input_features.py b/test/test_input_features.py new file mode 100644 index 000000000..bd4273c08 --- /dev/null +++ b/test/test_input_features.py @@ -0,0 +1,45 @@ +import json +import logging + +from farm.data_handler.input_features import sample_to_features_qa +from farm.data_handler.samples import Sample +from farm.modeling.tokenization import Tokenizer + + +MODEL = "roberta-base" +SP_TOKENS_START = 1 +SP_TOKENS_MID = 2 + +def to_list(x): + try: + return x.tolist() + except: + return x + +def test_sample_to_features_qa(caplog): + if caplog: + caplog.set_level(logging.CRITICAL) + + sample_types = ["span", "no_answer"] + + for sample_type in sample_types: + clear_text = json.load(open(f"samples/qa/{sample_type}/clear_text.json")) + tokenized = json.load(open(f"samples/qa/{sample_type}/tokenized.json")) + features_gold = json.load(open(f"samples/qa/{sample_type}/features.json")) + max_seq_len = len(features_gold["input_ids"]) + + tokenizer = Tokenizer.load(pretrained_model_name_or_path=MODEL, do_lower_case=False) + curr_id = "-".join([str(x) for x in features_gold["id"]]) + + s = Sample(id=curr_id, clear_text=clear_text, tokenized=tokenized) + features = sample_to_features_qa(s, tokenizer, max_seq_len, SP_TOKENS_START, SP_TOKENS_MID)[0] + features = to_list(features) + + keys = features_gold.keys() + for k in keys: + value_gold = features_gold[k] + value = to_list(features[k]) + assert value == value_gold, f"Mismatch between the {k} features in the {sample_type} test sample." + +if __name__ == "__main__": + test_sample_to_features_qa(None) From 2867a75150c1c57cd202a9b22b178e9a10ca219d Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 1 Jul 2020 15:11:13 +0200 Subject: [PATCH 29/40] Add docstring --- farm/data_handler/processor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 83572347f..c6529c9de 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1037,6 +1037,12 @@ def estimate_n_samples(self, filepath, max_docs=500): ######################################### class QAProcessor(Processor): + """ + This is class inherits from Processor and is the parent to SquadProcessor and NaturalQuestionsProcessor. + Its main role is to extend the __init__() so that the number of starting, intermediate and end special tokens + are calculated from the tokenizer and store as attributes. 
These are used by the child processors in their + sample_to_features() methods + """ def __init__(self, **kwargs): super().__init__(**kwargs) self.initialize_special_tokens_count() From 3446ff88b1fb7701703bf20b4ca56b99669300f6 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 1 Jul 2020 16:04:43 +0200 Subject: [PATCH 30/40] Fix import and error handling --- farm/data_handler/processor.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index c6529c9de..e5fab49f0 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -40,6 +40,7 @@ get_sequence_pair, join_sentences ) + from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences from farm.utils import MLFlowLogger as MlLogger from farm.utils import try_get @@ -1167,11 +1168,10 @@ def _dicts_to_baskets(self, dicts, indices): def file_to_dicts(self, file: str) -> [dict]: nested_dicts = read_squad_file(filename=file) dicts = [y for x in nested_dicts for y in x["paragraphs"]] - for d in dicts: - check_valid_answer(d) return dicts def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: + check_valid_answer(dictionary) n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) samples = create_samples_qa(dictionary=dictionary, max_query_len=self.max_query_length, @@ -1712,6 +1712,5 @@ def check_valid_answer(dictionary): start = answer["answer_start"] end = answer["answer_start"] + len_answer if context[start: end] != answer["text"]: - raise Exception(f"The answer extracted by start character index does not match the answer string: " - f"\t {context[start: end]} vs {answer['text']}") + raise Exception From 71738d231e3dbc932e362466ccd3734c6c121b2d Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 1 Jul 2020 16:19:40 +0200 Subject: [PATCH 31/40] Fix answer check --- farm/data_handler/input_features.py | 8 +++----- farm/data_handler/processor.py | 20 +++++++++----------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/farm/data_handler/input_features.py b/farm/data_handler/input_features.py index 9a425b360..669f2cad7 100644 --- a/farm/data_handler/input_features.py +++ b/farm/data_handler/input_features.py @@ -70,7 +70,6 @@ def sample_to_features_text( input_ids = pad(input_ids, max_seq_len, tokenizer.pad_token_id, pad_on_left=pad_on_left) padding_mask = pad(padding_mask, max_seq_len, 0, pad_on_left=pad_on_left) - assert len(input_ids) == max_seq_len assert len(padding_mask) == max_seq_len assert len(segment_ids) == max_seq_len @@ -401,10 +400,9 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, sp_toks_start, sp_toks if tokenizer.__class__.__name__ in ["XLMRobertaTokenizer", "RobertaTokenizer"]: segment_ids = np.zeros_like(segment_ids) - # Todo: explain how only the first of labels will be used in train, and the full array will be used in eval - # TODO Offset, start of word and spec_tok_mask are not actually needed by model.forward() but are needed for model.formatted_preds() - # TODO passage_start_t is index of passage's first token relative to document - # I don't think we actually need offsets anymore + # The first of the labels will be used in train, and the full array will be used in eval. 
+ # start of word and spec_tok_mask are not actually needed by model.forward() but are needed for model.formatted_preds() + # passage_start_t is index of passage's first token relative to document feature_dict = {"input_ids": input_ids, "padding_mask": padding_mask, "segment_ids": segment_ids, diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index e5fab49f0..d90db9380 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1171,7 +1171,6 @@ def file_to_dicts(self, file: str) -> [dict]: return dicts def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: - check_valid_answer(dictionary) n_special_tokens = self.tokenizer.num_special_tokens_to_add(pair=True) samples = create_samples_qa(dictionary=dictionary, max_query_len=self.max_query_length, @@ -1181,7 +1180,7 @@ def _dict_to_samples(self, dictionary: dict, **kwargs) -> [Sample]: return samples def _sample_to_features(self, sample) -> dict: - # TODO, make this function return one set of features per sample + check_valid_answer(sample) features = sample_to_features_qa(sample=sample, tokenizer=self.tokenizer, max_seq_len=self.max_seq_len, @@ -1704,13 +1703,12 @@ def is_impossible_to_answer_type(qas): return new_qas -def check_valid_answer(dictionary): - context = dictionary["context"] - for qa in dictionary["qas"]: - for answer in qa["answers"]: - len_answer = len(answer["text"]) - start = answer["answer_start"] - end = answer["answer_start"] + len_answer - if context[start: end] != answer["text"]: - raise Exception +def check_valid_answer(sample): + passage_text = sample.clear_text["passage_text"] + for answer in sample.clear_text["answers"]: + len_answer = len(answer["text"]) + start = answer["start_c"] + end = answer["end_c"] + len_answer + if passage_text[start: end] != answer["text"]: + raise Exception From 41ec4289b27044bb781a79a76ebd07db3a627b8c Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 1 Jul 2020 16:31:33 +0200 Subject: [PATCH 32/40] Fix sample check --- farm/data_handler/processor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index d90db9380..9f3c39825 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1706,9 +1706,12 @@ def is_impossible_to_answer_type(qas): def check_valid_answer(sample): passage_text = sample.clear_text["passage_text"] for answer in sample.clear_text["answers"]: - len_answer = len(answer["text"]) + len_passage = len(passage_text) start = answer["start_c"] - end = answer["end_c"] + len_answer - if passage_text[start: end] != answer["text"]: + end = answer["end_c"] + # Cases where the answer is not within the current passage will be turned into no answers by the featurization fn + if start < 0 or end > len_passage: + continue + if passage_text[start: end + 1] != answer["text"]: raise Exception From 74c6e9c738fd6779ec12fe7bd615583f140e7fd9 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Wed, 1 Jul 2020 17:35:29 +0200 Subject: [PATCH 33/40] move sample check to _sample_to_features --- farm/data_handler/processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 9f3c39825..ca70bebd0 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1474,6 +1474,7 @@ def _convert_tok_to_ch(start_t, end_t, tok_to_ch, doc_text): return start_c, end_c def _sample_to_features(self, sample: Sample) -> dict: + 
check_valid_answer(sample) features = sample_to_features_qa(sample=sample, tokenizer=self.tokenizer, max_seq_len=self.max_seq_len, From 171db3e764885d4866ab8ce272e0ce8e97e21979 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 2 Jul 2020 15:34:05 +0200 Subject: [PATCH 34/40] Pass QA inferencer args properly --- farm/infer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/farm/infer.py b/farm/infer.py index d3cf618f4..af765aebd 100644 --- a/farm/infer.py +++ b/farm/infer.py @@ -632,14 +632,14 @@ def inference_from_dicts(self, return_json=True, multiprocessing_chunksize=None, streaming=False) -> Union[List[QAPred], Generator[QAPred, None, None]]: - return Inferencer.inference_from_dicts(dicts, return_json=True, multiprocessing_chunksize=None, streaming=False) + return Inferencer.inference_from_dicts(self, dicts, return_json=return_json, multiprocessing_chunksize=None, streaming=False) def inference_from_file(self, file, multiprocessing_chunksize=None, streaming=False, return_json=True) -> Union[List[QAPred], Generator[QAPred, None, None]]: - return Inferencer.inference_from_file(file, return_json=True, multiprocessing_chunksize=None, streaming=False) + return Inferencer.inference_from_file(self, file, return_json=return_json, multiprocessing_chunksize=None, streaming=False) class FasttextInferencer: From b4825f058029a22d6434688ff291eead09a213e8 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 2 Jul 2020 15:34:46 +0200 Subject: [PATCH 35/40] Rename span to qa_candidate --- farm/evaluation/metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/farm/evaluation/metrics.py b/farm/evaluation/metrics.py index 311ee934c..702ab9469 100644 --- a/farm/evaluation/metrics.py +++ b/farm/evaluation/metrics.py @@ -142,9 +142,9 @@ def squad_EM(preds, labels): n_docs = len(preds) n_correct = 0 for doc_idx in range(n_docs): - span = preds[doc_idx][0][0] - pred_start = span.offset_answer_start - pred_end = span.offset_answer_end + qa_candidate = preds[doc_idx][0][0] + pred_start = qa_candidate.offset_answer_start + pred_end = qa_candidate.offset_answer_end curr_labels = labels[doc_idx] if (pred_start, pred_end) in curr_labels: n_correct += 1 From fce94c75fda508a06e3d422456f821d6eb4b492f Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 2 Jul 2020 15:43:47 +0200 Subject: [PATCH 36/40] Arg passing error causing Eval bug --- farm/data_handler/processor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index ca70bebd0..8b2b23959 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -38,7 +38,9 @@ split_with_metadata, convert_qa_input_dict, get_sequence_pair, - join_sentences + join_sentences, + SampleError + ) from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences @@ -1185,7 +1187,7 @@ def _sample_to_features(self, sample) -> dict: tokenizer=self.tokenizer, max_seq_len=self.max_seq_len, sp_toks_start=self.sp_toks_start, - sp_toks_mid=self.sp_toks_end) + sp_toks_mid=self.sp_toks_mid) return features class NaturalQuestionsProcessor(QAProcessor): @@ -1713,6 +1715,8 @@ def check_valid_answer(sample): # Cases where the answer is not within the current passage will be turned into no answers by the featurization fn if start < 0 or end > len_passage: continue - if passage_text[start: end + 1] != answer["text"]: - raise Exception + answer_indices = passage_text[start: end + 1] + answer_text = 
answer["text"] + if answer_indices != answer_text: + raise SampleError(f"""Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'""") From 07b84e674d1f95428d5b0163aec66a9fc9e9fb32 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 2 Jul 2020 16:15:31 +0200 Subject: [PATCH 37/40] Fix bug in answer check --- farm/data_handler/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 8b2b23959..78561fb6a 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1713,7 +1713,7 @@ def check_valid_answer(sample): start = answer["start_c"] end = answer["end_c"] # Cases where the answer is not within the current passage will be turned into no answers by the featurization fn - if start < 0 or end > len_passage: + if start < 0 or end >= len_passage: continue answer_indices = passage_text[start: end + 1] answer_text = answer["text"] From 473bf636eae8faee84cb253f0a4be56693d7522f Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 2 Jul 2020 16:22:30 +0200 Subject: [PATCH 38/40] remove import --- farm/data_handler/processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 78561fb6a..21a97985e 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -38,9 +38,7 @@ split_with_metadata, convert_qa_input_dict, get_sequence_pair, - join_sentences, - SampleError - + join_sentences ) from farm.modeling.tokenization import Tokenizer, tokenize_with_metadata, truncate_sequences From ea764002af9c392757a9ca8797ebcf5a439a6323 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 2 Jul 2020 16:45:15 +0200 Subject: [PATCH 39/40] Remove reference to SampleError --- farm/data_handler/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/data_handler/processor.py b/farm/data_handler/processor.py index 21a97985e..8b7260c2a 100644 --- a/farm/data_handler/processor.py +++ b/farm/data_handler/processor.py @@ -1716,5 +1716,5 @@ def check_valid_answer(sample): answer_indices = passage_text[start: end + 1] answer_text = answer["text"] if answer_indices != answer_text: - raise SampleError(f"""Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'""") + raise ValueError(f"""Answer using start/end indices is '{answer_indices}' while gold label text is '{answer_text}'""") From 7d16016050271ce7e547aca7120db82d6c27f2a7 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Thu, 2 Jul 2020 17:19:50 +0200 Subject: [PATCH 40/40] Fix onnx sample --- farm/modeling/adaptive_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/farm/modeling/adaptive_model.py b/farm/modeling/adaptive_model.py index 87d39921d..b1e124d3d 100644 --- a/farm/modeling/adaptive_model.py +++ b/farm/modeling/adaptive_model.py @@ -664,7 +664,7 @@ def convert_to_onnx(self, output_path, opset_version=11, optimize_for=None): { "question": "In what country is Normandy located?", "id": "56ddde6b9a695914005b9628", - "answers": [{"text": "France", "answer_start": 159}], + "answers": [], "is_impossible": False, } ],