diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
index a4a314601..a2fb87c4b 100644
--- a/.github/workflows/cml.yaml
+++ b/.github/workflows/cml.yaml
@@ -23,7 +23,7 @@ jobs:
           MACHINE="cml$(date +%s)"
           docker-machine create \
               --driver amazonec2 \
-              --amazonec2-instance-type p3.2xlarge \
+              --amazonec2-instance-type p3.8xlarge \
               --amazonec2-vpc-id $VPC \
               --amazonec2-region us-east-1 \
               --amazonec2-zone c \
@@ -43,6 +43,7 @@ jobs:
             --gpus all \
             -v /docker_machine/machine:/root/.docker/machine \
             --net host \
+            --ipc host \
             -e DOCKER_MACHINE=$MACHINE \
             -e repo_token=$repo_token \
             -e RUNNER_LABELS=$RUNNER_LABELS \
@@ -67,7 +68,11 @@ jobs:
           apt-get install python3-dev -y
           pip install -r requirements.txt
           pip install .
-          cd test/benchmarks && python question_answering_components.py
-          echo -en "## Benchmarks: QA per component\n" >> report.md
-          cat results_per_component.md >> report.md
-          cml-send-comment report.md
+          cd test/benchmarks && python question_answering_accuracy.py
+          echo -en "## Benchmarks: QA Accuracy\n" >> accuracy_report.md
+          cat results_accuracy.md >> accuracy_report.md
+          cml-send-comment accuracy_report.md
+          python question_answering_components.py
+          echo -en "## Benchmarks: QA per component\n" >> components_report.md
+          cat results_per_component.md >> components_report.md
+          cml-send-comment components_report.md
diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py
index ae83c9d18..55fb9f5df 100644
--- a/farm/modeling/prediction_head.py
+++ b/farm/modeling/prediction_head.py
@@ -1180,6 +1180,7 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx, st
         start_idx_candidates = set()
         end_idx_candidates = set()
 
+        start_matrix_softmax_start = torch.softmax(start_matrix[:, 0], dim=-1)
         # Iterate over all candidates and break when we have all our n_best candidates
         for candidate_idx in range(n_candidates):
             if len(top_candidates) == self.n_best_per_sample:
@@ -1194,7 +1195,6 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx, st
             if self.duplicate_filtering > -1 and (start_idx in start_idx_candidates or end_idx in end_idx_candidates):
                 continue
             score = start_end_matrix[start_idx, end_idx].item()
-            start_matrix_softmax_start = torch.softmax(start_matrix[:, 0], dim=-1)
             confidence = start_matrix_softmax_start[start_idx].item()
             top_candidates.append(QACandidate(offset_answer_start=start_idx,
                                               offset_answer_end=end_idx,
@@ -1722,7 +1722,7 @@ def logits_to_loss(self, logits: Tuple[torch.Tensor, torch.Tensor], label_ids, *
         # Check if DDP is initialized
         try:
             rank = torch.distributed.get_rank()
-        except AssertionError:
+        except (AssertionError, RuntimeError):
             rank = -1
 
         # Prepare predicted scores
diff --git a/requirements.txt b/requirements.txt
index da91a7f99..6147860f6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ wheel
 # PyTorch
 # Temp. disabled the next line as it gets currently resolved to https://download.pytorch.org/whl/rocm3.8/torch-1.7.1%2Brocm3.8-cp38-cp38-linux_x86_64.whl
 #--find-links=https://download.pytorch.org/whl/torch_stable.html
-torch>1.5,<1.8
+torch>1.5,<1.9
 # progress bars in model download and training scripts
 tqdm
 # Accessing files from S3 directly.
diff --git a/test/benchmarks/question_answering_accuracy.py b/test/benchmarks/question_answering_accuracy.py
index ba96f0815..39ba9d9df 100644
--- a/test/benchmarks/question_answering_accuracy.py
+++ b/test/benchmarks/question_answering_accuracy.py
@@ -4,6 +4,8 @@
 from time import time
 
 import numpy as np
+from pprint import pformat
+import pandas as pd
 from dotmap import DotMap
 
 from farm.data_handler.data_silo import DataSilo
@@ -20,6 +22,10 @@ from farm.train import Trainer
 from farm.utils import set_all_seeds, initialize_device_settings
 
+logger = logging.getLogger(__name__)
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+n_gpu_factor=4
+error_messages = []
 
 def test_evaluation():
     ##########################
@@ -28,7 +34,7 @@
     lang_model = "deepset/roberta-base-squad2"
     do_lower_case = False
-    test_assertions = True
+    test_assertions = False
 
     data_dir = Path("testsave/data/squad20")
     evaluation_filename = "dev-v2.0.json"
@@ -39,6 +45,7 @@
     model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
     model.prediction_heads[0].no_ans_boost = 0
     model.prediction_heads[0].n_best = 1
+    model.prediction_heads[0].n_best_per_sample = 1
 
     tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,do_lower_case=do_lower_case)
     processor = SquadProcessor(
@@ -56,7 +63,7 @@
     starttime = time()
 
-    data_silo = DataSilo(processor=processor, batch_size=40*4)
+    data_silo = DataSilo(processor=processor, batch_size=40*n_gpu_factor)
     model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
     model, _ = optimize_model(model=model, device=device, local_rank=-1, optimizer=None, distributed=False, use_amp=None)
 
@@ -64,27 +71,51 @@
     # 1. Test FARM internal evaluation
     results = evaluator.eval(model)
-    f1_score = results[0]["f1"]
-    em_score = results[0]["EM"]
-    tnacc = results[0]["top_n_accuracy"]
+    f1_score = results[0]["f1"] * 100
+    em_score = results[0]["EM"] * 100
+    tnacc = results[0]["top_n_accuracy"] * 100
     elapsed = time() - starttime
     print(results)
     print(elapsed)
 
-    gold_EM = 0.784721
-    gold_f1 = 0.826671
-    gold_tnacc = 0.843594 # top 1 recall
+    gold_EM = 78.4721
+    gold_f1 = 82.6671
+    gold_tnacc = 84.3594 # top 1 recall
     gold_elapsed = 40 # 4x V100
     if test_assertions:
         np.testing.assert_allclose(em_score, gold_EM, rtol=0.001, err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
         np.testing.assert_allclose(f1_score, gold_f1, rtol=0.001, err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
-        np.testing.assert_allclose(tnacc, gold_tnacc, rtol=0.001, err_msg=f"FARM Eval changed for top 1 accuracy by: {em_score-gold_EM}")
+        np.testing.assert_allclose(tnacc, gold_tnacc, rtol=0.001, err_msg=f"FARM Eval changed for top 1 accuracy by: {tnacc-gold_tnacc}")
         np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if not np.allclose(f1_score, gold_f1, rtol=0.001):
+        error_messages.append(f"FARM Eval changed for f1 score by: {round(f1_score - gold_f1, 4)}")
+    if not np.allclose(em_score, gold_EM, rtol=0.001):
+        error_messages.append(f"FARM Eval changed for EM by: {round(em_score - gold_EM, 4)}")
+    if not np.allclose(tnacc, gold_tnacc, rtol=0.001):
+        error_messages.append(f"FARM Eval changed for top 1 accuracy by: {round(tnacc-gold_tnacc, 4)}")
+    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
+        error_messages.append(f"FARM Eval speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")
+
+    benchmark_result = [{ "run": "FARM internal evaluation",
+                          "f1_change": round(f1_score - gold_f1, 4),
+                          "em_change": round(em_score - gold_EM, 4),
+                          "tnacc_change": round(tnacc - gold_tnacc, 4),
+                          "elapsed_change": round(elapsed - gold_elapsed, 4),
+                          "f1": f1_score,
+                          "em": em_score,
+                          "tnacc": round(tnacc, 4),
+                          "elapsed": elapsed,
+                          "f1_gold": gold_f1,
+                          "em_gold": gold_EM,
+                          "tnacc_gold": gold_tnacc,
+                          "elapsed_gold": gold_elapsed
+                          }]
+    logger.info("\n\n" + pformat(benchmark_result[0]) + "\n")
 
     #
     # 2. Test FARM predictions with outside eval script
     starttime = time()
-    model = Inferencer(model=model, processor=processor, task_type="question_answering", batch_size=40*4, gpu=device.type=="cuda")
+    model = Inferencer(model=model, processor=processor, task_type="question_answering", batch_size=40*n_gpu_factor, gpu=device.type=="cuda")
     filename = data_dir / evaluation_filename
     result = model.inference_from_file(file=filename, return_json=False, multiprocessing_chunksize=80)
     results_squad = [x.to_squad_eval() for x in result]
@@ -120,6 +151,29 @@
                                err_msg=f"Eval with official script changed for f1 score by: {f1_score - gold_f1}")
     np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if not np.allclose(f1_score, gold_f1, rtol=0.001):
+        error_messages.append(f"Eval with official script changed for f1 score by: {round(f1_score - gold_f1, 4)}")
+    if not np.allclose(em_score, gold_EM, rtol=0.001):
+        error_messages.append(f"Eval with official script changed for EM by: {round(em_score - gold_EM, 4)}")
+    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
+        error_messages.append(f"Inference speed changed significantly by: {round(elapsed - gold_elapsed,4)} seconds")
+
+    benchmark_result.append({"run": "outside eval script",
+                             "f1_change": round(f1_score - gold_f1, 4),
+                             "em_change": round(em_score - gold_EM, 4),
+                             "tnacc_change": "-",
+                             "elapsed_change": round(elapsed - gold_elapsed, 4),
+                             "f1": f1_score,
+                             "em": em_score,
+                             "tnacc": "-",
+                             "elapsed": elapsed,
+                             "f1_gold": gold_f1,
+                             "em_gold": gold_EM,
+                             "tnacc_gold": "-",
+                             "elapsed_gold": gold_elapsed
+                             })
+    logger.info("\n\n" + pformat(benchmark_result[1]) + "\n")
+    return benchmark_result
 
 
 def train_evaluation_single(seed=42):
@@ -130,11 +184,12 @@
     device, n_gpu = initialize_device_settings(use_cuda=True)
     # GPU utilization on 4x V100
     # 40*4, 14.3/16GB on master, 12.6/16 on others
-    batch_size = 40*4
+    batch_size = 40*n_gpu_factor
     n_epochs = 2
     evaluate_every = 2000000 # disabling dev eval
    lang_model = "roberta-base"
     do_lower_case = False # roberta is a cased model
+    test_assertions = False
 
     train_filename = "train-v2.0.json"
     dev_filename = "dev-v2.0.json"
@@ -153,9 +208,9 @@
         test_filename=None,
         data_dir=Path("testsave/data/squad20"),
     )
-    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
+    data_silo = DataSilo(processor=processor, batch_size=batch_size)
     language_model = LanguageModel.load(lang_model)
-    prediction_head = QuestionAnsweringHead(n_best=5)
+    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
     model = AdaptiveModel(
         language_model=language_model,
         prediction_heads=[prediction_head],
@@ -202,20 +257,58 @@
     gold_f1 = 82.155
-    gold_EM = 77.714
-    gold_tnrecall = 97.3721 #
+    gold_EM = 78.6575#77.714
+    gold_tnrecall = 97.3721
     gold_elapsed = 1135
-    np.testing.assert_allclose(f1_score, gold_f1, rtol=0.01,
-                               err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
-    np.testing.assert_allclose(em_score, gold_EM, rtol=0.01,
-                               err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
-    np.testing.assert_allclose(tnacc, gold_tnrecall, rtol=0.01,
-                               err_msg=f"FARM Training changed for top 5 accuracy by: {em_score - gold_EM}")
-    np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if test_assertions:
+        np.testing.assert_allclose(f1_score, gold_f1, rtol=0.01,
+                                   err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
+        np.testing.assert_allclose(em_score, gold_EM, rtol=0.01,
+                                   err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
+        np.testing.assert_allclose(tnacc, gold_tnrecall, rtol=0.01,
+                                   err_msg=f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}")
+        np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if not np.allclose(f1_score, gold_f1, rtol=0.01):
+        error_messages.append(f"FARM Training changed for f1 score by: {round(f1_score - gold_f1, 4)}")
+    if not np.allclose(em_score, gold_EM, rtol=0.01):
+        error_messages.append(f"FARM Training changed for EM by: {round(em_score - gold_EM, 4)}")
+    if not np.allclose(tnacc, gold_tnrecall, rtol=0.01):
+        error_messages.append(f"FARM Training changed for top 5 accuracy by: {round(tnacc - gold_tnrecall, 4)}")
+    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
+        error_messages.append(f"FARM Training speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")
+
+    benchmark_result = [{"run": "train evaluation",
+                         "f1_change": round(f1_score - gold_f1, 4),
+                         "em_change": round(em_score - gold_EM, 4),
+                         "tnacc_change": round(tnacc - gold_tnrecall, 4),
+                         "elapsed_change": round(elapsed - gold_elapsed, 4),
+                         "f1": f1_score,
+                         "em": em_score,
+                         "tnacc": round(tnacc, 4),
+                         "elapsed": elapsed,
+                         "f1_gold": gold_f1,
+                         "em_gold": gold_EM,
+                         "tnacc_gold": gold_tnrecall,
+                         "elapsed_gold": gold_elapsed
+                         }]
+    logger.info("\n\n" + pformat(benchmark_result) + "\n")
+    return benchmark_result
 
 
 if __name__ == "__main__":
-    logging.disable(logging.WARNING)
-
-    test_evaluation()
+    logger.info("QA Accuracy Benchmark")
+    benchmark_results = []
+    benchmark_results.extend(test_evaluation())
+    benchmark_results.extend(train_evaluation_single(seed=42))
+
+    output_file = f"results_accuracy.csv"
+    df = pd.DataFrame.from_records(benchmark_results)
+    df.to_csv(output_file)
+    with open(output_file.replace(".csv", ".md"), "w") as f:
+        if error_messages:
+            f.write("### :warning: QA Accuracy Benchmark Failed\n")
+            for error_message in error_messages:
+                f.write(error_message+"\n")
+        else:
+            f.write("### :heavy_check_mark: QA Accuracy Benchmark Passed\n")
+        f.write(str(df.to_markdown()))
-    train_evaluation_single(seed=42)
diff --git a/test/test_ner_amp.py b/test/test_ner_amp.py
index e2fd50962..252aa718d 100644
--- a/test/test_ner_amp.py
+++ b/test/test_ner_amp.py
@@ -94,7 +94,7 @@ def test_ner_amp(caplog):
     assert result[0]["predictions"][0][0]["context"] == "Crown"
     assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32)
-    assert result[0]["predictions"][0][0]["probability"] > 0.124
+    assert np.isclose(result[0]["predictions"][0][0]["probability"], 0.124, rtol=0.05)
     assert result[0]["predictions"][0][0]["label"] == "OTH"