diff --git a/.github/workflows/cml.yaml b/.github/workflows/cml.yaml
index a4a314601..a2fb87c4b 100644
--- a/.github/workflows/cml.yaml
+++ b/.github/workflows/cml.yaml
@@ -23,7 +23,7 @@ jobs:
           MACHINE="cml$(date +%s)"
           docker-machine create \
               --driver amazonec2 \
-              --amazonec2-instance-type p3.2xlarge \
+              --amazonec2-instance-type p3.8xlarge \
               --amazonec2-vpc-id $VPC \
               --amazonec2-region us-east-1 \
               --amazonec2-zone c \
@@ -43,6 +43,7 @@ jobs:
             --gpus all \
             -v /docker_machine/machine:/root/.docker/machine \
             --net host \
+            --ipc host \
             -e DOCKER_MACHINE=$MACHINE \
             -e repo_token=$repo_token \
             -e RUNNER_LABELS=$RUNNER_LABELS \
@@ -67,7 +68,11 @@ jobs:
           apt-get install python3-dev -y
           pip install -r requirements.txt
           pip install .
-          cd test/benchmarks && python question_answering_components.py
-          echo -en "## Benchmarks: QA per component\n" >> report.md
-          cat results_per_component.md >> report.md
-          cml-send-comment report.md
+          cd test/benchmarks && python question_answering_accuracy.py
+          echo -en "## Benchmarks: QA Accuracy\n" >> accuracy_report.md
+          cat results_accuracy.md >> accuracy_report.md
+          cml-send-comment accuracy_report.md
+          python question_answering_components.py
+          echo -en "## Benchmarks: QA per component\n" >> components_report.md
+          cat results_per_component.md >> components_report.md
+          cml-send-comment components_report.md
diff --git a/farm/modeling/prediction_head.py b/farm/modeling/prediction_head.py
index ae83c9d18..55fb9f5df 100644
--- a/farm/modeling/prediction_head.py
+++ b/farm/modeling/prediction_head.py
@@ -1180,6 +1180,7 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx, st
         start_idx_candidates = set()
         end_idx_candidates = set()
 
+        start_matrix_softmax_start = torch.softmax(start_matrix[:, 0], dim=-1)
         # Iterate over all candidates and break when we have all our n_best candidates
         for candidate_idx in range(n_candidates):
             if len(top_candidates) == self.n_best_per_sample:
@@ -1194,7 +1195,6 @@ def get_top_candidates(self, sorted_candidates, start_end_matrix, sample_idx, st
             if self.duplicate_filtering > -1 and (start_idx in start_idx_candidates or end_idx in end_idx_candidates):
                 continue
             score = start_end_matrix[start_idx, end_idx].item()
-            start_matrix_softmax_start = torch.softmax(start_matrix[:, 0], dim=-1)
             confidence = start_matrix_softmax_start[start_idx].item()
             top_candidates.append(QACandidate(offset_answer_start=start_idx,
                                               offset_answer_end=end_idx,
@@ -1722,7 +1722,7 @@ def logits_to_loss(self, logits: Tuple[torch.Tensor, torch.Tensor], label_ids, *
         # Check if DDP is initialized
         try:
             rank = torch.distributed.get_rank()
-        except AssertionError:
+        except (AssertionError, RuntimeError):
             rank = -1
 
         # Prepare predicted scores
diff --git a/requirements.txt b/requirements.txt
index da91a7f99..6147860f6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ wheel
 # PyTorch
 # Temp. disabled the next line as it gets currently resolved to https://download.pytorch.org/whl/rocm3.8/torch-1.7.1%2Brocm3.8-cp38-cp38-linux_x86_64.whl
 #--find-links=https://download.pytorch.org/whl/torch_stable.html
-torch>1.5,<1.8
+torch>1.5,<1.9
 # progress bars in model download and training scripts
 tqdm
 # Accessing files from S3 directly.
diff --git a/test/benchmarks/question_answering_accuracy.py b/test/benchmarks/question_answering_accuracy.py
index ba96f0815..39ba9d9df 100644
--- a/test/benchmarks/question_answering_accuracy.py
+++ b/test/benchmarks/question_answering_accuracy.py
@@ -4,6 +4,8 @@
 from time import time
 
 import numpy as np
+from pprint import pformat
+import pandas as pd
 from dotmap import DotMap
 
 from farm.data_handler.data_silo import DataSilo
@@ -20,6 +22,10 @@ from farm.train import Trainer
 from farm.utils import set_all_seeds, initialize_device_settings
 
+logger = logging.getLogger(__name__)
+os.environ["TOKENIZERS_PARALLELISM"] = "true"
+n_gpu_factor=4
+error_messages = []
 
 def test_evaluation():
     ##########################
@@ -28,7 +34,7 @@
     lang_model = "deepset/roberta-base-squad2"
     do_lower_case = False
-    test_assertions = True
+    test_assertions = False
 
     data_dir = Path("testsave/data/squad20")
     evaluation_filename = "dev-v2.0.json"
@@ -39,6 +45,7 @@
     model = AdaptiveModel.convert_from_transformers(lang_model, device=device, task_type="question_answering")
     model.prediction_heads[0].no_ans_boost = 0
     model.prediction_heads[0].n_best = 1
+    model.prediction_heads[0].n_best_per_sample = 1
 
     tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,do_lower_case=do_lower_case)
     processor = SquadProcessor(
@@ -56,7 +63,7 @@
     starttime = time()
 
-    data_silo = DataSilo(processor=processor, batch_size=40*4)
+    data_silo = DataSilo(processor=processor, batch_size=40*n_gpu_factor)
     model.connect_heads_with_processor(data_silo.processor.tasks, require_labels=True)
     model, _ = optimize_model(model=model, device=device, local_rank=-1, optimizer=None, distributed=False, use_amp=None)
 
@@ -64,27 +71,51 @@
     # 1. Test FARM internal evaluation
     results = evaluator.eval(model)
-    f1_score = results[0]["f1"]
-    em_score = results[0]["EM"]
-    tnacc = results[0]["top_n_accuracy"]
+    f1_score = results[0]["f1"] * 100
+    em_score = results[0]["EM"] * 100
+    tnacc = results[0]["top_n_accuracy"] * 100
     elapsed = time() - starttime
     print(results)
     print(elapsed)
 
-    gold_EM = 0.784721
-    gold_f1 = 0.826671
-    gold_tnacc = 0.843594 # top 1 recall
+    gold_EM = 78.4721
+    gold_f1 = 82.6671
+    gold_tnacc = 84.3594 # top 1 recall
     gold_elapsed = 40 # 4x V100
     if test_assertions:
         np.testing.assert_allclose(em_score, gold_EM, rtol=0.001, err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
         np.testing.assert_allclose(f1_score, gold_f1, rtol=0.001, err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
-        np.testing.assert_allclose(tnacc, gold_tnacc, rtol=0.001, err_msg=f"FARM Eval changed for top 1 accuracy by: {em_score-gold_EM}")
+        np.testing.assert_allclose(tnacc, gold_tnacc, rtol=0.001, err_msg=f"FARM Eval changed for top 1 accuracy by: {tnacc-gold_tnacc}")
         np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if not np.allclose(f1_score, gold_f1, rtol=0.001):
+        error_messages.append(f"FARM Eval changed for f1 score by: {round(f1_score - gold_f1, 4)}")
+    if not np.allclose(em_score, gold_EM, rtol=0.001):
+        error_messages.append(f"FARM Eval changed for EM by: {round(em_score - gold_EM, 4)}")
+    if not np.allclose(tnacc, gold_tnacc, rtol=0.001):
+        error_messages.append(f"FARM Eval changed for top 1 accuracy by: {round(tnacc-gold_tnacc, 4)}")
+    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
+        error_messages.append(f"FARM Eval speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")
+
+    benchmark_result = [{ "run": "FARM internal evaluation",
+                          "f1_change": round(f1_score - gold_f1, 4),
+                          "em_change": round(em_score - gold_EM, 4),
+                          "tnacc_change": round(tnacc - gold_tnacc, 4),
+                          "elapsed_change": round(elapsed - gold_elapsed, 4),
+                          "f1": f1_score,
+                          "em": em_score,
+                          "tnacc": round(tnacc, 4),
+                          "elapsed": elapsed,
+                          "f1_gold": gold_f1,
+                          "em_gold": gold_EM,
+                          "tnacc_gold": gold_tnacc,
+                          "elapsed_gold": gold_elapsed
+                          }]
+    logger.info("\n\n" + pformat(benchmark_result[0]) + "\n")
 
     #
     # 2. Test FARM predictions with outside eval script
     starttime = time()
-    model = Inferencer(model=model, processor=processor, task_type="question_answering", batch_size=40*4, gpu=device.type=="cuda")
+    model = Inferencer(model=model, processor=processor, task_type="question_answering", batch_size=40*n_gpu_factor, gpu=device.type=="cuda")
     filename = data_dir / evaluation_filename
     result = model.inference_from_file(file=filename, return_json=False, multiprocessing_chunksize=80)
     results_squad = [x.to_squad_eval() for x in result]
@@ -120,6 +151,29 @@
                                err_msg=f"Eval with official script changed for f1 score by: {f1_score - gold_f1}")
     np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if not np.allclose(f1_score, gold_f1, rtol=0.001):
+        error_messages.append(f"Eval with official script changed for f1 score by: {round(f1_score - gold_f1, 4)}")
+    if not np.allclose(em_score, gold_EM, rtol=0.001):
+        error_messages.append(f"Eval with official script changed for EM by: {round(em_score - gold_EM, 4)}")
+    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
+        error_messages.append(f"Inference speed changed significantly by: {round(elapsed - gold_elapsed,4)} seconds")
+
+    benchmark_result.append({"run": "outside eval script",
+                             "f1_change": round(f1_score - gold_f1, 4),
+                             "em_change": round(em_score - gold_EM, 4),
+                             "tnacc_change": "-",
+                             "elapsed_change": round(elapsed - gold_elapsed, 4),
+                             "f1": f1_score,
+                             "em": em_score,
+                             "tnacc": "-",
+                             "elapsed": elapsed,
+                             "f1_gold": gold_f1,
+                             "em_gold": gold_EM,
+                             "tnacc_gold": "-",
+                             "elapsed_gold": gold_elapsed
+                             })
+    logger.info("\n\n" + pformat(benchmark_result[1]) + "\n")
+    return benchmark_result
 
 
 def train_evaluation_single(seed=42):
@@ -130,11 +184,12 @@
     device, n_gpu = initialize_device_settings(use_cuda=True)
     # GPU utilization on 4x V100
     # 40*4, 14.3/16GB on master, 12.6/16 on others
-    batch_size = 40*4
+    batch_size = 40*n_gpu_factor
     n_epochs = 2
     evaluate_every = 2000000 # disabling dev eval
    lang_model = "roberta-base"
     do_lower_case = False # roberta is a cased model
+    test_assertions = False
 
     train_filename = "train-v2.0.json"
     dev_filename = "dev-v2.0.json"
@@ -153,9 +208,9 @@
         test_filename=None,
         data_dir=Path("testsave/data/squad20"),
     )
-    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
+    data_silo = DataSilo(processor=processor, batch_size=batch_size)
     language_model = LanguageModel.load(lang_model)
-    prediction_head = QuestionAnsweringHead(n_best=5)
+    prediction_head = QuestionAnsweringHead(n_best=5, n_best_per_sample=1)
     model = AdaptiveModel(
         language_model=language_model,
         prediction_heads=[prediction_head],
@@ -202,20 +257,58 @@
     gold_f1 = 82.155
-    gold_EM = 77.714
-    gold_tnrecall = 97.3721 #
+    gold_EM = 78.6575#77.714
+    gold_tnrecall = 97.3721
     gold_elapsed = 1135
-    np.testing.assert_allclose(f1_score, gold_f1, rtol=0.01,
-                               err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
-    np.testing.assert_allclose(em_score, gold_EM, rtol=0.01,
-                               err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
-    np.testing.assert_allclose(tnacc, gold_tnrecall, rtol=0.01,
-                               err_msg=f"FARM Training changed for top 5 accuracy by: {em_score - gold_EM}")
-    np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if test_assertions:
+        np.testing.assert_allclose(f1_score, gold_f1, rtol=0.01,
+                                   err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
+        np.testing.assert_allclose(em_score, gold_EM, rtol=0.01,
+                                   err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
+        np.testing.assert_allclose(tnacc, gold_tnrecall, rtol=0.01,
+                                   err_msg=f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}")
+        np.testing.assert_allclose(elapsed, gold_elapsed, rtol=0.1, err_msg=f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds")
+    if not np.allclose(f1_score, gold_f1, rtol=0.01):
+        error_messages.append(f"FARM Training changed for f1 score by: {round(f1_score - gold_f1, 4)}")
+    if not np.allclose(em_score, gold_EM, rtol=0.01):
+        error_messages.append(f"FARM Training changed for EM by: {round(em_score - gold_EM, 4)}")
+    if not np.allclose(tnacc, gold_tnrecall, rtol=0.01):
+        error_messages.append(f"FARM Training changed for top 5 accuracy by: {round(tnacc - gold_tnrecall, 4)}")
+    if not np.allclose(elapsed, gold_elapsed, rtol=0.1):
+        error_messages.append(f"FARM Training speed changed significantly by: {round(elapsed - gold_elapsed, 4)} seconds")
+
+    benchmark_result = [{"run": "train evaluation",
+                         "f1_change": round(f1_score - gold_f1, 4),
+                         "em_change": round(em_score - gold_EM, 4),
+                         "tnacc_change": round(tnacc - gold_tnrecall, 4),
+                         "elapsed_change": round(elapsed - gold_elapsed, 4),
+                         "f1": f1_score,
+                         "em": em_score,
+                         "tnacc": round(tnacc, 4),
+                         "elapsed": elapsed,
+                         "f1_gold": gold_f1,
+                         "em_gold": gold_EM,
+                         "tnacc_gold": gold_tnrecall,
+                         "elapsed_gold": gold_elapsed
+                         }]
+    logger.info("\n\n" + pformat(benchmark_result) + "\n")
+    return benchmark_result
 
 
 if __name__ == "__main__":
-    logging.disable(logging.WARNING)
-
-    test_evaluation()
+    logger.info("QA Accuracy Benchmark")
+    benchmark_results = []
+    benchmark_results.extend(test_evaluation())
+    benchmark_results.extend(train_evaluation_single(seed=42))
+
+    output_file = f"results_accuracy.csv"
+    df = pd.DataFrame.from_records(benchmark_results)
+    df.to_csv(output_file)
+    with open(output_file.replace(".csv", ".md"), "w") as f:
+        if error_messages:
+            f.write("### :warning: QA Accuracy Benchmark Failed\n")
+            for error_message in error_messages:
+                f.write(error_message+"\n")
+        else:
+            f.write("### :heavy_check_mark: QA Accuracy Benchmark Passed\n")
+        f.write(str(df.to_markdown()))
-    train_evaluation_single(seed=42)
diff --git a/test/test_ner_amp.py b/test/test_ner_amp.py
index e2fd50962..252aa718d 100644
--- a/test/test_ner_amp.py
+++ b/test/test_ner_amp.py
@@ -94,7 +94,7 @@ def test_ner_amp(caplog):
     assert result[0]["predictions"][0][0]["context"] == "Crown"
     assert isinstance(result[0]["predictions"][0][0]["probability"], np.float32)
-    assert result[0]["predictions"][0][0]["probability"] > 0.124
+    assert np.isclose(result[0]["predictions"][0][0]["probability"], 0.124, rtol=0.05)
     assert result[0]["predictions"][0][0]["label"] == "OTH"