From b010a8a20697b5816248018b658dbcf1afaf6d4f Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 09:16:58 +0100 Subject: [PATCH 01/22] Update config --- .../classification/CovidHierarchicalModel.py | 86 +++++++++---------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index b4815f38a..ace411a53 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -1,6 +1,4 @@ -import codecs import logging -import pickle import random import math from pathlib import Path @@ -8,6 +6,7 @@ from typing import Any, Callable import PIL +import numpy as np import pandas as pd import param import torch @@ -16,6 +15,7 @@ from torchvision.transforms import Compose from InnerEye.Common.common_util import ModelProcessing, get_best_epoch_results_path +from InnerEye.Common.metrics_constants import LoggingColumns from InnerEye.ML.SSL.lightning_containers.ssl_container import EncoderName @@ -32,14 +32,12 @@ from InnerEye.ML.model_testing import MODEL_OUTPUT_CSV from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImagingFeatureType -from InnerEye.ML.reports.notebook_report import generate_notebook, get_ipynb_report_name, str_or_empty from InnerEye.ML.scalar_config import ScalarLoss, ScalarModelBase from InnerEye.ML.utils.run_recovery import RunRecovery from InnerEye.ML.utils.split_dataset import DatasetSplits from InnerEye.ML.configs.ssl.CovidContainers import COVID_DATASET_ID -from InnerEye.Common import fixed_paths as fixed_paths_innereye class CovidHierarchicalModel(ScalarModelBase): @@ -64,8 +62,7 @@ class CovidHierarchicalModel(ScalarModelBase): "is assumed to contain unique ids.") def __init__(self, covid_dataset_id: str = COVID_DATASET_ID, **kwargs: Any): - super().__init__(target_names=['CVX03vs12', 'CVX0vs3', 'CVX1vs2'], - loss_type=ScalarLoss.CustomClassification, + super().__init__(loss_type=ScalarLoss.CustomClassification, class_names=['CVX0', 'CVX1', 'CVX2', 'CVX3'], max_num_gpus=1, azure_dataset_id=covid_dataset_id, @@ -84,7 +81,7 @@ def __init__(self, covid_dataset_id: str = COVID_DATASET_ID, **kwargs: Any): l_rate_step_gamma=1.0, l_rate_multi_step_milestones=None, should_validate=False) # validate only after adding kwargs - self.num_classes = 3 + self.num_classes = 4 self.add_and_validate(kwargs) def validate(self) -> None: @@ -192,39 +189,24 @@ def _get_ssl_checkpoint_path(self) -> Path: def pre_process_dataset_dataframe(self) -> None: pass - @staticmethod - def get_posthoc_label_transform() -> Callable: - import torch - - def multiclass_to_hierarchical_labels(classes: torch.Tensor) -> torch.Tensor: - classes = classes.clone() - cvx03vs12 = classes[..., 1] + classes[..., 2] - cvx0vs3 = classes[..., 3] - cvx1vs2 = classes[..., 2] - cvx0vs3[cvx03vs12 == 1] = float('nan') # CVX0vs3 only gets gradient for CVX03 - cvx1vs2[cvx03vs12 == 0] = float('nan') # CVX1vs2 only gets gradient for CVX12 - return torch.stack([cvx03vs12, cvx0vs3, cvx1vs2], -1) - - return multiclass_to_hierarchical_labels - @staticmethod def get_loss_function() -> Callable: import torch import torch.nn.functional as F - def nan_bce_with_logits(output: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: - """Compute BCE with logits, ignoring NaN values""" - valid = labels.isfinite() - losses = 
F.binary_cross_entropy_with_logits(output[valid], labels[valid], reduction='none') - return losses.sum() / labels.shape[0] + def custom_loss(output: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: + labels = torch.argmax(labels, dim=-1) + return F.cross_entropy(input=output, target=labels, reduction="sum") + + return custom_loss - return nan_bce_with_logits + def get_post_loss_logits_normalization_function(self) -> Callable: + return torch.nn.Softmax() def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path: """ - Generate a custom report for the CovidDataset Hierarchical model. At the moment, this report will read the - file model_output.csv generated for the training, validation or test sets and compute a 4 class accuracy - and confusion matrix based on this. + Generate a custom report for the Covid model. This report will read the file model_output.csv generated for + the training, validation or test sets and compute a 4 class accuracy and confusion matrix based on this. :param report_dir: Directory report is to be written to :param model_proc: Whether this is a single or ensemble model (model_output.csv will be located in different paths for single vs ensemble runs.) @@ -234,24 +216,38 @@ def get_output_csv_path(mode: ModelExecutionMode) -> Path: p = get_best_epoch_results_path(mode=mode, model_proc=model_proc) return self.outputs_folder / p / MODEL_OUTPUT_CSV + def get_labels_and_predictions(df: pd.DataFrame) -> pd.DataFrame: + labels = [] + predictions = [] + for i, target in enumerate(self.target_names): + predictions.append(df[df[LoggingColumns.Hue.value] == target][LoggingColumns.ModelOutput.value].item()) + labels.append(df[df[LoggingColumns.Hue.value] == target][LoggingColumns.Label.value]) + + return pd.DataFrame.from_dict({LoggingColumns.Patient.value: [df.iloc[0][LoggingColumns.Patient.value]], + LoggingColumns.ModelOutput.value: [np.argmax(predictions)], + LoggingColumns.Label.value: [np.argmax(labels)]}) + + def get_accuracy(df): + df = df.groupby(LoggingColumns.Patient.value, as_index=False).apply(get_labels_and_predictions).reset_index( + drop=True) + df["tp+tn"] = df.apply( + lambda x: 1 if x[LoggingColumns.ModelOutput.value] == x[LoggingColumns.Label.value] else 0, axis=1) + return np.sum(df["tp+tn"].values) / len(df) + train_metrics = get_output_csv_path(ModelExecutionMode.TRAIN) val_metrics = get_output_csv_path(ModelExecutionMode.VAL) test_metrics = get_output_csv_path(ModelExecutionMode.TEST) - notebook_params = \ - { - 'innereye_path': str(fixed_paths_innereye.repository_root_directory()), - 'train_metrics_csv': str_or_empty(train_metrics), - 'val_metrics_csv': str_or_empty(val_metrics), - 'test_metrics_csv': str_or_empty(test_metrics), - "config": codecs.encode(pickle.dumps(self), "base64").decode(), - "is_crossval_report": False - } - template = Path(__file__).absolute().parent.parent / "reports" / "CovidHierarchicalModelReport.ipynb" - return generate_notebook(template, - notebook_params=notebook_params, - result_notebook=report_dir / get_ipynb_report_name( - f"{self.model_category.value}_hierarchical")) + msg = f"Multiclass Accuracy Train: {get_accuracy(pd.read_csv(train_metrics))}\n" if train_metrics.exists() else "" + msg += f"Multiclass Accuracy Val: {get_accuracy(pd.read_csv(val_metrics))}\n" if val_metrics.exists() else "" + msg += f"Multiclass Accuracy Test: {get_accuracy(pd.read_csv(test_metrics))}\n" if test_metrics.exists() else "" + + report = report_dir / "report.txt" + report.write_text(msg) + + logging.info(msg) 
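
A note on the loss hunk earlier in this patch: the NaN-masked hierarchical BCE is replaced by a plain 4-class cross-entropy — the one-hot labels are collapsed with argmax and passed to F.cross_entropy, with torch.nn.Softmax() as the post-loss normalization. A minimal standalone sketch of that behaviour (toy tensors only, not the model's real outputs):

```python
import torch
import torch.nn.functional as F

# Two subjects, four classes (CVX0..CVX3); labels arrive one-hot encoded.
logits = torch.tensor([[2.0, 0.5, -1.0, 0.1],
                       [0.2, 0.1, 3.0, -0.5]])
one_hot = torch.tensor([[1., 0., 0., 0.],
                        [0., 0., 1., 0.]])

targets = torch.argmax(one_hot, dim=-1)                    # tensor([0, 2])
loss = F.cross_entropy(logits, targets, reduction="sum")   # summed over the batch, not averaged
posteriors = torch.softmax(logits, dim=-1)                 # dim made explicit here; the patch's
                                                           # nn.Softmax() relies on the implicit-dim
                                                           # default, which newer PyTorch warns about
```

Note that reduction="sum" sums the per-subject losses, whereas the old BCE divided by the batch size, so raw loss magnitudes are not directly comparable between the two formulations.
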
+ + return report class DicomPreparation: From 6292d3997e45281d7cdae2ad5a65d438b509f70d Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 09:23:15 +0100 Subject: [PATCH 02/22] Refactor --- ...ovidHierarchicalModel.py => CovidModel.py} | 2 +- .../CovidHierarchicalModelReport.ipynb | 160 ------------------ .../covid_hierarchical_model_report.py | 104 ------------ 3 files changed, 1 insertion(+), 265 deletions(-) rename InnerEye/ML/configs/classification/{CovidHierarchicalModel.py => CovidModel.py} (99%) delete mode 100644 InnerEye/ML/configs/reports/CovidHierarchicalModelReport.ipynb delete mode 100644 InnerEye/ML/configs/reports/covid_hierarchical_model_report.py diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidModel.py similarity index 99% rename from InnerEye/ML/configs/classification/CovidHierarchicalModel.py rename to InnerEye/ML/configs/classification/CovidModel.py index ace411a53..8921baf43 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidModel.py @@ -40,7 +40,7 @@ from InnerEye.ML.configs.ssl.CovidContainers import COVID_DATASET_ID -class CovidHierarchicalModel(ScalarModelBase): +class CovidModel(ScalarModelBase): """ Model to train a CovidDataset model from scratch or finetune from SSL-pretrained model. diff --git a/InnerEye/ML/configs/reports/CovidHierarchicalModelReport.ipynb b/InnerEye/ML/configs/reports/CovidHierarchicalModelReport.ipynb deleted file mode 100644 index 6e87854d9..000000000 --- a/InnerEye/ML/configs/reports/CovidHierarchicalModelReport.ipynb +++ /dev/null @@ -1,160 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "1", - "metadata": {}, - "outputs": [], - "source": [ - "%%javascript\n", - "IPython.OutputArea.prototype._should_scroll = function(lines) {\n", - " return false;\n", - "}\n", - "// Stops auto-scrolling so entire output is visible: see https://stackoverflow.com/a/41646403" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Default parameter values. 
They will be overwritten by papermill notebook parameters.\n", - "# This cell must carry the tag \"parameters\" in its metadata.\n", - "from pathlib import Path\n", - "import pickle\n", - "import codecs\n", - "\n", - "innereye_path = Path.cwd().parent.parent.parent.parent\n", - "train_metrics_csv = \"\"\n", - "val_metrics_csv = \"\"\n", - "test_metrics_csv = \"\"\n", - "config = \"\"\n", - "is_crossval_report = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if str(innereye_path) not in sys.path:\n", - " sys.path.append(str(innereye_path))\n", - "\n", - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "\n", - "config = pickle.loads(codecs.decode(config.encode(), \"base64\"))\n", - "\n", - "from InnerEye.ML.common import ModelExecutionMode\n", - "from InnerEye.ML.reports.notebook_report import print_header\n", - "from InnerEye.ML.configs.reports.covid_hierarchical_model_report import print_metrics_from_csv\n", - "\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\")\n", - "plt.rcParams['figure.figsize'] = (20, 10)\n", - "\n", - "#convert params to Path\n", - "train_metrics_csv = Path(train_metrics_csv)\n", - "val_metrics_csv = Path(val_metrics_csv)\n", - "test_metrics_csv = Path(test_metrics_csv)" - ] - }, - { - "cell_type": "markdown", - "id": "4", - "metadata": {}, - "source": [ - "# Metrics\n", - "## Train Set" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", - "metadata": {}, - "outputs": [], - "source": [ - "if train_metrics_csv.is_file():\n", - " print_metrics_from_csv(csv_to_set_optimal_threshold=train_metrics_csv,\n", - " csv_to_compute_metrics=train_metrics_csv,\n", - " config=config, is_crossval_report=is_crossval_report)" - ] - }, - { - "cell_type": "markdown", - "id": "6", - "metadata": {}, - "source": [ - "## Validation Set" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7", - "metadata": {}, - "outputs": [], - "source": [ - "if val_metrics_csv.is_file():\n", - " print_metrics_from_csv(csv_to_set_optimal_threshold=val_metrics_csv,\n", - " csv_to_compute_metrics=val_metrics_csv,\n", - " config=config, is_crossval_report=is_crossval_report)" - ] - }, - { - "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## Test Set" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "if val_metrics_csv.is_file() and test_metrics_csv.is_file():\n", - " print_metrics_from_csv(csv_to_set_optimal_threshold=val_metrics_csv,\n", - " csv_to_compute_metrics=test_metrics_csv,\n", - " config=config, is_crossval_report=is_crossval_report)" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/InnerEye/ML/configs/reports/covid_hierarchical_model_report.py b/InnerEye/ML/configs/reports/covid_hierarchical_model_report.py deleted file mode 100644 index f46e797e6..000000000 --- a/InnerEye/ML/configs/reports/covid_hierarchical_model_report.py +++ /dev/null @@ -1,104 +0,0 @@ -import pandas as pd -import numpy as np - 
-from pathlib import Path -from sklearn.metrics import accuracy_score, confusion_matrix -from typing import Dict - -from InnerEye.Common.metrics_constants import LoggingColumns -from InnerEye.ML.reports.classification_report import get_labels_and_predictions_from_dataframe, LabelsAndPredictions -from InnerEye.ML.reports.notebook_report import print_table -from InnerEye.ML.scalar_config import ScalarModelBase - -TARGET_NAMES = ['CVX03vs12', 'CVX0vs3', 'CVX1vs2'] -MULTICLASS_HUE_NAME = "Multiclass" - - -def get_label_from_label_dict(label_dict: Dict[str, float]) -> int: - """ - Converts strings CVX03vs12, CVX1vs2, CVX0vs3 to the corresponding class as int. - """ - if label_dict['CVX03vs12'] == 0: - assert np.isnan(label_dict['CVX1vs2']) - if label_dict['CVX0vs3'] == 0: - label = 0 - elif label_dict['CVX0vs3'] == 1: - label = 3 - else: - raise ValueError("CVX0vs3 should be 0 or 1.") - elif label_dict['CVX03vs12'] == 1: - assert np.isnan(label_dict['CVX0vs3']) - if label_dict['CVX1vs2'] == 0: - label = 1 - elif label_dict['CVX1vs2'] == 1: - label = 2 - else: - raise ValueError("CVX1vs2 should be 0 or 1.") - else: - raise ValueError("CVX03vs12 should be 0 or 1.") - return label - - -def get_model_prediction_by_probabilities(output_dict: Dict[str, float]) -> int: - """ - Based on the values for CVX03vs12, CVX0vs3 and CVX1vs2 predicted by the model, predict the CVX scores as followed: - score(CVX0) = [1 - score(CVX03vs12)][1 - score(CVX0vs3)] - score(CVX1) = score(CVX03vs12)[1 - score(CVX1vs2)] - score(CVX2) = score(CVX03vs12)score(CVX1vs2) - score(CVX3) = [1 - score(CVX03vs12)]score(CVX0vs3) - """ - cvx0 = (1 - output_dict['CVX03vs12']) * (1 - output_dict['CVX0vs3']) - cvx3 = (1 - output_dict['CVX03vs12']) * output_dict['CVX0vs3'] - cvx1 = output_dict['CVX03vs12'] * (1 - output_dict['CVX1vs2']) - cvx2 = output_dict['CVX03vs12'] * output_dict['CVX1vs2'] - return np.argmax([cvx0, cvx1, cvx2, cvx3]) - - -def get_dataframe_with_covid_labels(metrics_df: pd.DataFrame) -> pd.DataFrame: - def get_CVX_labels(df: pd.DataFrame) -> pd.DataFrame: - """ - Given a dataframe (with only one subject) with the model outputs for CVX03vs12, CVX0vs3 and CVX1vs2, - returns a corresponding dataframe with scores for CVX0, CVX1, CVX2 and CVX3 for this subject. See - `get_model_prediction_by_probabilities` for details on mapping the model output to CVX labels. - """ - df_by_hue = df[df[LoggingColumns.Hue.value].isin(TARGET_NAMES)].set_index(LoggingColumns.Hue.value) - model_output = get_model_prediction_by_probabilities(df_by_hue[LoggingColumns.ModelOutput.value].to_dict()) - label = get_label_from_label_dict(df_by_hue[LoggingColumns.Label.value].to_dict()) - - return pd.DataFrame.from_dict({LoggingColumns.Patient.value: [df.iloc[0][LoggingColumns.Patient.value]], - LoggingColumns.ModelOutput.value: [model_output], - LoggingColumns.Label.value: [label]}) - - df = metrics_df.copy() - # Group by subject, and for each subject, convert the CVX03vs12, CVX0vs3 and CVX1vs2 predictions to CVX labels. 
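
For reference, the combination rule implemented by the deleted get_model_prediction_by_probabilities above, applied to one subject with illustrative numbers (numpy assumed; the values are made up):

```python
import numpy as np

# Toy outputs of the three binary heads for a single subject.
p = {"CVX03vs12": 0.8, "CVX0vs3": 0.4, "CVX1vs2": 0.7}

scores = [
    (1 - p["CVX03vs12"]) * (1 - p["CVX0vs3"]),  # CVX0: 0.2 * 0.6 = 0.12
    p["CVX03vs12"] * (1 - p["CVX1vs2"]),        # CVX1: 0.8 * 0.3 = 0.24
    p["CVX03vs12"] * p["CVX1vs2"],              # CVX2: 0.8 * 0.7 = 0.56
    (1 - p["CVX03vs12"]) * p["CVX0vs3"],        # CVX3: 0.2 * 0.4 = 0.08
]
assert int(np.argmax(scores)) == 2              # predicted class: CVX2
```

The groupby call that follows applied this mapping once per subject; the whole module is removed because the multiclass model now predicts the four classes directly.
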
- df = df.groupby(LoggingColumns.Patient.value, as_index=False).apply(get_CVX_labels).reset_index(drop=True) - df[LoggingColumns.Hue.value] = [MULTICLASS_HUE_NAME] * len(df) - return df - - -def get_labels_and_predictions_covid_labels(csv: Path) -> LabelsAndPredictions: - metrics_df = pd.read_csv(csv) - df = get_dataframe_with_covid_labels(metrics_df=metrics_df) - return get_labels_and_predictions_from_dataframe(df) - - -def print_metrics_from_csv(csv_to_set_optimal_threshold: Path, - csv_to_compute_metrics: Path, - config: ScalarModelBase, - is_crossval_report: bool) -> None: - assert config.target_names == TARGET_NAMES - - predictions_to_compute_metrics = get_labels_and_predictions_covid_labels( - csv=csv_to_compute_metrics) - - acc = accuracy_score(predictions_to_compute_metrics.labels, predictions_to_compute_metrics.model_outputs) - rows = [[f"{acc:.4f}"]] - print_table(rows, header=["Multiclass Accuracy"]) - - conf_matrix = confusion_matrix(predictions_to_compute_metrics.labels, predictions_to_compute_metrics.model_outputs) - rows = [] - header = ["", "CVX0 predicted", "CVX1 predicted", "CVX2 predicted", "CVX3 predicted"] - for i in range(conf_matrix.shape[0]): - line = [f"CVX{i} GT"] + list(conf_matrix[i]) - rows.append(line) - print_table(rows, header=header) From 6ea667bf34280232d1dc1711e322c3ea4713a19f Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 09:29:00 +0100 Subject: [PATCH 03/22] Log multiclass accuracy --- InnerEye/ML/configs/classification/CovidModel.py | 2 +- InnerEye/ML/lightning_models.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidModel.py b/InnerEye/ML/configs/classification/CovidModel.py index 8921baf43..da5d02ba1 100644 --- a/InnerEye/ML/configs/classification/CovidModel.py +++ b/InnerEye/ML/configs/classification/CovidModel.py @@ -48,7 +48,7 @@ class CovidModel(ScalarModelBase): --pretraining_run_recovery_id=id_of_your_ssl_model, this will download the checkpoints of the run to your machine and load the corresponding pretrained model. - To recover from a particular checkpoint from your SSL run e.g. "recovery_epoch=499.ckpt" please use hte + To recover from a particular checkpoint from your SSL run e.g. "recovery_epoch=499.ckpt" please use the --name_of_checkpoint argument. """ use_pretrained_model = param.Boolean(default=False, doc="If True, start training from a model pretrained with SSL." 
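
The lightning_models.py hunk below wires torchmetrics.Accuracy into training and validation. As a standalone sketch of the update/compute pattern it relies on (assuming a 2021-era torchmetrics, where Accuracy() takes no task argument):

```python
import torch
import torchmetrics

accuracy = torchmetrics.Accuracy()

# One minibatch: class posteriors of shape [N, C] and integer labels of shape [N].
posteriors = torch.tensor([[0.7, 0.1, 0.1, 0.1],
                           [0.2, 0.5, 0.2, 0.1]])
labels = torch.tensor([0, 2])

accuracy(posteriors, labels)   # update step: argmax of row 0 is correct, row 1 is not
print(accuracy.compute())      # epoch-level value: tensor(0.5000)
```

Logging the metric object itself via log_on_epoch, as the hunk does, lets Lightning drive compute() and reset() at the appropriate epoch boundaries.
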
diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index 490e693ee..7b875a3ee 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -7,6 +7,7 @@ import torch from pytorch_lightning.utilities import move_data_to_device from torch.nn import ModuleDict, ModuleList +import torchmetrics from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME from InnerEye.Common.metrics_constants import LoggingColumns, MetricType, TRAIN_PREFIX, VALIDATION_PREFIX @@ -197,7 +198,8 @@ def __init__(self, config: ScalarModelBase, *args: Any, **kwargs: Any) -> None: # and training set, in particular ones that are not possible to compute from a single minibatch (AUC and alike) self.train_metric_computers = self.create_metric_computers() self.val_metric_computers = self.create_metric_computers() - + self.train_accuracy = torchmetrics.Accuracy() + self.val_accuracy = torchmetrics.Accuracy() # if config.compute_grad_cam: # model_to_evaluate = self.train_val_params.mean_teacher_model if \ # config.compute_mean_teacher_model else self.train_val_params.model @@ -279,6 +281,7 @@ def training_or_validation_step(self, subject_ids = model_inputs_and_labels.subject_ids loss = self.loss_fn(logits, labels) self.write_loss(is_training, loss) + self.compute_and_log_accuracy(logits, model_inputs_and_labels.labels, is_training) self.compute_and_log_metrics(logits, labels, subject_ids, is_training) self.log_on_epoch(name=MetricType.SUBJECT_COUNT, value=len(model_inputs_and_labels.subject_ids), @@ -286,6 +289,15 @@ def training_or_validation_step(self, reduce_fx=sum) return loss + def compute_and_log_accuracy(self, logits, labels, is_training): + posteriors = self.logits_to_posterior(logits) + labels = torch.argmax(labels.data.to(dtype=torch.int), dim=-1) + metric = self.train_accuracy if is_training else self.val_accuracy + metric(posteriors, labels) + self.log_on_epoch(name="MulticlassAccuracy", + value=metric, + is_training=is_training) + def compute_and_log_metrics(self, logits: torch.Tensor, targets: torch.Tensor, From 8557cdf10ddce1b4dbfea2d13fcd92cda58acf85 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 09:33:53 +0100 Subject: [PATCH 04/22] Remove get_posthoc_label_transform --- InnerEye/ML/lightning_models.py | 4 +--- InnerEye/ML/model_testing.py | 3 --- InnerEye/ML/scalar_config.py | 8 -------- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index 7b875a3ee..586e6e02e 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -179,7 +179,6 @@ def __init__(self, config: ScalarModelBase, *args: Any, **kwargs: Any) -> None: super().__init__(config, *args, **kwargs) self.model = config.create_model() raw_loss = model_util.create_scalar_loss_function(config) - self.posthoc_label_transform = config.get_posthoc_label_transform() if isinstance(config, SequenceModelBase): self.loss_fn = lambda model_output, loss: apply_sequence_model_loss(raw_loss, model_output, loss) self.target_indices = config.get_target_indices() @@ -272,7 +271,6 @@ def training_or_validation_step(self, """ model_inputs_and_labels = get_scalar_model_inputs_and_labels(self.model, self.target_indices, sample) labels = model_inputs_and_labels.labels - labels = self.posthoc_label_transform(labels) if is_training: logits = self.model(*model_inputs_and_labels.model_inputs) else: @@ -281,7 +279,7 @@ def training_or_validation_step(self, 
subject_ids = model_inputs_and_labels.subject_ids loss = self.loss_fn(logits, labels) self.write_loss(is_training, loss) - self.compute_and_log_accuracy(logits, model_inputs_and_labels.labels, is_training) + self.compute_and_log_accuracy(logits, labels, is_training) self.compute_and_log_metrics(logits, labels, subject_ids, is_training) self.log_on_epoch(name=MetricType.SUBJECT_COUNT, value=len(model_inputs_and_labels.subject_ids), diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index 9b6c7521e..5fefdc44a 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -423,8 +423,6 @@ def classification_model_test(config: ScalarModelBase, :param model_proc: whether we are testing an ensemble or single model :return: InferenceMetricsForClassification object that contains metrics related for all of the checkpoint epochs. """ - posthoc_label_transform = config.get_posthoc_label_transform() - checkpoint_paths = checkpoint_handler.get_checkpoints_to_test() if not checkpoint_paths: raise ValueError("There were no checkpoints available for model testing.") @@ -457,7 +455,6 @@ def classification_model_test(config: ScalarModelBase, result = pipeline.predict(sample) model_output = result.posteriors label = result.labels.to(device=model_output.device) - label = posthoc_label_transform(label) sample_id = result.subject_ids[0] if output_logger: for i in range(len(config.target_names)): diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py index 0bea54279..ceb20dc43 100644 --- a/InnerEye/ML/scalar_config.py +++ b/InnerEye/ML/scalar_config.py @@ -359,14 +359,6 @@ def get_label_transform(self) -> Union[Callable, List[Callable]]: """ return LabelTransformation.identity - def get_posthoc_label_transform(self) -> Callable: - """ - Return a transformation to apply to the labels after they are loaded, for computing losses, metrics, and - reports. The transformed labels refer to the config's target_names, if defined (class_names, otherwise). - If not overriden, this method does not change the loaded labels. 
- """ - return lambda x: x # no-op by default - def read_dataset_into_dataframe_and_pre_process(self) -> None: assert self.local_dataset is not None file_path = self.local_dataset / self.dataset_csv From e25d202db460ed92dc760c4138c3084b0b3704b3 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 12:42:43 +0100 Subject: [PATCH 05/22] Refactor metrics --- .../ML/configs/classification/CovidModel.py | 26 ++++- InnerEye/ML/lightning_models.py | 108 ++---------------- InnerEye/ML/scalar_config.py | 85 ++++++++++++++ 3 files changed, 118 insertions(+), 101 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidModel.py b/InnerEye/ML/configs/classification/CovidModel.py index da5d02ba1..4f4609999 100644 --- a/InnerEye/ML/configs/classification/CovidModel.py +++ b/InnerEye/ML/configs/classification/CovidModel.py @@ -3,14 +3,17 @@ import math from pathlib import Path -from typing import Any, Callable +from typing import Any, Callable, List import PIL import numpy as np import pandas as pd import param import torch +import torchmetrics + from PIL import Image +from torch.nn import ModuleList, ModuleDict from pytorch_lightning import LightningModule from torchvision.transforms import Compose @@ -28,6 +31,7 @@ from InnerEye.ML.deep_learning_config import LRSchedulerType, MultiprocessingStartMethod, \ OptimizerType +from InnerEye.ML.metrics_dict import DataframeLogger from InnerEye.ML.model_config_base import ModelTransformsPerExecutionMode from InnerEye.ML.model_testing import MODEL_OUTPUT_CSV @@ -39,6 +43,7 @@ from InnerEye.ML.configs.ssl.CovidContainers import COVID_DATASET_ID +from InnerEye.ML.metrics_dict import MetricsDict class CovidModel(ScalarModelBase): """ @@ -203,6 +208,25 @@ def custom_loss(output: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: def get_post_loss_logits_normalization_function(self) -> Callable: return torch.nn.Softmax() + def create_metric_computers(self) -> ModuleDict: + return ModuleDict({MetricsDict.DEFAULT_HUE_KEY: ModuleList([torchmetrics.Accuracy()])}) + + def compute_and_log_metrics(self, + logits: torch.Tensor, + targets: torch.Tensor, + subject_ids: List[str], + is_training: bool, + metrics: ModuleDict, + logger: DataframeLogger, + current_epoch: int) -> None: + posteriors = self.logits_to_posterior(logits) + labels = torch.argmax(targets.data.to(dtype=torch.int), dim=-1) + metric = self.train_accuracy if is_training else self.val_accuracy + metric[MetricsDict.DEFAULT_HUE_KEY][0](posteriors, labels) + self.log_on_epoch(name="MultiClassAccuracy", + value=metric, + is_training=is_training) + def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path: """ Generate a custom report for the Covid model. This report will read the file model_output.csv generated for diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index 586e6e02e..adcc1d6cc 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -2,24 +2,18 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
# ------------------------------------------------------------------------------------------ -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import torch from pytorch_lightning.utilities import move_data_to_device -from torch.nn import ModuleDict, ModuleList -import torchmetrics from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME from InnerEye.Common.metrics_constants import LoggingColumns, MetricType, TRAIN_PREFIX, VALIDATION_PREFIX -from InnerEye.ML.common import ModelExecutionMode from InnerEye.ML.config import SegmentationModelBase from InnerEye.ML.dataset.sample import CroppedSample from InnerEye.ML.dataset.scalar_sample import ScalarItem from InnerEye.ML.lightning_base import InnerEyeLightning -from InnerEye.ML.lightning_metrics import Accuracy05, AccuracyAtOptimalThreshold, AreaUnderPrecisionRecallCurve, \ - AreaUnderRocCurve, BinaryCrossEntropyWithLogits, ExplainedVariance, FalseNegativeRateOptimalThreshold, \ - FalsePositiveRateOptimalThreshold, MeanAbsoluteError, MeanSquaredError, MetricForMultipleStructures, \ - OptimalThreshold, ScalarMetricsBase +from InnerEye.ML.lightning_metrics import MetricForMultipleStructures from InnerEye.ML.metrics import compute_dice_across_patches from InnerEye.ML.metrics_dict import DataframeLogger, MetricsDict, SequenceMetricsDict from InnerEye.ML.model_config_base import ModelConfigBase @@ -27,7 +21,7 @@ from InnerEye.ML.sequence_config import SequenceModelBase from InnerEye.ML.utils import image_util, metrics_util, model_util from InnerEye.ML.utils.model_util import get_scalar_model_inputs_and_labels -from InnerEye.ML.utils.sequence_utils import apply_sequence_model_loss, get_masked_model_outputs_and_labels +from InnerEye.ML.utils.sequence_utils import apply_sequence_model_loss SUBJECT_OUTPUT_PER_RANK_PREFIX = f"{SUBJECT_METRICS_FILE_NAME}.rank" @@ -195,43 +189,14 @@ def __init__(self, config: ScalarModelBase, *args: Any, **kwargs: Any) -> None: self.loss_type = config.loss_type # These two fields store the PyTorch Lightning Metrics objects that will compute metrics on validation # and training set, in particular ones that are not possible to compute from a single minibatch (AUC and alike) - self.train_metric_computers = self.create_metric_computers() - self.val_metric_computers = self.create_metric_computers() - self.train_accuracy = torchmetrics.Accuracy() - self.val_accuracy = torchmetrics.Accuracy() + self.train_metric_computers = config.create_metric_computers() + self.val_metric_computers = config.create_metric_computers() # if config.compute_grad_cam: # model_to_evaluate = self.train_val_params.mean_teacher_model if \ # config.compute_mean_teacher_model else self.train_val_params.model # self.guided_grad_cam = VisualizationMaps(model_to_evaluate, config) # config.visualization_folder.mkdir(exist_ok=True) - def create_metric_computers(self) -> ModuleDict: - """ - Gets a set of objects that compute all the metrics for the type of model that is being trained, - across all prediction targets (sequence positions when using a sequence model). - :return: A dictionary mapping from names of prediction targets to a list of metric computers. - """ - # The metric computers should be stored in an object that derives from torch.Module, - # so that they are picked up when moving the whole LightningModule to GPU. 
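
This comment (and the PL issue linked on the next removed line) is the reason the computers live in ModuleDict/ModuleList rather than a plain dict: only registered submodules follow the LightningModule across devices. A minimal sketch of the difference, with torchmetrics.Accuracy standing in for the InnerEye metric classes:

```python
import torchmetrics
from torch import nn

class WithRegisteredMetrics(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # ModuleDict/ModuleList register contents as submodules, so .to(device)
        # and .cuda() move the metric state together with the model weights.
        self.metrics = nn.ModuleDict({"Default": nn.ModuleList([torchmetrics.Accuracy()])})

class WithPlainDict(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # Invisible to .to(device): the metric's internal buffers stay behind.
        self.metrics = {"Default": [torchmetrics.Accuracy()]}

assert any(isinstance(m, torchmetrics.Accuracy) for m in WithRegisteredMetrics().modules())
assert not any(isinstance(m, torchmetrics.Accuracy) for m in WithPlainDict().modules())
```
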
- # https://github.com/PyTorchLightning/pytorch-lightning/issues/4713 - return ModuleDict({p: self._get_metrics_computers() for p in self.target_names}) - - def _get_metrics_computers(self) -> ModuleList: - """ - Gets the objects that compute metrics for the present kind of models, for a single prediction target. - """ - if self.is_classification_model: - return ModuleList([Accuracy05(), - AccuracyAtOptimalThreshold(), - OptimalThreshold(), - FalsePositiveRateOptimalThreshold(), - FalseNegativeRateOptimalThreshold(), - AreaUnderRocCurve(), - AreaUnderPrecisionRecallCurve(), - BinaryCrossEntropyWithLogits()]) - else: - return ModuleList([MeanAbsoluteError(), MeanSquaredError(), ExplainedVariance()]) - def forward(self, *model_inputs: torch.Tensor) -> torch.Tensor: # type: ignore """ Runs a list of model input tensors through the model and returns the results. @@ -279,72 +244,15 @@ def training_or_validation_step(self, subject_ids = model_inputs_and_labels.subject_ids loss = self.loss_fn(logits, labels) self.write_loss(is_training, loss) - self.compute_and_log_accuracy(logits, labels, is_training) - self.compute_and_log_metrics(logits, labels, subject_ids, is_training) + metrics = self.train_metric_computers if is_training else self.val_metric_computers + logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger + self.config.compute_and_log_metrics(logits, labels, subject_ids, is_training, metrics, logger, self.current_epoch) self.log_on_epoch(name=MetricType.SUBJECT_COUNT, value=len(model_inputs_and_labels.subject_ids), is_training=is_training, reduce_fx=sum) return loss - def compute_and_log_accuracy(self, logits, labels, is_training): - posteriors = self.logits_to_posterior(logits) - labels = torch.argmax(labels.data.to(dtype=torch.int), dim=-1) - metric = self.train_accuracy if is_training else self.val_accuracy - metric(posteriors, labels) - self.log_on_epoch(name="MulticlassAccuracy", - value=metric, - is_training=is_training) - - def compute_and_log_metrics(self, - logits: torch.Tensor, - targets: torch.Tensor, - subject_ids: List[str], - is_training: bool) -> None: - """ - Computes all the metrics for a given (logits, labels) pair, and writes them to the loggers. - :param logits: The model output before normalization. - :param targets: The expected model outputs. - :param subject_ids: The subject IDs for the present minibatch. - :param is_training: If True, write the metrics as training metrics, otherwise as validation metrics. 
- :return: - """ - metrics = self.train_metric_computers if is_training else self.val_metric_computers - per_subject_outputs: List[Tuple[str, str, torch.Tensor, torch.Tensor]] = [] - for i, (prediction_target, metric_list) in enumerate(metrics.items()): - # mask the model outputs and labels if required - masked = get_masked_model_outputs_and_labels( - logits[:, i, ...], targets[:, i, ...], subject_ids) - # compute metrics on valid masked tensors only - if masked is not None: - _logits = masked.model_outputs.data - _posteriors = self.logits_to_posterior(_logits) - # Classification metrics expect labels as integers, but they are float throughout the rest of the code - labels_dtype = torch.int if self.is_classification_model else _posteriors.dtype - _labels = masked.labels.data.to(dtype=labels_dtype) - _subject_ids = masked.subject_ids - assert _subject_ids is not None - for metric in metric_list: - if isinstance(metric, ScalarMetricsBase) and metric.compute_from_logits: - metric(_logits, _labels) - else: - metric(_posteriors, _labels) - per_subject_outputs.extend( - zip(_subject_ids, [prediction_target] * len(_subject_ids), _posteriors.tolist(), _labels.tolist())) - # Write a full breakdown of per-subject predictions and labels to a file. These files are local to the current - # rank in distributed training, and will be aggregated after training. - logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger - data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL - for subject, prediction_target, model_output, label in per_subject_outputs: - logger.add_record({ - LoggingColumns.Epoch.value: self.current_epoch, - LoggingColumns.Patient.value: subject, - LoggingColumns.Hue.value: prediction_target, - LoggingColumns.ModelOutput.value: model_output, - LoggingColumns.Label.value: label, - LoggingColumns.DataSplit.value: data_split.value - }) - def training_or_validation_epoch_end(self, is_training: bool) -> None: """ Writes all training or validation metrics that were aggregated over the epoch to the loggers. 
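
With compute_and_log_metrics removed from ScalarLightning here and re-added to ScalarModelBase in the scalar_config.py diff below, metric computation becomes a per-config extension point. A hedged sketch of what a config can now override (the class name and metric choice are hypothetical; CovidModel later in this series is the real example):

```python
import torchmetrics
from torch.nn import ModuleDict, ModuleList

from InnerEye.ML.scalar_config import ScalarModelBase


class MyScalarConfig(ScalarModelBase):
    """Hypothetical config that swaps the default per-target computers for one Accuracy."""

    def create_metric_computers(self) -> ModuleDict:
        # Keep the same container types as the base implementation so the
        # metrics are still registered submodules and move to GPU with the model.
        return ModuleDict({"Default": ModuleList([torchmetrics.Accuracy()])})
```

compute_and_log_metrics can be overridden in the same way, using the signature added in the diff below.
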
diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py index ceb20dc43..d562bd45e 100644 --- a/InnerEye/ML/scalar_config.py +++ b/InnerEye/ML/scalar_config.py @@ -8,18 +8,27 @@ import pandas as pd import param +import torch from azureml.core import ScriptRunConfig from azureml.train.hyperdrive import HyperDriveConfig +from torch.nn import ModuleDict, ModuleList + from InnerEye.Common.common_util import print_exception from InnerEye.Common.generic_parsing import ListOrDictParam +from InnerEye.Common.metrics_constants import LoggingColumns from InnerEye.Common.type_annotations import TupleInt3 from InnerEye.ML.common import ModelExecutionMode, OneHotEncoderBase from InnerEye.ML.deep_learning_config import ModelCategory +from InnerEye.ML.lightning_metrics import Accuracy05, AccuracyAtOptimalThreshold, AreaUnderPrecisionRecallCurve, \ + AreaUnderRocCurve, BinaryCrossEntropyWithLogits, ExplainedVariance, FalseNegativeRateOptimalThreshold, \ + FalsePositiveRateOptimalThreshold, MeanAbsoluteError, MeanSquaredError, OptimalThreshold, ScalarMetricsBase +from InnerEye.ML.metrics_dict import DataframeLogger from InnerEye.ML.model_config_base import ModelConfigBase, ModelTransformsPerExecutionMode from InnerEye.ML.utils.csv_util import CSV_CHANNEL_HEADER, CSV_SUBJECT_HEADER from InnerEye.ML.utils.split_dataset import DatasetSplits +from InnerEye.ML.utils.sequence_utils import get_masked_model_outputs_and_labels DEFAULT_KEY = "Default" @@ -492,6 +501,82 @@ def get_scalar_item_transform(self) -> ModelTransformsPerExecutionMode: val=ScalarItemAugmentation(image_transform.val, segmentation_transform.val), test=ScalarItemAugmentation(image_transform.test, segmentation_transform.test)) + def create_metric_computers(self) -> ModuleDict: + """ + Gets a set of objects that compute all the metrics for the type of model that is being trained, + across all prediction targets (sequence positions when using a sequence model). + :return: A dictionary mapping from names of prediction targets to a list of metric computers. + """ + # The metric computers should be stored in an object that derives from torch.Module, + # so that they are picked up when moving the whole LightningModule to GPU. + # https://github.com/PyTorchLightning/pytorch-lightning/issues/4713 + return ModuleDict({p: self._get_metrics_computers() for p in self.target_names}) + + def _get_metrics_computers(self) -> ModuleList: + """ + Gets the objects that compute metrics for the present kind of models, for a single prediction target. + """ + if self.is_classification_model: + return ModuleList([Accuracy05(), + AccuracyAtOptimalThreshold(), + OptimalThreshold(), + FalsePositiveRateOptimalThreshold(), + FalseNegativeRateOptimalThreshold(), + AreaUnderRocCurve(), + AreaUnderPrecisionRecallCurve(), + BinaryCrossEntropyWithLogits()]) + else: + return ModuleList([MeanAbsoluteError(), MeanSquaredError(), ExplainedVariance()]) + + def compute_and_log_metrics(self, + logits: torch.Tensor, + targets: torch.Tensor, + subject_ids: List[str], + is_training: bool, + metrics: ModuleDict, + logger: DataframeLogger, + current_epoch: int) -> None: + """ + Computes all the metrics for a given (logits, labels) pair, and writes them to the loggers. + :param logits: The model output before normalization. + :param targets: The expected model outputs. + :param subject_ids: The subject IDs for the present minibatch. + :param is_training: If True, write the metrics as training metrics, otherwise as validation metrics. 
+ :return: + """ + per_subject_outputs: List[Tuple[str, str, torch.Tensor, torch.Tensor]] = [] + for i, (prediction_target, metric_list) in enumerate(metrics.items()): + # mask the model outputs and labels if required + masked = get_masked_model_outputs_and_labels( + logits[:, i, ...], targets[:, i, ...], subject_ids) + # compute metrics on valid masked tensors only + if masked is not None: + _logits = masked.model_outputs.data + _posteriors = self.logits_to_posterior(_logits) + # Classification metrics expect labels as integers, but they are float throughout the rest of the code + labels_dtype = torch.int if self.is_classification_model else _posteriors.dtype + _labels = masked.labels.data.to(dtype=labels_dtype) + _subject_ids = masked.subject_ids + assert _subject_ids is not None + for metric in metric_list: + if isinstance(metric, ScalarMetricsBase) and metric.compute_from_logits: + metric(_logits, _labels) + else: + metric(_posteriors, _labels) + per_subject_outputs.extend( + zip(_subject_ids, [prediction_target] * len(_subject_ids), _posteriors.tolist(), _labels.tolist())) + # Write a full breakdown of per-subject predictions and labels to a file. These files are local to the current + # rank in distributed training, and will be aggregated after training. + data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL + for subject, prediction_target, model_output, label in per_subject_outputs: + logger.add_record({ + LoggingColumns.Epoch.value: current_epoch, + LoggingColumns.Patient.value: subject, + LoggingColumns.Hue.value: prediction_target, + LoggingColumns.ModelOutput.value: model_output, + LoggingColumns.Label.value: label, + LoggingColumns.DataSplit.value: data_split.value + }) def get_non_image_features_dict(default_channels: List[str], specific_channels: Optional[Dict[str, List[str]]] = None) -> Dict[str, List[str]]: From 84c74b46fee041fd6dc4f0a450baa858673ebc8e Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 12:52:21 +0100 Subject: [PATCH 06/22] Move DEFAULT_KEY --- InnerEye/ML/metrics_dict.py | 3 ++- InnerEye/ML/scalar_config.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/InnerEye/ML/metrics_dict.py b/InnerEye/ML/metrics_dict.py index 96617e1ef..fd2dc8fad 100644 --- a/InnerEye/ML/metrics_dict.py +++ b/InnerEye/ML/metrics_dict.py @@ -21,7 +21,6 @@ from InnerEye.Common.metrics_constants import INTERNAL_TO_LOGGING_COLUMN_NAMES, LoggingColumns, MetricType, \ MetricTypeOrStr, SEQUENCE_POSITION_HUE_NAME_PREFIX from InnerEye.ML.common import ModelExecutionMode -from InnerEye.ML.scalar_config import DEFAULT_KEY from InnerEye.ML.utils.metrics_util import binary_classification_accuracy, mean_absolute_error, \ mean_squared_error, r2_score @@ -29,6 +28,8 @@ T = TypeVar('T', np.ndarray, float) MetricsPerExecutionModeAndEpoch = Dict[ModelExecutionMode, Dict[Union[int, str], 'ScalarMetricsDict']] +DEFAULT_KEY = "Default" + def average_metric_values(values: List[float], skip_nan_when_averaging: bool) -> float: """ diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py index d562bd45e..0791d856e 100644 --- a/InnerEye/ML/scalar_config.py +++ b/InnerEye/ML/scalar_config.py @@ -24,14 +24,12 @@ from InnerEye.ML.lightning_metrics import Accuracy05, AccuracyAtOptimalThreshold, AreaUnderPrecisionRecallCurve, \ AreaUnderRocCurve, BinaryCrossEntropyWithLogits, ExplainedVariance, FalseNegativeRateOptimalThreshold, \ FalsePositiveRateOptimalThreshold, MeanAbsoluteError, 
MeanSquaredError, OptimalThreshold, ScalarMetricsBase -from InnerEye.ML.metrics_dict import DataframeLogger +from InnerEye.ML.metrics_dict import DEFAULT_KEY, DataframeLogger from InnerEye.ML.model_config_base import ModelConfigBase, ModelTransformsPerExecutionMode from InnerEye.ML.utils.csv_util import CSV_CHANNEL_HEADER, CSV_SUBJECT_HEADER from InnerEye.ML.utils.split_dataset import DatasetSplits from InnerEye.ML.utils.sequence_utils import get_masked_model_outputs_and_labels -DEFAULT_KEY = "Default" - class AggregationType(Enum): """ From 1bf6623694d1639b643c3dc20021c58ea2346f89 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 13:21:21 +0100 Subject: [PATCH 07/22] Log to DataframeLogger --- .../ML/configs/classification/CovidModel.py | 29 ++++++++++++------- InnerEye/ML/lightning_models.py | 3 +- InnerEye/ML/scalar_config.py | 2 +- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidModel.py b/InnerEye/ML/configs/classification/CovidModel.py index 4f4609999..830330cd8 100644 --- a/InnerEye/ML/configs/classification/CovidModel.py +++ b/InnerEye/ML/configs/classification/CovidModel.py @@ -31,19 +31,17 @@ from InnerEye.ML.deep_learning_config import LRSchedulerType, MultiprocessingStartMethod, \ OptimizerType -from InnerEye.ML.metrics_dict import DataframeLogger +from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImagingFeatureType from InnerEye.ML.model_config_base import ModelTransformsPerExecutionMode from InnerEye.ML.model_testing import MODEL_OUTPUT_CSV -from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImagingFeatureType +from InnerEye.ML.configs.ssl.CovidContainers import COVID_DATASET_ID from InnerEye.ML.scalar_config import ScalarLoss, ScalarModelBase from InnerEye.ML.utils.run_recovery import RunRecovery from InnerEye.ML.utils.split_dataset import DatasetSplits +from InnerEye.ML.metrics_dict import MetricsDict, DataframeLogger -from InnerEye.ML.configs.ssl.CovidContainers import COVID_DATASET_ID - -from InnerEye.ML.metrics_dict import MetricsDict class CovidModel(ScalarModelBase): """ @@ -219,13 +217,22 @@ def compute_and_log_metrics(self, metrics: ModuleDict, logger: DataframeLogger, current_epoch: int) -> None: - posteriors = self.logits_to_posterior(logits) + posteriors = self.get_post_loss_logits_normalization_function()(logits) labels = torch.argmax(targets.data.to(dtype=torch.int), dim=-1) - metric = self.train_accuracy if is_training else self.val_accuracy - metric[MetricsDict.DEFAULT_HUE_KEY][0](posteriors, labels) - self.log_on_epoch(name="MultiClassAccuracy", - value=metric, - is_training=is_training) + metric = metrics[MetricsDict.DEFAULT_HUE_KEY][0] + metric(posteriors, labels) + + data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL + per_subject_outputs = list(zip(subject_ids, [MetricsDict.DEFAULT_HUE_KEY] * len(subject_ids), posteriors.tolist(), labels.tolist())) + for subject, prediction_target, model_output, label in per_subject_outputs: + logger.add_record({ + LoggingColumns.Epoch.value: current_epoch, + LoggingColumns.Patient.value: subject, + LoggingColumns.Hue.value: prediction_target, + LoggingColumns.ModelOutput.value: model_output, + LoggingColumns.Label.value: label, + LoggingColumns.DataSplit.value: data_split.value + }) def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path: """ diff --git 
a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index adcc1d6cc..110663824 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -191,6 +191,7 @@ def __init__(self, config: ScalarModelBase, *args: Any, **kwargs: Any) -> None: # and training set, in particular ones that are not possible to compute from a single minibatch (AUC and alike) self.train_metric_computers = config.create_metric_computers() self.val_metric_computers = config.create_metric_computers() + self.compute_and_log_metrics = config.compute_and_log_metrics # if config.compute_grad_cam: # model_to_evaluate = self.train_val_params.mean_teacher_model if \ # config.compute_mean_teacher_model else self.train_val_params.model @@ -246,7 +247,7 @@ def training_or_validation_step(self, self.write_loss(is_training, loss) metrics = self.train_metric_computers if is_training else self.val_metric_computers logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger - self.config.compute_and_log_metrics(logits, labels, subject_ids, is_training, metrics, logger, self.current_epoch) + self.compute_and_log_metrics(logits, labels, subject_ids, is_training, metrics, logger, self.current_epoch) self.log_on_epoch(name=MetricType.SUBJECT_COUNT, value=len(model_inputs_and_labels.subject_ids), is_training=is_training, diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py index 0791d856e..0a471b6f6 100644 --- a/InnerEye/ML/scalar_config.py +++ b/InnerEye/ML/scalar_config.py @@ -550,7 +550,7 @@ def compute_and_log_metrics(self, # compute metrics on valid masked tensors only if masked is not None: _logits = masked.model_outputs.data - _posteriors = self.logits_to_posterior(_logits) + _posteriors = self.get_post_loss_logits_normalization_function()(_logits) # Classification metrics expect labels as integers, but they are float throughout the rest of the code labels_dtype = torch.int if self.is_classification_model else _posteriors.dtype _labels = masked.labels.data.to(dtype=labels_dtype) From 119249db88ec6dec56255441dcdcc8d9a0b74a4c Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 13:41:20 +0100 Subject: [PATCH 08/22] Fix tests --- .../test_hierarchical_covid_model_report.py | 22 ------------------- Tests/ML/test_metrics.py | 8 +++---- 2 files changed, 4 insertions(+), 26 deletions(-) delete mode 100644 Tests/ML/configs/utils/test_hierarchical_covid_model_report.py diff --git a/Tests/ML/configs/utils/test_hierarchical_covid_model_report.py b/Tests/ML/configs/utils/test_hierarchical_covid_model_report.py deleted file mode 100644 index 1dea1bff5..000000000 --- a/Tests/ML/configs/utils/test_hierarchical_covid_model_report.py +++ /dev/null @@ -1,22 +0,0 @@ -import pandas as pd -from math import nan - -from InnerEye.Common.metrics_constants import LoggingColumns -from InnerEye.ML.configs.reports.covid_hierarchical_model_report import MULTICLASS_HUE_NAME, \ - get_dataframe_with_covid_labels - - -def test_get_dataframe_with_covid_labels() -> None: - - df = pd.DataFrame.from_dict({LoggingColumns.Patient.value: [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], - LoggingColumns.Hue.value: ['CVX03vs12', 'CVX0vs3', 'CVX1vs2'] * 4, - LoggingColumns.Label.value: [0, 0, nan, 0, 1, nan, 1, nan, 0, 1, nan, 1], - LoggingColumns.ModelOutput.value: [0.1, 0.1, 0.5, 0.1, 0.9, 0.5, 0.9, 0.9, 0.9, 0.1, 0.2, 0.1]}) - expected_df = pd.DataFrame.from_dict({LoggingColumns.Patient.value: [1, 2, 3, 4], - 
LoggingColumns.ModelOutput.value: [0, 3, 2, 0], - LoggingColumns.Label.value: [0, 3, 1, 2], - LoggingColumns.Hue.value: [MULTICLASS_HUE_NAME] * 4 - }) - - multiclass_df = get_dataframe_with_covid_labels(df) - assert expected_df.equals(multiclass_df) diff --git a/Tests/ML/test_metrics.py b/Tests/ML/test_metrics.py index b30a754fb..dfc74a1a2 100644 --- a/Tests/ML/test_metrics.py +++ b/Tests/ML/test_metrics.py @@ -164,8 +164,8 @@ def test_get_column_name_for_logging() -> None: def test_classification_metrics() -> None: - classification_module = ScalarLightning(DummyClassification()) - metrics = classification_module._get_metrics_computers() + config = DummyClassification() + metrics = config._get_metrics_computers() logits = [torch.tensor([2.1972, 1.3863, 0.4055]), torch.tensor([-0.8473, 2.1972, -0.4055])] posteriors = [torch.sigmoid(logit) for logit in logits] labels = [torch.tensor([1, 1, 0]), torch.tensor([0, 0, 0])] @@ -203,8 +203,8 @@ def test_classification_metrics() -> None: def test_regression_metrics() -> None: - regression_module = ScalarLightning(DummyRegression()) - metrics = regression_module._get_metrics_computers() + config = DummyRegression() + metrics = config._get_metrics_computers() outputs = [torch.tensor([1., 2., 1.]), torch.tensor([4., 0., 2.])] labels = [torch.tensor([1., 1., 0.]), torch.tensor([2., 0., 2.])] for output, label in zip(outputs, labels): From 2ffd4929e4b0927d8341fe21f0ce67616bc9d1b9 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 13:45:48 +0100 Subject: [PATCH 09/22] Bug fix --- InnerEye/ML/configs/classification/CovidModel.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidModel.py b/InnerEye/ML/configs/classification/CovidModel.py index 830330cd8..3e681942c 100644 --- a/InnerEye/ML/configs/classification/CovidModel.py +++ b/InnerEye/ML/configs/classification/CovidModel.py @@ -10,7 +10,6 @@ import pandas as pd import param import torch -import torchmetrics from PIL import Image from torch.nn import ModuleList, ModuleDict @@ -31,11 +30,11 @@ from InnerEye.ML.deep_learning_config import LRSchedulerType, MultiprocessingStartMethod, \ OptimizerType +from InnerEye.ML.lightning_metrics import Accuracy05 from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImagingFeatureType from InnerEye.ML.model_config_base import ModelTransformsPerExecutionMode from InnerEye.ML.model_testing import MODEL_OUTPUT_CSV - from InnerEye.ML.configs.ssl.CovidContainers import COVID_DATASET_ID from InnerEye.ML.scalar_config import ScalarLoss, ScalarModelBase from InnerEye.ML.utils.run_recovery import RunRecovery @@ -207,7 +206,7 @@ def get_post_loss_logits_normalization_function(self) -> Callable: return torch.nn.Softmax() def create_metric_computers(self) -> ModuleDict: - return ModuleDict({MetricsDict.DEFAULT_HUE_KEY: ModuleList([torchmetrics.Accuracy()])}) + return ModuleDict({MetricsDict.DEFAULT_HUE_KEY: ModuleList([Accuracy05()])}) def compute_and_log_metrics(self, logits: torch.Tensor, From 9562e87a8161091fd5c11ebe8569cf983770ef76 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 14:57:15 +0100 Subject: [PATCH 10/22] Flake8 and mypy --- InnerEye/ML/configs/classification/CovidModel.py | 2 +- Tests/ML/test_metrics.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidModel.py 
b/InnerEye/ML/configs/classification/CovidModel.py index 3e681942c..740bdd0cd 100644 --- a/InnerEye/ML/configs/classification/CovidModel.py +++ b/InnerEye/ML/configs/classification/CovidModel.py @@ -257,7 +257,7 @@ def get_labels_and_predictions(df: pd.DataFrame) -> pd.DataFrame: LoggingColumns.ModelOutput.value: [np.argmax(predictions)], LoggingColumns.Label.value: [np.argmax(labels)]}) - def get_accuracy(df): + def get_accuracy(df: pd.DataFrame) -> float: df = df.groupby(LoggingColumns.Patient.value, as_index=False).apply(get_labels_and_predictions).reset_index( drop=True) df["tp+tn"] = df.apply( diff --git a/Tests/ML/test_metrics.py b/Tests/ML/test_metrics.py index dfc74a1a2..49f41ce8e 100644 --- a/Tests/ML/test_metrics.py +++ b/Tests/ML/test_metrics.py @@ -18,7 +18,6 @@ from InnerEye.ML.configs.classification.DummyClassification import DummyClassification from InnerEye.ML.configs.regression.DummyRegression import DummyRegression from InnerEye.ML.lightning_metrics import AverageWithoutNan, MetricForMultipleStructures, ScalarMetricsBase -from InnerEye.ML.lightning_models import ScalarLightning from InnerEye.ML.metrics_dict import MetricsDict, get_column_name_for_logging From 5453e638c8564da365b88410ed08da7e5949d75e Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Tue, 6 Jul 2021 15:11:56 +0100 Subject: [PATCH 11/22] Fix test --- InnerEye/ML/lightning_models.py | 5 ++--- InnerEye/ML/scalar_config.py | 4 +++- InnerEye/ML/sequence_config.py | 5 +++++ .../models/architectures/sequential/test_rnn_classifier.py | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index 110663824..349064fcc 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -176,12 +176,11 @@ def __init__(self, config: ScalarModelBase, *args: Any, **kwargs: Any) -> None: if isinstance(config, SequenceModelBase): self.loss_fn = lambda model_output, loss: apply_sequence_model_loss(raw_loss, model_output, loss) self.target_indices = config.get_target_indices() - self.target_names = [SequenceMetricsDict.get_hue_name_from_target_index(p) - for p in config.sequence_target_positions] else: self.loss_fn = raw_loss self.target_indices = [] - self.target_names = config.target_names + + self.target_names = config.target_names self.is_classification_model = config.is_classification_model self.use_mean_teacher_model = config.compute_mean_teacher_model self.is_binary_classification_or_regression = True if len(config.class_names) == 1 else False diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py index 0a471b6f6..18fa5b273 100644 --- a/InnerEye/ML/scalar_config.py +++ b/InnerEye/ML/scalar_config.py @@ -132,7 +132,9 @@ class ScalarModelBase(ModelConfigBase): "reporting results. If provided, the length of this list must match the " "number of model outputs (and of transformed labels, if defined; see " "get_posthoc_label_transform()). By default, this inherits the value of " - "class_names at initialisation.") + "class_names at initialisation. 
This will be ignored in sequence models, " + "as target_names are determined automatically based on" + "sequence_target_positions") aggregation_type: AggregationType = param.ClassSelector(default=AggregationType.Average, class_=AggregationType, doc="The type of global pooling aggregation to use between" " the encoder and the classifier.") diff --git a/InnerEye/ML/sequence_config.py b/InnerEye/ML/sequence_config.py index 7fad8d249..9122623e3 100644 --- a/InnerEye/ML/sequence_config.py +++ b/InnerEye/ML/sequence_config.py @@ -12,6 +12,7 @@ from InnerEye.Common.metrics_constants import LoggingColumns from InnerEye.ML.common import ModelExecutionMode from InnerEye.ML.deep_learning_config import TemperatureScalingConfig +from InnerEye.ML.metrics_dict import SequenceMetricsDict from InnerEye.ML.scalar_config import ScalarModelBase from InnerEye.ML.utils.split_dataset import DatasetSplits @@ -65,6 +66,10 @@ def __init__(self, **params: Any): logging.info(f"Temperature scaling will be performed on the " f"validation set using the config: {self.temperature_scaling_config}") + def validate(self) -> None: + self.target_names = [SequenceMetricsDict.get_hue_name_from_target_index(p) + for p in self.sequence_target_positions] + def get_target_indices(self) -> List[int]: """ Computes the zero based array indices inside of a sequence of items diff --git a/Tests/ML/models/architectures/sequential/test_rnn_classifier.py b/Tests/ML/models/architectures/sequential/test_rnn_classifier.py index f49c3cfd4..6febbc15c 100644 --- a/Tests/ML/models/architectures/sequential/test_rnn_classifier.py +++ b/Tests/ML/models/architectures/sequential/test_rnn_classifier.py @@ -437,7 +437,7 @@ def test_run_ml_with_multi_label_sequence_model(test_output_dirs: OutputFolderFo when it is started via run_ml. """ logging_to_stdout() - config = ToyMultiLabelSequenceModel(should_validate=False) + config = ToyMultiLabelSequenceModel() assert config.get_target_indices() == [1, 2, 3] expected_prediction_targets = [f"{SEQUENCE_POSITION_HUE_NAME_PREFIX} {x}" for x in ["01", "02", "03"]] From 5665e75e83451c77137fe0cb07cb11e1f0733ced Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Wed, 7 Jul 2021 09:22:30 +0100 Subject: [PATCH 12/22] Flake8 --- InnerEye/ML/lightning_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index 349064fcc..82a064fff 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -15,7 +15,7 @@ from InnerEye.ML.lightning_base import InnerEyeLightning from InnerEye.ML.lightning_metrics import MetricForMultipleStructures from InnerEye.ML.metrics import compute_dice_across_patches -from InnerEye.ML.metrics_dict import DataframeLogger, MetricsDict, SequenceMetricsDict +from InnerEye.ML.metrics_dict import DataframeLogger, MetricsDict from InnerEye.ML.model_config_base import ModelConfigBase from InnerEye.ML.scalar_config import ScalarModelBase from InnerEye.ML.sequence_config import SequenceModelBase From 8e55bcc44ce6374184f82c783df20f913470ebd4 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Wed, 7 Jul 2021 09:33:31 +0100 Subject: [PATCH 13/22] Update CHANGELOG.md --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c5612acf..7ba18a807 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ jobs that run in AzureML. 
From 8e55bcc44ce6374184f82c783df20f913470ebd4 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Wed, 7 Jul 2021 09:33:31 +0100
Subject: [PATCH 13/22] Update CHANGELOG.md

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8c5612acf..7ba18a807 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,9 @@ jobs that run in AzureML.
 - ([#496](https://github.com/microsoft/InnerEye-DeepLearning/pull/496)) All plots are now saved as PNG, rather than JPG.
 - ([#497](https://github.com/microsoft/InnerEye-DeepLearning/pull/497)) Reducing the size of the code snapshot that
   gets uploaded to AzureML, by skipping all test folders.
+- ([#526](https://github.com/microsoft/InnerEye-DeepLearning/pull/526)) Updated Covid config to use a multiclass
+  formulation. Moved functions `create_metric_computers` and `compute_and_log_metrics` from `ScalarLightning` to
+  `ScalarModelBase`.
 
 ### Fixed
 
@@ -40,6 +43,9 @@ multiple large checkpoints can time out.
 ### Removed
 
 - ([#520](https://github.com/microsoft/InnerEye-DeepLearning/pull/520)) Disable glaucoma job from Azure pipeline.
+- ([#526](https://github.com/microsoft/InnerEye-DeepLearning/pull/526)) Removed `get_posthoc_label_transform` in
+  class `ScalarModelBase`. Instead, functions `get_loss_function` and `compute_and_log_metrics` in
+  `ScalarModelBase` can be implemented to compute the loss and metrics in a task-specific manner.
 
 ### Deprecated
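Editor's note: the #526 entries above describe the new extension points. As a
hypothetical illustration (MyWeightedConfig and its weighting scheme are invented
for this sketch and do not appear in the repository), a task-specific config could
override get_loss_function like this:

    from typing import Callable

    import torch
    import torch.nn.functional as F

    from InnerEye.ML.scalar_config import ScalarModelBase

    class MyWeightedConfig(ScalarModelBase):
        @staticmethod
        def get_loss_function() -> Callable:
            def weighted_bce(output: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
                # Up-weight positive labels by a factor of 2 (an arbitrary choice
                # made only for this sketch).
                weight = 1.0 + labels
                return F.binary_cross_entropy_with_logits(output, labels, weight=weight)
            return weighted_bce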
From 79bef556fe724d75e1fd5d2c965b23d76ef5e1fe Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Wed, 7 Jul 2021 09:40:22 +0100
Subject: [PATCH 14/22] Refactor

---
 InnerEye/ML/configs/classification/CovidModel.py |  4 ++--
 InnerEye/ML/lightning_models.py                  | 11 ++++++++++-
 InnerEye/ML/scalar_config.py                     |  4 ++--
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/InnerEye/ML/configs/classification/CovidModel.py b/InnerEye/ML/configs/classification/CovidModel.py
index 740bdd0cd..7a4f38080 100644
--- a/InnerEye/ML/configs/classification/CovidModel.py
+++ b/InnerEye/ML/configs/classification/CovidModel.py
@@ -215,13 +215,13 @@ def compute_and_log_metrics(self,
                                 is_training: bool,
                                 metrics: ModuleDict,
                                 logger: DataframeLogger,
-                                current_epoch: int) -> None:
+                                current_epoch: int,
+                                data_split: ModelExecutionMode) -> None:
         posteriors = self.get_post_loss_logits_normalization_function()(logits)
         labels = torch.argmax(targets.data.to(dtype=torch.int), dim=-1)
         metric = metrics[MetricsDict.DEFAULT_HUE_KEY][0]
         metric(posteriors, labels)
 
-        data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL
         per_subject_outputs = list(zip(subject_ids, [MetricsDict.DEFAULT_HUE_KEY] * len(subject_ids), posteriors.tolist(), labels.tolist()))
         for subject, prediction_target, model_output, label in per_subject_outputs:
             logger.add_record({
diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py
index 82a064fff..401a7699b 100644
--- a/InnerEye/ML/lightning_models.py
+++ b/InnerEye/ML/lightning_models.py
@@ -9,6 +9,7 @@
 from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME
 from InnerEye.Common.metrics_constants import LoggingColumns, MetricType, TRAIN_PREFIX, VALIDATION_PREFIX
+from InnerEye.ML.common import ModelExecutionMode
 from InnerEye.ML.config import SegmentationModelBase
 from InnerEye.ML.dataset.sample import CroppedSample
 from InnerEye.ML.dataset.scalar_sample import ScalarItem
@@ -246,7 +247,15 @@ def training_or_validation_step(self,
         self.write_loss(is_training, loss)
         metrics = self.train_metric_computers if is_training else self.val_metric_computers
         logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger
-        self.compute_and_log_metrics(logits, labels, subject_ids, is_training, metrics, logger, self.current_epoch)
+        data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL
+        self.compute_and_log_metrics(logits=logits,
+                                     targets=labels,
+                                     subject_ids=subject_ids,
+                                     is_training=is_training,
+                                     metrics=metrics,
+                                     logger=logger,
+                                     current_epoch=self.current_epoch,
+                                     data_split=data_split)
         self.log_on_epoch(name=MetricType.SUBJECT_COUNT,
                           value=len(model_inputs_and_labels.subject_ids),
                           is_training=is_training,
diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py
index 18fa5b273..f74bf8d76 100644
--- a/InnerEye/ML/scalar_config.py
+++ b/InnerEye/ML/scalar_config.py
@@ -535,7 +535,8 @@ def compute_and_log_metrics(self,
                                 is_training: bool,
                                 metrics: ModuleDict,
                                 logger: DataframeLogger,
-                                current_epoch: int) -> None:
+                                current_epoch: int,
+                                data_split: ModelExecutionMode) -> None:
         """
         Computes all the metrics for a given (logits, labels) pair, and writes them to the loggers.
         :param logits: The model output before normalization.
@@ -567,7 +568,6 @@ def compute_and_log_metrics(self,
                 zip(_subject_ids, [prediction_target] * len(_subject_ids), _posteriors.tolist(), _labels.tolist()))
         # Write a full breakdown of per-subject predictions and labels to a file. These files are local to the current
         # rank in distributed training, and will be aggregated after training.
-        data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL
         for subject, prediction_target, model_output, label in per_subject_outputs:
             logger.add_record({
                 LoggingColumns.Epoch.value: current_epoch,
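Editor's note: after Patch 14, every per-subject record carries the data split
explicitly rather than re-deriving it inside the logging code. Assuming the
LoggingColumns values are the snake_case strings shown here (an assumption; the
enum values are not spelled out in this series), a single logged record looks
roughly like:

    record = {
        "epoch": 0,
        "subject": "123",
        "prediction_target": "Default",
        "model_output": 0.82,
        "label": 1.0,
        "data_split": "Train",  # assumed ModelExecutionMode.TRAIN.value
    }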
From 4061be9368c950c776ad327251963028ef1f8f1b Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Thu, 8 Jul 2021 09:58:54 +0100
Subject: [PATCH 15/22] Fix SSL config

---
 InnerEye/ML/configs/ssl/CovidContainers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/InnerEye/ML/configs/ssl/CovidContainers.py b/InnerEye/ML/configs/ssl/CovidContainers.py
index 2941b1b39..92f3521e8 100644
--- a/InnerEye/ML/configs/ssl/CovidContainers.py
+++ b/InnerEye/ML/configs/ssl/CovidContainers.py
@@ -33,4 +33,5 @@ def __init__(self,
                          linear_head_augmentation_config=path_linear_head_augmentation_cxr,
                          online_evaluator_lr=1e-5,
                          linear_head_batch_size=64,
+                         pl_find_unused_parameters=True,
                          **kwargs)

From 2bf571488f7ef8136b44cbe02f38f0328517ce22 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Tue, 13 Jul 2021 17:12:30 +0100
Subject: [PATCH 16/22] Flake8

---
 InnerEye/ML/lightning_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py
index 2ba1ad8bd..affa415f9 100644
--- a/InnerEye/ML/lightning_models.py
+++ b/InnerEye/ML/lightning_models.py
@@ -23,7 +23,7 @@
 from InnerEye.ML.utils import image_util, metrics_util, model_util
 from InnerEye.ML.utils.dataset_util import DatasetExample, store_and_upload_example
 from InnerEye.ML.utils.model_util import get_scalar_model_inputs_and_labels
-from InnerEye.ML.utils.sequence_utils import apply_sequence_model_loss, get_masked_model_outputs_and_labels
+from InnerEye.ML.utils.sequence_utils import apply_sequence_model_loss
 from pytorch_lightning import Trainer
 
 SUBJECT_OUTPUT_PER_RANK_PREFIX = f"{SUBJECT_METRICS_FILE_NAME}.rank"

From e34b9ad966e4ae7a716b7f180b84580f25b6232b Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Tue, 13 Jul 2021 17:32:55 +0100
Subject: [PATCH 17/22] mypy

---
 InnerEye/ML/lightning_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py
index affa415f9..94a3cce34 100644
--- a/InnerEye/ML/lightning_models.py
+++ b/InnerEye/ML/lightning_models.py
@@ -259,7 +259,7 @@ def training_or_validation_step(self,
         loss = self.loss_fn(logits, labels)
         self.write_loss(is_training, loss)
         metrics = self.train_metric_computers if is_training else self.val_metric_computers
-        logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger
+        logger: DataframeLogger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger
         data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL
         self.compute_and_log_metrics(logits=logits,
                                      targets=labels,
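Editor's note: background for Patch 15 above. When the SSL online evaluator is
attached, some parameters may receive no gradient in a given forward pass, and
PyTorch's DistributedDataParallel then raises an error unless unused-parameter
detection is enabled. A sketch of the Lightning-level setting that a flag named
pl_find_unused_parameters would plausibly map to (the mapping is assumed here,
not shown in this series; the API is that of PyTorch Lightning 1.x):

    from pytorch_lightning import Trainer
    from pytorch_lightning.plugins import DDPPlugin

    trainer = Trainer(gpus=2, accelerator="ddp",
                      plugins=[DDPPlugin(find_unused_parameters=True)])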
From 68b1737f5acd321aaac17ddd6670fe6471e68f35 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Wed, 14 Jul 2021 07:53:14 +0100
Subject: [PATCH 18/22] Address PR comments

---
 .../ML/configs/classification/CovidModel.py | 19 +++++++++----------
 InnerEye/ML/lightning_models.py             |  2 +-
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/InnerEye/ML/configs/classification/CovidModel.py b/InnerEye/ML/configs/classification/CovidModel.py
index 7a4f38080..232193c6a 100644
--- a/InnerEye/ML/configs/classification/CovidModel.py
+++ b/InnerEye/ML/configs/classification/CovidModel.py
@@ -222,12 +222,12 @@ def compute_and_log_metrics(self,
         metric = metrics[MetricsDict.DEFAULT_HUE_KEY][0]
         metric(posteriors, labels)
 
-        per_subject_outputs = list(zip(subject_ids, [MetricsDict.DEFAULT_HUE_KEY] * len(subject_ids), posteriors.tolist(), labels.tolist()))
-        for subject, prediction_target, model_output, label in per_subject_outputs:
+        per_subject_outputs = zip(subject_ids, posteriors.tolist(), labels.tolist())
+        for subject, model_output, label in per_subject_outputs:
             logger.add_record({
                 LoggingColumns.Epoch.value: current_epoch,
                 LoggingColumns.Patient.value: subject,
-                LoggingColumns.Hue.value: prediction_target,
+                LoggingColumns.Hue.value: MetricsDict.DEFAULT_HUE_KEY,
                 LoggingColumns.ModelOutput.value: model_output,
                 LoggingColumns.Label.value: label,
                 LoggingColumns.DataSplit.value: data_split.value
     def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path:
         """
         Generate a custom report for the Covid model. This report will read the file model_output.csv generated for
-        the training, validation or test sets and compute a 4 class accuracy and confusion matrix based on this.
+        the training, validation or test sets and compute the multiclass accuracy based on this.
         :param report_dir: Directory report is to be written to
         :param model_proc: Whether this is a single or ensemble model (model_output.csv will be located in different
                            paths for single vs ensemble runs.)
@@ -249,9 +249,10 @@ def get_output_csv_path(mode: ModelExecutionMode) -> Path:
         def get_labels_and_predictions(df: pd.DataFrame) -> pd.DataFrame:
             labels = []
             predictions = []
-            for i, target in enumerate(self.target_names):
-                predictions.append(df[df[LoggingColumns.Hue.value] == target][LoggingColumns.ModelOutput.value].item())
-                labels.append(df[df[LoggingColumns.Hue.value] == target][LoggingColumns.Label.value])
+            for target in self.target_names:
+                target_df = df[df[LoggingColumns.Hue.value] == target]
+                predictions.append(target_df[LoggingColumns.ModelOutput.value])
+                labels.append(target_df[LoggingColumns.Label.value])
 
             return pd.DataFrame.from_dict({LoggingColumns.Patient.value: [df.iloc[0][LoggingColumns.Patient.value]],
                                            LoggingColumns.ModelOutput.value: [np.argmax(predictions)],
@@ -260,9 +261,7 @@ def get_labels_and_predictions(df: pd.DataFrame) -> pd.DataFrame:
         def get_accuracy(df: pd.DataFrame) -> float:
             df = df.groupby(LoggingColumns.Patient.value, as_index=False).apply(get_labels_and_predictions).reset_index(
                 drop=True)
-            df["tp+tn"] = df.apply(
-                lambda x: 1 if x[LoggingColumns.ModelOutput.value] == x[LoggingColumns.Label.value] else 0, axis=1)
-            return np.sum(df["tp+tn"].values) / len(df)
+            return (df[LoggingColumns.ModelOutput.value] == df[LoggingColumns.Label.value]).mean()  # type: ignore
 
         train_metrics = get_output_csv_path(ModelExecutionMode.TRAIN)
         val_metrics = get_output_csv_path(ModelExecutionMode.VAL)
diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py
index 94a3cce34..affa415f9 100644
--- a/InnerEye/ML/lightning_models.py
+++ b/InnerEye/ML/lightning_models.py
@@ -259,7 +259,7 @@ def training_or_validation_step(self,
         loss = self.loss_fn(logits, labels)
         self.write_loss(is_training, loss)
         metrics = self.train_metric_computers if is_training else self.val_metric_computers
-        logger: DataframeLogger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger
+        logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger
         data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL
         self.compute_and_log_metrics(logits=logits,
                                      targets=labels,
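Editor's note: the get_accuracy rewrite in Patch 18 leans on a standard pandas
idiom: comparing two columns yields a boolean Series, and its mean is exactly the
fraction of matching rows, which is what the removed "tp+tn" bookkeeping computed.
A self-contained toy check:

    import pandas as pd

    df = pd.DataFrame({"model_output": [0, 1, 2, 3], "label": [0, 1, 0, 3]})
    accuracy = (df["model_output"] == df["label"]).mean()
    assert accuracy == 0.75  # identical to summing per-row matches and dividing by len(df)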
model_output, - LoggingColumns.Label.value: label, - LoggingColumns.DataSplit.value: data_split.value - }) + per_subject_outputs = zip(subject_ids, posteriors.tolist(), targets.tolist()) + for subject, model_output, target in per_subject_outputs: + for i in range(len(self.target_names)): + logger.add_record({ + LoggingColumns.Epoch.value: current_epoch, + LoggingColumns.Patient.value: subject, + LoggingColumns.Hue.value: self.target_names[i], + LoggingColumns.ModelOutput.value: model_output[i], + LoggingColumns.Label.value: target[i], + LoggingColumns.DataSplit.value: data_split.value + }) def generate_custom_report(self, report_dir: Path, model_proc: ModelProcessing) -> Path: """ From 84ff381a0cabbefcbac343c4995ca1191a481a8c Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Mon, 26 Jul 2021 17:35:55 +0100 Subject: [PATCH 20/22] Update notebook: test results no longer in csv file --- InnerEye/ML/reports/notebook_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/reports/notebook_report.py b/InnerEye/ML/reports/notebook_report.py index 5a7e49c22..cbb122c85 100644 --- a/InnerEye/ML/reports/notebook_report.py +++ b/InnerEye/ML/reports/notebook_report.py @@ -170,7 +170,7 @@ def generate_classification_crossval_notebook(result_notebook: Path, 'innereye_path': str(fixed_paths.repository_root_directory()), 'train_metrics_csv': "", 'val_metrics_csv': str_or_empty(crossval_metrics), - 'test_metrics_csv': str_or_empty(crossval_metrics), + 'test_metrics_csv': "", "config": codecs.encode(pickle.dumps(config), "base64").decode(), "is_crossval_report": True } From a17f541448d0c5f9df7d76b7a555f77d57841062 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Mon, 26 Jul 2021 17:37:21 +0100 Subject: [PATCH 21/22] mypy --- InnerEye/ML/lightning_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index affa415f9..f8ed4fd97 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -259,7 +259,7 @@ def training_or_validation_step(self, loss = self.loss_fn(logits, labels) self.write_loss(is_training, loss) metrics = self.train_metric_computers if is_training else self.val_metric_computers - logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger + logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger # type: ignore data_split = ModelExecutionMode.TRAIN if is_training else ModelExecutionMode.VAL self.compute_and_log_metrics(logits=logits, targets=labels, From c54b029a013c1bc4363ecb51416c2c0c2ce90e53 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Mon, 26 Jul 2021 17:57:48 +0100 Subject: [PATCH 22/22] Update docstring --- InnerEye/ML/scalar_config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py index a42f5491b..b156b94e4 100644 --- a/InnerEye/ML/scalar_config.py +++ b/InnerEye/ML/scalar_config.py @@ -546,6 +546,11 @@ def compute_and_log_metrics(self, :param targets: The expected model outputs. :param subject_ids: The subject IDs for the present minibatch. :param is_training: If True, write the metrics as training metrics, otherwise as validation metrics. 
From c54b029a013c1bc4363ecb51416c2c0c2ce90e53 Mon Sep 17 00:00:00 2001
From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com>
Date: Mon, 26 Jul 2021 17:57:48 +0100
Subject: [PATCH 22/22] Update docstring

---
 InnerEye/ML/scalar_config.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/InnerEye/ML/scalar_config.py b/InnerEye/ML/scalar_config.py
index a42f5491b..b156b94e4 100644
--- a/InnerEye/ML/scalar_config.py
+++ b/InnerEye/ML/scalar_config.py
@@ -546,6 +546,11 @@ def compute_and_log_metrics(self,
         :param targets: The expected model outputs.
         :param subject_ids: The subject IDs for the present minibatch.
         :param is_training: If True, write the metrics as training metrics, otherwise as validation metrics.
+        :param metrics: A dictionary mapping from names of prediction targets to a list of metric computers,
+            as returned by create_metric_computers.
+        :param logger: An object of type DataframeLogger which can be used for logging within this function.
+        :param current_epoch: Current epoch number.
+        :param data_split: ModelExecutionMode object indicating if this is the train or validation split.
         :return:
         """
         per_subject_outputs: List[Tuple[str, str, torch.Tensor, torch.Tensor]] = []
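Editor's note: a sketch of the shape of the `metrics` argument documented above.
Two assumptions are made purely for illustration: that "Default" stands in for
MetricsDict.DEFAULT_HUE_KEY, and that AccuracyAtThreshold05 (with the import path
shown) is one of the repository's metric computers.

    from torch.nn import ModuleDict, ModuleList

    from InnerEye.ML.lightning_metrics import AccuracyAtThreshold05  # assumed import path

    # One list of metric computers per prediction target; compute_and_log_metrics
    # indexes this as metrics[prediction_target] and calls each computer with
    # (posteriors, labels).
    metrics = ModuleDict({"Default": ModuleList([AccuracyAtThreshold05()])})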