DeepRank · gcroci2 · Jan 3, 2024 · Oct 19, 2023 · Oct 19, 2023 · Oct 20, 2023
diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml
@@ -50,8 +50,8 @@ runs:
         conda install -c bioconda msms
         ## PyTorch, PyG, PyG adds
         ### Installing for CPU only on the CI
-        conda install pytorch torchvision torchaudio cpuonly -c pytorch
-        conda install pyg -c pyg
+        conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 -c pytorch
+        pip install torch_geometric==2.3.1
         pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-$(python3 -c "import torch; print(torch.__version__)")+cpu.html
     - name: Install dependencies on MacOS
       shell: bash {0}

diff --git a/README.md b/README.md
@@ -61,6 +61,7 @@ Before installing deeprank2 you need to install some dependencies. We advise to
     * [Here](https://ssbio.readthedocs.io/en/latest/instructions/msms.html) for MacOS with M1 chip users.
 *  [PyTorch](https://pytorch.org/get-started/locally/)
     * We support torch's CPU library as well as CUDA.
+    * Currently, the package is tested using [PyTorch 2.0.1](https://pytorch.org/get-started/previous-versions/#v201).
 *  [PyG](https://pytorch-geometric.readthedocs.io/en/latest/install/installation.html) and its optional dependencies: `torch_scatter`, `torch_sparse`, `torch_cluster`, `torch_spline_conv`.
 *  [DSSP 4](https://swift.cmbi.umcn.nl/gv/dssp/)
     * Check if `dssp` is installed: `dssp --version`. If this gives an error or shows a version lower than 4:
@@ -216,13 +217,13 @@ dataset_val = GraphDataset(
     hdf5_path = hdf5_paths,
     subset = valid_ids,
     train = False,
-    dataset_train = dataset_train
+    train_data = dataset_train
 )
 dataset_test = GraphDataset(
     hdf5_path = hdf5_paths,
     subset = test_ids,
     train = False,
-    dataset_train = dataset_train
+    train_data = dataset_train
 )
 ```
 
@@ -250,13 +251,13 @@ dataset_val = GridDataset(
     hdf5_path = hdf5_paths,
     subset = valid_ids,
     train = False,
-    dataset_train = dataset_train,
+    train_data = dataset_train,
 )
 dataset_test = GridDataset(
     hdf5_path = hdf5_paths,
     subset = test_ids,
     train = False,
-    dataset_train = dataset_train,
+    train_data = dataset_train,
 )
 ```
 

diff --git a/deeprank2/dataset.py b/deeprank2/dataset.py
diff --git a/deeprank2/trainer.py b/deeprank2/trainer.py
@@ -1,5 +1,7 @@
 import copy
+import inspect
 import logging
+import re
 from time import time
 from typing import List, Optional, Tuple, Union
 
@@ -64,13 +66,8 @@ def __init__( # pylint: disable=too-many-arguments # noqa: MC0001
                 over the epochs. If None, defaults to :class:`HDF5OutputExporter`, which saves all the results in an .HDF5 file stored in ./output directory.
                 Defaults to None.
         """
-        self.batch_size_train = None
-        self.batch_size_test = None
-        self.shuffle = None
-
-        self._init_output_exporters(output_exporters)
-
         self.neuralnet = neuralnet
+        self.pretrained_model_path = pretrained_model
 
         self._init_datasets(dataset_train, dataset_val, dataset_test,
                             val_size, test_size)
@@ -119,14 +116,22 @@ def __init__( # pylint: disable=too-many-arguments # noqa: MC0001
             _log.info(f"CUDA device name is {torch.cuda.get_device_name(0)}.")
             _log.info(f"Number of GPUs set to {self.ngpu}.")
 
-        if pretrained_model is None:
+        self._init_output_exporters(output_exporters)
+
+        # attributes initialised to None here and assigned later (e.g. in train(), test() or _load_params())
+        self.data_type = None
+        self.batch_size_train = None
+        self.batch_size_test = None
+        self.shuffle = None
+        self.model_load_state_dict = None
+
+        if self.pretrained_model_path is None:
             if self.dataset_train is None:
                 raise ValueError("No training data specified. Training data is required if there is no pretrained model.")
             if self.neuralnet is None:
                 raise ValueError("No neural network specified. Specifying a model framework is required if there is no pretrained model.")
 
-            self.classes = self.dataset_train.classes
-            self.classes_to_index = self.dataset_train.classes_to_index
+            self._init_from_dataset(self.dataset_train)
             self.optimizer = None
             self.class_weights = class_weights
             self.subset = self.dataset_train.subset
@@ -158,20 +163,17 @@ def __init__( # pylint: disable=too-many-arguments # noqa: MC0001
                         "Please set clustering_method to 'mcl', 'louvain' or None. Default to 'mcl' \n\t")
 
         else:
-            if self.dataset_train is not None:
-                _log.warning("Pretrained model loaded: dataset_train will be ignored.")
-            if self.dataset_val is not None:
-                _log.warning("Pretrained model loaded: dataset_val will be ignored.")
             if self.neuralnet is None:
                 raise ValueError("No neural network class found. Please add it to complete loading the pretrained model.")
             if self.dataset_test is None:
                 raise ValueError("No dataset_test found. Please add it to evaluate the pretrained model.")
-            if self.target is None:
-                raise ValueError("No target set. Make sure the pretrained model explicitly defines the target to train against.")
-
-            self.pretrained_model_path = pretrained_model
-            self.classes_to_index = self.dataset_test.classes_to_index
-
+            if self.dataset_train is not None:
+                self.dataset_train = None
+                _log.warning("Pretrained model loaded: dataset_train will be ignored.")
+            if self.dataset_val is not None:
+                self.dataset_val = None
+                _log.warning("Pretrained model loaded: dataset_val will be ignored.")
+            self._init_from_dataset(self.dataset_test)
             self._load_params()
             self._load_pretrained_model()
 
@@ -210,30 +212,33 @@ def _init_datasets(self,  # pylint: disable=too-many-arguments
             else:
                 _log.warning("Validation dataset was provided to Trainer; val_size parameter is ignored.")
 
-        # Copy settings from the dataset that we will use.
-        if self.dataset_train is not None:
-            self._init_from_dataset(self.dataset_train)
-        else:
-            self._init_from_dataset(self.dataset_test)
-
     def _init_from_dataset(self, dataset: Union[GraphDataset, GridDataset]):
 
         if isinstance(dataset, GraphDataset):
             self.clustering_method = dataset.clustering_method
             self.node_features = dataset.node_features
             self.edge_features = dataset.edge_features
             self.features = None
+            self.features_transform = dataset.features_transform
+            self.means = dataset.means
+            self.devs = dataset.devs
 
         elif isinstance(dataset, GridDataset):
             self.clustering_method = None
             self.node_features = None
             self.edge_features = None
             self.features = dataset.features
+            self.features_transform = None
+            self.means = None
+            self.devs = None
         else:
             raise TypeError(type(dataset))
 
         self.target = dataset.target
+        self.target_transform = dataset.target_transform
         self.task = dataset.task
+        self.classes = dataset.classes
+        self.classes_to_index = dataset.classes_to_index
 
     def _load_model(self):
         """Loads the neural network model."""
@@ -269,10 +274,10 @@ def _check_dataset_value(self, dataset_train, dataset_check, type_dataset):
         if dataset_check.train is not False:
             raise ValueError(f"""{type_dataset} dataset has train parameter {dataset_check.train}
                         Make sure to set it as False""")
-        # Check dataset_train parameter in valid/test is equivalent to train which passed to Trainer.
-        if dataset_check.dataset_train != dataset_train:
-            raise ValueError(f"""{type_dataset} dataset has different dataset_train parameter compared to the one given in Trainer.
-                        Make sure to assign equivalent dataset_train in Trainer""")
+        # Check that the train_data parameter of the valid/test dataset matches the training dataset passed to Trainer.
+        if dataset_check.train_data != dataset_train:
+            raise ValueError(f"""{type_dataset} dataset has different train_data parameter compared to the one given in Trainer.
+                        Make sure to assign equivalent train_data in Trainer""")
 
     def _load_pretrained_model(self):
         """
@@ -525,6 +530,10 @@ def train( # pylint: disable=too-many-arguments, too-many-branches, too-many-loc
             filename (str, optional): Name of the file where to save the selected model. If not None, the model is saved to `filename`.
                 If None, the model is not saved. Defaults to 'model.pth.tar'.
         """
+        if self.dataset_train is None:
+            raise ValueError("No training dataset provided.")
+
+        self.data_type = type(self.dataset_train)
         self.batch_size_train = batch_size
         self.shuffle = shuffle
 
@@ -549,6 +558,9 @@ def train( # pylint: disable=too-many-arguments, too-many-branches, too-many-loc
         else:
             self.valid_loader = None
             _log.info("No validation set provided\n")
+            _log.warning(
+                "Training data will be used both for learning and model selection, which may lead to overfitting." +
+                "\nIt is usually preferable to use a validation set during the training phase.")
 
         # Assign weights to each class
         if self.task == targets.CLASSIF and self.class_weights:
@@ -622,9 +634,6 @@ def train( # pylint: disable=too-many-arguments, too-many-branches, too-many-loc
                     # if no validation set, save the best performing model on the training set
                     if best_model:
                         if min(train_losses) == loss_:
-                            _log.warning(
-                                "Training data is used both for learning and model selection, which will to overfitting." +
-                                "\n\tIt is preferable to use an independent training and validation data sets.")
                             checkpoint_model = self._save_model()
                             self.epoch_saved_model = epoch
                             _log.info(f'Best model saved at epoch # {self.epoch_saved_model}.')
@@ -739,6 +748,9 @@ def _eval( # pylint: disable=too-many-locals
                 loss_ = loss_func(pred, y)
                 count_predictions += pred.shape[0]
                 sum_of_losses += loss_.detach().item() * pred.shape[0]
+            else:
+                target_vals += ['None'] * pred.shape[0]
+                eval_loss = 'None'
 
             # Get the outputs for export
             # Remember that non-linear activation is automatically applied in CrossEntropyLoss
@@ -755,7 +767,7 @@ def _eval( # pylint: disable=too-many-locals
         if count_predictions > 0:
             eval_loss = sum_of_losses / count_predictions
         else:
-            eval_loss = 0.0
+            eval_loss = 'None'
 
         self._output_exporters.process(
             pass_name, epoch_number, entry_names, outputs, target_vals, eval_loss)
@@ -822,6 +834,13 @@ def test(
             num_workers (int, optional): How many subprocesses to use for data loading. 0 means that the data will be loaded in the main process.
                         Defaults to 0.
         """
+        if (not self.pretrained_model_path) and (not self.model_load_state_dict):
+            raise ValueError(
+                """
+                No pretrained model provided and no training performed.
+                Please provide a pretrained model or train the model before testing.\n
+                """)
+
         self.batch_size_test = batch_size
 
         if self.dataset_test is not None:
@@ -848,28 +867,38 @@ def _load_params(self):
         Loads the parameters of a pretrained model
         """
 
-        state = torch.load(self.pretrained_model_path)
+        if torch.cuda.is_available():
+            state = torch.load(self.pretrained_model_path)
+        else:
+            state = torch.load(self.pretrained_model_path, map_location=torch.device('cpu'))
 
+        self.data_type = state["data_type"]
+        self.model_load_state_dict = state["model_state"]
+        self.optimizer = state["optimizer"]
+        self.opt_loaded_state_dict = state["optimizer_state"]
+        self.lossfunction = state["lossfunction"]
         self.target = state["target"]
+        self.target_transform = state["target_transform"]
+        self.task = state["task"]
+        self.classes = state["classes"]
+        self.classes_to_index = state["classes_to_index"]
+        self.class_weights = state["class_weights"]
         self.batch_size_train = state["batch_size_train"]
         self.batch_size_test = state["batch_size_test"]
         self.val_size = state["val_size"]
         self.test_size = state["test_size"]
         self.lr = state["lr"]
         self.weight_decay = state["weight_decay"]
+        self.epoch_saved_model = state["epoch_saved_model"]
         self.subset = state["subset"]
-        self.class_weights = state["class_weights"]
-        self.task = state["task"]
-        self.classes = state["classes"]
         self.shuffle = state["shuffle"]
-        self.optimizer = state["optimizer"]
-        self.opt_loaded_state_dict = state["optimizer_state"]
-        self.lossfunction = state["lossfunction"]
-        self.model_load_state_dict = state["model_state"]
         self.clustering_method = state["clustering_method"]
         self.node_features = state["node_features"]
         self.edge_features = state["edge_features"]
         self.features = state["features"]
+        self.features_transform = state["features_transform"]
+        self.means = state["means"]
+        self.devs = state["devs"]
         self.cuda = state["cuda"]
         self.ngpu = state["ngpu"]
 
@@ -880,27 +909,44 @@ def _save_model(self):
         Args:
             filename (str, optional): Name of the file. Defaults to None.
         """
+        features_transform_to_save = copy.deepcopy(self.features_transform)
+        # prepare the transform dictionary for saving: replace each transform callable with the source text of its lambda
+        if features_transform_to_save:
+            for _, key in features_transform_to_save.items():
+                if key['transform'] is None:
+                    continue
+                str_expr = inspect.getsource(key['transform'])
+                match = re.search(r'\'transform\':.*(lambda.*).*,.*\'standardize\'.*', str_expr).group(1)
+                key['transform'] = match
+
         state = {
+            "data_type": self.data_type,
             "model_state": self.model.state_dict(),
             "optimizer": self.optimizer,
             "optimizer_state": self.optimizer.state_dict(),
             "lossfunction": self.lossfunction,
             "target": self.target,
+            "target_transform": self.target_transform,
             "task": self.task,
             "classes": self.classes,
+            "classes_to_index": self.classes_to_index,
             "class_weights": self.class_weights,
             "batch_size_train": self.batch_size_train,
             "batch_size_test": self.batch_size_test,
             "val_size": self.val_size,
             "test_size": self.test_size,
             "lr": self.lr,
             "weight_decay": self.weight_decay,
+            "epoch_saved_model": self.epoch_saved_model,
             "subset": self.subset,
             "shuffle": self.shuffle,
             "clustering_method": self.clustering_method,
             "node_features": self.node_features,
             "edge_features": self.edge_features,
             "features": self.features,
+            "features_transform": features_transform_to_save,
+            "means": self.means,
+            "devs": self.devs,
             "cuda": self.cuda,
             "ngpu": self.ngpu
         }