WenjieDu · wx0chan · Jan 26, 2024 · Jan 26, 2024 · Jan 26, 2024
diff --git a/pypots/imputation/cdsa/__init__.py b/pypots/imputation/cdsa/__init__.py
@@ -0,0 +1,18 @@
+"""
+The package of the partially-observed time-series imputation model CDSA.
+
+Refer to the paper "Ma, J., Shou, Z., Zareian, A., Mansour, H., Vetro, A., & Chang, S. F. (2019).
+CDSA: cross-dimensional self-attention for multivariate, geo-tagged time series imputation.
+arXiv preprint arXiv:1905.09904."
+
+"""
+
+# Created by Weixuan Chen <wx_chan@qq.com> and Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+from .model import CDSA
+
+__all__ = [
+    "CDSA",
+]
diff --git a/pypots/imputation/cdsa/data.py b/pypots/imputation/cdsa/data.py
@@ -0,0 +1,28 @@
+"""
+Dataset class for the CDSA model, trained with the masked imputation task (MIT).
+"""
+
+# Created by Weixuan Chen <wx_chan@qq.com> and Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Union
+
+from ..saits.data import DatasetForSAITS
+
+
+class DatasetForCDSA(DatasetForSAITS):
+    """Dataset for CDSA; a thin subclass of ``DatasetForSAITS`` that adds no behavior of its own.
+
+    All constructor arguments are forwarded positionally and unchanged to
+    ``DatasetForSAITS.__init__``.
+    """
+
+    def __init__(
+        self,
+        data: Union[dict, str],
+        return_X_ori: bool,
+        return_labels: bool,
+        file_type: str = "h5py",
+        rate: float = 0.2,
+    ):
+        super().__init__(data, return_X_ori, return_labels, file_type, rate)
diff --git a/pypots/imputation/cdsa/model.py b/pypots/imputation/cdsa/model.py
@@ -0,0 +1,372 @@
+"""
+The implementation of CDSA for the partially-observed time-series imputation task.
+"""
+
+# Created by Weixuan Chen <wx_chan@qq.com> and Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Union, Optional
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+
+from .data import DatasetForCDSA
+from .modules import _CDSA
+from ..base import BaseNNImputer
+from ...data.base import BaseDataset
+from ...data.checking import check_X_ori_in_val_set
+from ...optim.adam import Adam
+from ...optim.base import Optimizer
+from ...utils.logging import logger
+
+
+class CDSA(BaseNNImputer):
+    """The PyTorch implementation of the CDSA model.
+
+    Parameters
+    ----------
+    n_steps :
+        The number of time steps in the time-series data sample.
+
+    n_features :
+        The number of features in the time-series data sample.
+
+    n_layers :
+        The number of encoder layers in the CDSA model.
+
+    d_model :
+        The dimension of the model's backbone.
+        It is the input dimension of the multi-head self-attention layers.
+        If it does not equal ``n_heads * d_k``, it is reset to that product and a warning is logged.
+
+    d_inner :
+        The dimension of the layer in the Feed-Forward Networks (FFN).
+
+    n_heads :
+        The number of heads in the multi-head self-attention mechanism.
+        ``d_model`` must be divisible by ``n_heads``, and the result should be equal to ``d_k``.
+
+    d_k :
+        The dimension of the `keys` (K) and the `queries` (Q) on time-series in the DMSA mechanism.
+        ``d_k`` should be the result of ``d_model`` divided by ``n_heads``. Although ``d_k`` can be directly calculated
+        with given ``d_model`` and ``n_heads``, we want it to be explicitly given together with ``d_v`` by users
+        to ensure users are aware of them and to avoid any potential mistakes.
+
+    d_v :
+        The dimension of the `values` (V) in the DMSA mechanism.
+        It is also the dimension of the `keys` (K) and the `queries` (Q) on feature in the DMSA mechanism.
+
+    dropout :
+        The dropout rate for all fully-connected layers in the model.
+
+    attn_dropout :
+        The dropout rate for DMSA.
+
+    loss_task :
+        The task for loss calculation. Since the original CDSA was trained on complete data,
+        we use "MIT" or "ORT" for loss calculation to adapt to POTS.
+
+    batch_size :
+        The batch size for training and evaluating the model.
+
+    epochs :
+        The number of epochs for training the model.
+
+    patience :
+        The patience for the early-stopping mechanism. Given a positive integer, the training process will be
+        stopped when the model does not perform better after that number of epochs.
+        Leaving it default as None will disable the early-stopping.
+
+    optimizer :
+        The optimizer for model training.
+        If not given (or given as None), a new default Adam optimizer is created for this model.
+
+    num_workers :
+        The number of subprocesses to use for data loading.
+        `0` means data loading will be in the main process, i.e. there won't be subprocesses.
+
+    device :
+        The device for the model to run on. It can be a string, a :class:`torch.device` object, or a list of them.
+        If not given, will try to use CUDA devices first (will use the default CUDA device if there are multiple),
+        then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models.
+        If given a list of devices, e.g. ['cuda:0', 'cuda:1'], or [torch.device('cuda:0'), torch.device('cuda:1')] , the
+        model will be trained in parallel on the multiple devices (so far only support parallel training on CUDA devices).
+        Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
+
+    saving_path :
+        The path for automatically saving model checkpoints and tensorboard files (i.e. loss values recorded during
+        training into a tensorboard file). Will not save if not given.
+
+    model_saving_strategy :
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
+        No model will be saved when it is set as None.
+        The "best" strategy will only automatically save the best model after the training finished.
+        The "better" strategy will automatically save the model during training whenever the model performs
+        better than in previous epochs.
+        The "all" strategy will save every model after each epoch training.
+
+    References
+    ----------
+    .. [1] `Ma, Jiawei, Zheng Shou, Alireza Zareian, Hassan Mansour, Anthony Vetro, and Shih-Fu Chang.
+        "CDSA: cross-dimensional self-attention for multivariate, geo-tagged time series imputation."
+        arXiv preprint arXiv:1905.09904 (2019).
+        <https://arxiv.org/pdf/1905.09904>`_
+
+    """
+
+    def __init__(
+        self,
+        n_steps: int,
+        n_features: int,
+        n_layers: int,
+        d_model: int,
+        d_inner: int,
+        n_heads: int,
+        d_k: int,
+        d_v: int,
+        dropout: float = 0,
+        attn_dropout: float = 0,
+        loss_task: str = "MIT",
+        batch_size: int = 32,
+        epochs: int = 100,
+        patience: Optional[int] = None,
+        optimizer: Optional[Optimizer] = None,
+        num_workers: int = 0,
+        device: Optional[Union[str, torch.device, list]] = None,
+        saving_path: Optional[str] = None,
+        model_saving_strategy: Optional[str] = "best",
+    ):
+        super().__init__(
+            batch_size,
+            epochs,
+            patience,
+            num_workers,
+            device,
+            saving_path,
+            model_saving_strategy,
+        )
+
+        if d_model != n_heads * d_k:
+            logger.warning(
+                "‼️ d_model must = n_heads * d_k, it should be divisible by n_heads "
+                f"and the result should be equal to d_k, but got d_model={d_model}, n_heads={n_heads}, d_k={d_k}"
+            )
+            d_model = n_heads * d_k
+            logger.warning(
+                f"⚠️ d_model is reset to {d_model} = n_heads ({n_heads}) * d_k ({d_k})"
+            )
+
+        self.n_steps = n_steps
+        self.n_features = n_features
+        # model hyper-parameters
+        self.n_layers = n_layers
+        self.d_model = d_model
+        self.d_inner = d_inner
+        self.n_heads = n_heads
+        self.d_k = d_k
+        self.d_v = d_v
+        self.dropout = dropout
+        self.attn_dropout = attn_dropout
+        self.loss_task = loss_task
+
+        # set up the model
+        self.model = _CDSA(
+            self.n_layers,
+            self.n_steps,
+            self.n_features,
+            self.d_model,
+            self.d_inner,
+            self.n_heads,
+            self.d_k,
+            self.d_v,
+            self.dropout,
+            self.attn_dropout,
+            self.loss_task,
+        )
+        self._send_model_to_given_device()
+        self._print_model_size()
+
+        # set up the optimizer. A fresh Adam is created per model when none is given: a default instance in the
+        # signature is evaluated once and would be shared, with each new model re-initializing it on its parameters.
+        self.optimizer = Adam() if optimizer is None else optimizer
+        self.optimizer.init_optimizer(self.model.parameters())
+
+    def _assemble_input_for_training(self, data: list) -> dict:
+        """Assemble the model input from a training batch.
+
+        The batch is passed through ``_send_data_to_given_device`` and unpacked into
+        ``indices``, ``X``, ``missing_mask``, ``X_ori`` and ``indicating_mask``;
+        all but ``indices`` are returned in a dict keyed by those names.
+        """
+        (
+            indices,
+            X,
+            missing_mask,
+            X_ori,
+            indicating_mask,
+        ) = self._send_data_to_given_device(data)
+
+        inputs = {
+            "X": X,
+            "missing_mask": missing_mask,
+            "X_ori": X_ori,
+            "indicating_mask": indicating_mask,
+        }
+
+        return inputs
+
+    def _assemble_input_for_validating(self, data: list) -> dict:
+        """Assemble the model input from a validation batch (same layout as a training batch)."""
+        return self._assemble_input_for_training(data)
+
+    def _assemble_input_for_testing(self, data: list) -> dict:
+        """Assemble the model input from a test batch.
+
+        The batch is passed through ``_send_data_to_given_device`` and unpacked into
+        ``indices``, ``X`` and ``missing_mask``; the latter two are returned in a dict.
+        """
+        indices, X, missing_mask = self._send_data_to_given_device(data)
+
+        inputs = {
+            "X": X,
+            "missing_mask": missing_mask,
+        }
+
+        return inputs
+
+    def fit(
+        self,
+        train_set: Union[dict, str],
+        val_set: Optional[Union[dict, str]] = None,
+        file_type: str = "h5py",
+    ) -> None:
+        """Train the model on ``train_set``, optionally validating on ``val_set``.
+
+        After training, the weights in ``self.best_model_dict`` are loaded back into the model,
+        the model is switched to eval mode, and ``_auto_save_model_if_necessary`` is called.
+
+        Parameters
+        ----------
+        train_set :
+            The training data, either a dict or a path string locating a data file.
+
+        val_set :
+            The validation data, in the same form as ``train_set``. It must contain 'X_ori'.
+
+        file_type :
+            The type of the given file if ``train_set`` and ``val_set`` are path strings.
+
+        Raises
+        ------
+        ValueError
+            If ``val_set`` is given but ``check_X_ori_in_val_set`` rejects it.
+        """
+        # Step 1: wrap the input data with classes Dataset and DataLoader
+        training_set = DatasetForCDSA(
+            train_set, return_X_ori=False, return_labels=False, file_type=file_type
+        )
+        training_loader = DataLoader(
+            training_set,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
+        )
+        val_loader = None
+        if val_set is not None:
+            if not check_X_ori_in_val_set(val_set):
+                raise ValueError("val_set must contain 'X_ori' for model validation.")
+            val_set = DatasetForCDSA(
+                val_set, return_X_ori=True, return_labels=False, file_type=file_type
+            )
+            val_loader = DataLoader(
+                val_set,
+                batch_size=self.batch_size,
+                shuffle=False,
+                num_workers=self.num_workers,
+            )
+
+        # Step 2: train the model and freeze it
+        self._train_model(training_loader, val_loader)
+        self.model.load_state_dict(self.best_model_dict)
+        self.model.eval()  # set the model as eval status to freeze it.
+
+        # Step 3: save the model if necessary
+        self._auto_save_model_if_necessary(confirm_saving=True)
+
+    def predict(
+        self,
+        test_set: Union[dict, str],
+        file_type: str = "h5py",
+    ) -> dict:
+        """Impute the given data with the trained model.
+
+        Parameters
+        ----------
+        test_set :
+            The data to impute, either a dict or a path string locating a data file.
+
+        file_type :
+            The type of the given file if ``test_set`` is a path string.
+
+        Returns
+        -------
+        dict
+            A dict with the key "imputation" holding the imputed data of all batches
+            concatenated into one numpy array.
+        """
+        self.model.eval()  # set the model as eval status to freeze it.
+        test_set = BaseDataset(
+            test_set, return_X_ori=False, return_labels=False, file_type=file_type
+        )
+        test_loader = DataLoader(
+            test_set,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+        )
+        imputation_collector = []
+
+        with torch.no_grad():
+            for data in test_loader:
+                inputs = self._assemble_input_for_testing(data)
+                results = self.model.forward(inputs, training=False)
+                imputed_data = results["imputed_data"]
+                imputation_collector.append(imputed_data)
+
+        imputation = torch.cat(imputation_collector).cpu().detach().numpy()
+        result_dict = {
+            "imputation": imputation,
+        }
+        return result_dict
+
+    def impute(
+        self,
+        X: Union[dict, str],
+        file_type: str = "h5py",
+    ) -> np.ndarray:
+        """Impute missing values in the given data with the trained model.
+
+        Warnings
+        --------
+        The method impute is deprecated. Please use `predict()` instead.
+
+        Parameters
+        ----------
+        X :
+            The data samples for testing, should be array-like of shape [n_samples, sequence length (time steps),
+            n_features], or a path string locating a data file, e.g. h5 file.
+
+        file_type :
+            The type of the given file if X is a path string.
+
+        Returns
+        -------
+        array-like, shape [n_samples, sequence length (time steps), n_features],
+            Imputed data.
+        """
+        logger.warning(
+            "🚨DeprecationWarning: The method impute is deprecated. Please use `predict` instead."
+        )
+        results_dict = self.predict(X, file_type=file_type)
+        return results_dict["imputation"]
diff --git a/pypots/imputation/cdsa/modules/__init__.py b/pypots/imputation/cdsa/modules/__init__.py
@@ -0,0 +1,13 @@
+"""
+The modules subpackage of CDSA, exposing the core network class ``_CDSA``.
+"""
+
+# Created by Weixuan Chen <wx_chan@qq.com> and Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+from .core import _CDSA
+
+__all__ = [
+    "_CDSA",
+]