Implement Enhanced Indexing as a Portfolio Optimizer #280

Merged · 32 commits · Mar 17, 2021

Changes from 15 commits

Commits (32)

988b42e  Add Structured Covariance Estimator to riskmodel.py (Feb 9, 2021)
7b01c5c  Add an implementation of Enhanced Indexing to optimizer.py (Feb 9, 2021)
9c2653f  Add an implementation of Enhanced Indexing to optimizer.py (Feb 9, 2021)
4000518  Separate specific implementation of Portfolio Optimizer to folder. (Feb 22, 2021)
b2e2142  Applied slight modification to follow PEP 8. (Feb 22, 2021)
2cc057e  Fix minor mismatches of type hints. (Feb 22, 2021)
9448a6e  Add an abstract class as the base class for all optimization related p… (Feb 22, 2021)
42f8825  Reformat code to follow PEP 8. (Feb 22, 2021)
f7d3e56  Merge optimization related portfolio construction back to portfolio/o… (Feb 22, 2021)
58f74cf  Reformat code to follow PEP 8. (Feb 22, 2021)
164687d  Add scikit-learn to dependencies. (Feb 22, 2021)
b8647c1  Reformat code to follow PEP 8. (Feb 22, 2021)
2f9d45e  Reformat code with black. (Feb 22, 2021)
3787138  Format code with the latest version of black. (yongzhengqi, Feb 22, 2021)
dc4aa67  Black format (Derek-Wds, Feb 22, 2021)
f947a2f  Correct two mistakes in annotation. (Feb 22, 2021)
d3caea6  Add unittest for TestStructuredCovEstimator. (Feb 22, 2021)
527718a  Allow enhanced indexing to generate portfolio without industry relate… (Feb 22, 2021)
2bff6eb  Split classes in riskmodel.py & optimizer.py into separate files. (Mar 4, 2021)
83c6e74  Reindex files. (Mar 4, 2021)
0f3e3d2  Update __init__.py. (Mar 4, 2021)
79c1142  Pass nan_option to structured covariance estimator. (Mar 8, 2021)
4d5a30b  Resolve https://github.com/microsoft/qlib/pull/280#discussion_r58916… (Mar 8, 2021)
81b86f8  Update test to cover changes in structured_cov_estimator (Mar 8, 2021)
351d598  Resolve https://github.com/microsoft/qlib/pull/280#discussion_r58916… (Mar 8, 2021)
c6675be  Resolve https://github.com/microsoft/qlib/pull/280#discussion_r58916… (Mar 8, 2021)
fc89fec  Resolve https://github.com/microsoft/qlib/pull/280#discussion_r58916… (Mar 8, 2021)
2f9af1a  Resolve https://github.com/microsoft/qlib/pull/280#discussion_r58916… (Mar 8, 2021)
7022675  Resolve https://github.com/microsoft/qlib/pull/280#discussion_r58916… (Mar 8, 2021)
6a305c7  Resolve https://github.com/microsoft/qlib/pull/280#discussion_r58916… (Mar 8, 2021)
8b9065c  Reformat with black. (Mar 8, 2021)
53cf89d  Reformat with black. (Mar 8, 2021)
7 changes: 3 additions & 4 deletions qlib/contrib/strategy/strategy.py
@@ -7,7 +7,6 @@
import pandas as pd

from ..backtest.order import Order
from ...utils import get_pre_trading_date
from .order_generator import OrderGenWInteract


@@ -390,11 +389,11 @@ def filter_stock(l):
current_stock_list = current_temp.get_stock_list()
value = cash * self.risk_degree / len(buy) if len(buy) > 0 else 0

# open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not consider it
# as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line
# open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not
# consider it as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line
# value = value / (1+trade_exchange.open_cost) # set open_cost limit
for code in buy:
# check is stock supended
# check is stock suspended
if not trade_exchange.is_stock_tradable(stock_id=code, trade_date=trade_date):
continue
# buy order
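As a side note on the comments in this hunk, here is a minimal sketch (not qlib code) of the per-stock budget arithmetic, including the commented-out open-cost adjustment; all names and values below are illustrative only.

import numpy as np

cash = 1_000_000.0
risk_degree = 0.95          # fraction of cash the strategy is willing to deploy
buy = ["SH600000", "SH600036", "SH601318"]  # hypothetical buy list
open_cost = 0.0005          # hypothetical exchange open-cost rate

# budget per stock, as computed in the demo strategy
value = cash * risk_degree / len(buy) if len(buy) > 0 else 0

# in a real trading environment the open cost shrinks the deployable budget per stock
value_with_cost = value / (1 + open_cost)

print(round(value, 2), round(value_with_cost, 2))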
4 changes: 2 additions & 2 deletions qlib/model/base.py
@@ -43,8 +43,8 @@ def fit(self, dataset: Dataset):

# get weights
try:
wdf_train, wdf_valid = dataset.prepare(["train", "valid"], col_set=["weight"], data_key=DataHandlerLP.DK_L)
w_train, w_valid = wdf_train["weight"], wdf_valid["weight"]
wdf_train, wdf_valid = dataset.prepare(["train", "valid"], col_set=["weight"],
                                       data_key=DataHandlerLP.DK_L)
w_train, w_valid = wdf_train["weight"], wdf_valid["weight"]
except KeyError as e:
w_train = pd.DataFrame(np.ones_like(y_train.values), index=y_train.index)
w_valid = pd.DataFrame(np.ones_like(y_valid.values), index=y_valid.index)
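A minimal sketch of the weight-fallback pattern used in fit above, with toy DataFrames standing in for the output of dataset.prepare; the names and data here are illustrative, not qlib code.

import numpy as np
import pandas as pd

wdf_train = pd.DataFrame({"weight": [1.0, 2.0, 1.5]}, index=["s1", "s2", "s3"])
wdf_valid = pd.DataFrame(index=["s4", "s5"])  # no "weight" column -> triggers the fallback

try:
    # use per-sample weights when the handler provides them
    w_train, w_valid = wdf_train["weight"], wdf_valid["weight"]
except KeyError:
    # otherwise default every sample weight to 1.0
    w_train = pd.Series(np.ones(len(wdf_train)), index=wdf_train.index)
    w_valid = pd.Series(np.ones(len(wdf_valid)), index=wdf_valid.index)

print(w_train.tolist(), w_valid.tolist())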
154 changes: 149 additions & 5 deletions qlib/model/riskmodel.py
@@ -1,11 +1,10 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import warnings
import numpy as np
import pandas as pd

from typing import Union
from sklearn.decomposition import PCA, FactorAnalysis

from qlib.model.base import BaseModel

@@ -124,7 +123,7 @@ def _preprocess(self, X: np.ndarray) -> Union[np.ndarray, np.ma.MaskedArray]:
X = np.nan_to_num(X)
elif self.nan_option == self.MASK_NAN:
X = np.ma.masked_invalid(X)
# centerize
# centralize
if not self.assume_centered:
X = X - np.nanmean(X, axis=0)
return X
@@ -162,8 +161,9 @@ class ShrinkCovEstimator(RiskModel):
[3] Ledoit, O., & Wolf, M. (2003). Improved estimation of the covariance matrix of stock returns
with an application to portfolio selection.
Journal of Empirical Finance, 10(5), 603–621. https://doi.org/10.1016/S0927-5398(03)00007-0
[4] Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. (2010). Shrinkage algorithms for MMSE covariance estimation.
IEEE Transactions on Signal Processing, 58(10), 5016–5029. https://doi.org/10.1109/TSP.2010.2053029
[4] Chen, Y., Wiesel, A., Eldar, Y. C., & Hero, A. O. (2010). Shrinkage algorithms for MMSE covariance
estimation. IEEE Transactions on Signal Processing, 58(10), 5016–5029.
https://doi.org/10.1109/TSP.2010.2053029
[5] https://www.econ.uzh.ch/dam/jcr:ffffffff-935a-b0d6-0000-00007f64e5b9/cov1para.m.zip
[6] https://www.econ.uzh.ch/dam/jcr:ffffffff-935a-b0d6-ffff-ffffde5e2d4e/covCor.m.zip
[7] https://www.econ.uzh.ch/dam/jcr:ffffffff-935a-b0d6-0000-0000648dfc98/covMarket.m.zip
@@ -465,3 +465,147 @@ def _predict(self, X: np.ndarray) -> np.ndarray:
        SigmaY = SigmaU + Lowrank

        return SigmaY


class StructuredCovEstimator(RiskModel):
    """Structured Covariance Estimator

    This estimator assumes observations can be predicted by multiple factors
        X = FB + U
    where `F` can be specified by explicit risk factors or latent factors.

    Therefore the structured covariance can be estimated by
        cov(X) = F cov(B) F.T + cov(U)

    We use latent factor models to estimate the structured covariance.
    Specifically, the following latent factor models are supported:
        - `pca`: Principal Component Analysis
        - `fa`: Factor Analysis

    Reference: [1] Fan, J., Liao, Y., & Liu, H. (2016). An overview of the estimation of large covariance and
        precision matrices. Econometrics Journal, 19(1), C1–C32. https://doi.org/10.1111/ectj.12061
    """

    FACTOR_MODEL_PCA = "pca"
    FACTOR_MODEL_FA = "fa"

    def __init__(
        self,
        factor_model: str = "pca",
        num_factors: int = 10,
        nan_option: str = "ignore",
        assume_centered: bool = False,
        scale_return: bool = True,
    ):
        """
        Args:
            factor_model (str): the latent factor model used to estimate the structured covariance (`pca`/`fa`).
            num_factors (int): number of components to keep.
            nan_option (str): nan handling option (`ignore`/`fill`).
            assume_centered (bool): whether the data is assumed to be centered.
            scale_return (bool): whether to scale returns to percentages.
        """
        super().__init__(nan_option, assume_centered, scale_return)

        assert factor_model in [
            self.FACTOR_MODEL_PCA,
            self.FACTOR_MODEL_FA,
        ], "factor_model={} is not supported".format(factor_model)
        self.solver = PCA if factor_model == self.FACTOR_MODEL_PCA else FactorAnalysis

        self.num_factors = num_factors

    def predict(
        self,
        X: Union[pd.Series, pd.DataFrame, np.ndarray],
        return_corr: bool = False,
        is_price: bool = True,
        return_decomposed_components=False,
    ) -> Union[pd.DataFrame, np.ndarray, tuple]:
        """
        Args:
            X (pd.Series, pd.DataFrame or np.ndarray): data from which to estimate the covariance,
                with variables as columns and observations as rows.
            return_corr (bool): whether to return the correlation matrix.
            is_price (bool): whether `X` contains prices (if not, assume stock returns).
            return_decomposed_components (bool): whether to return the decomposed components of the covariance matrix.

        Returns:
            tuple or pd.DataFrame or np.ndarray: decomposed covariance matrix or estimated covariance or correlation.
        """
        assert (
            not return_corr or not return_decomposed_components
        ), "Can only return either correlation matrix or decomposed components."

        # transform input into 2D array
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            columns = None
        else:
            if isinstance(X.index, pd.MultiIndex):
                if isinstance(X, pd.DataFrame):
                    X = X.iloc[:, 0].unstack(level="instrument")  # always use the first column
                else:
                    X = X.unstack(level="instrument")
            else:
                # X is 2D DataFrame
                pass
            columns = X.columns  # will be used to restore dataframe
            X = X.values

        # calculate pct_change
        if is_price:
            X = X[1:] / X[:-1] - 1  # NOTE: resulting `n - 1` rows

        # scale return
        if self.scale_return:
            X *= 100

        # handle nan and centering
        X = self._preprocess(X)

        if return_decomposed_components:
            F, cov_b, var_u = self._predict(X, return_structured=True)
            return F, cov_b, var_u
        else:
            # estimate covariance
            S = self._predict(X)

            # return correlation if needed
            if return_corr:
                vola = np.sqrt(np.diag(S))
                corr = S / np.outer(vola, vola)
                if columns is None:
                    return corr
                return pd.DataFrame(corr, index=columns, columns=columns)

            # return covariance
            if columns is None:
                return S
            return pd.DataFrame(S, index=columns, columns=columns)

    def _predict(self, X: np.ndarray, return_structured=False) -> Union[np.ndarray, tuple]:
        """
        covariance estimation implementation

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).
            return_structured (bool): whether to return the decomposed components of the covariance matrix.

        Returns:
            tuple or np.ndarray: decomposed covariance matrix or covariance matrix.
        """

        model = self.solver(self.num_factors, random_state=0).fit(X)

        F = model.components_.T  # num_features x num_factors
        B = model.transform(X)  # num_samples x num_factors
        U = X - B @ F.T
        cov_b = np.cov(B.T)  # num_factors x num_factors
        var_u = np.var(U, axis=0)  # diagonal

        if return_structured:
            return F, cov_b, var_u

        cov_x = F @ cov_b @ F.T + np.diag(var_u)

        return cov_x
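For reference, a minimal usage sketch of the estimator added above. The import path reflects this revision (a later commit in the PR splits riskmodel.py into separate files), and the synthetic prices, instrument names, and parameter values are assumptions for illustration only.

import numpy as np
import pandas as pd

from qlib.model.riskmodel import StructuredCovEstimator  # assumed import path for this revision

rng = np.random.default_rng(0)
n_days, n_stocks = 252, 50

# synthetic random-walk prices for 50 hypothetical instruments
prices = pd.DataFrame(
    100 * np.exp(np.cumsum(rng.normal(0, 0.01, size=(n_days, n_stocks)), axis=0)),
    columns=[f"stock_{i}" for i in range(n_stocks)],
)

est = StructuredCovEstimator(factor_model="pca", num_factors=10)

cov = est.predict(prices, is_price=True)      # structured covariance as a DataFrame
corr = est.predict(prices, return_corr=True)  # correlation matrix instead

# the decomposed components satisfy cov = F @ cov(B) @ F.T + diag(var(U))
F, cov_b, var_u = est.predict(prices, return_decomposed_components=True)
reconstructed = F @ cov_b @ F.T + np.diag(var_u)
print(np.allclose(reconstructed, cov.values))  # True

Passing factor_model="fa" runs the same calls with FactorAnalysis as the latent factor model instead of PCA; num_factors=10 above simply mirrors the default.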