Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Enhanced Indexing as a Portfolio Optimizer #280

Merged
merged 32 commits into from
Mar 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
988b42e
Add Structured Covariance Estimator to riskmodel.py
Feb 9, 2021
7b01c5c
Add an implementation of Enhanced Indexing to optimizer.py
Feb 9, 2021
9c2653f
Add an implementation of Enhanced Indexing to optimizer.py
Feb 9, 2021
4000518
Separate specific implementation of Portfolio Optimizer to folder.
Feb 22, 2021
b2e2142
Applied slight modification to follow PEP 8.
Feb 22, 2021
2cc057e
Fix minor mismatches of type hints.
Feb 22, 2021
9448a6e
Add an abstract class as the base class for all optimization related p…
Feb 22, 2021
42f8825
Reformat code to follow PEP 8.
Feb 22, 2021
f7d3e56
Merge optimization related portfolio construction back to portfolio/o…
Feb 22, 2021
58f74cf
Reformat code to follow PEP 8.
Feb 22, 2021
164687d
Add scikit-learn to dependencies.
Feb 22, 2021
b8647c1
Reformat code to follow PEP 8.
Feb 22, 2021
2f9d45e
Reformat code with black.
Feb 22, 2021
3787138
Format code with the latest version of black.
yongzhengqi Feb 22, 2021
dc4aa67
Black format
Derek-Wds Feb 22, 2021
f947a2f
Correct two mistakes in annotation.
Feb 22, 2021
d3caea6
Add unittest for TestStructuredCovEstimator.
Feb 22, 2021
527718a
Allow enhanced indexing to generate portfolio without industry relate…
Feb 22, 2021
2bff6eb
Split classes in riskmodel.py & optimizer.py into separate files.
Mar 4, 2021
83c6e74
Reindex files.
Mar 4, 2021
0f3e3d2
Update __init__.py.
Mar 4, 2021
79c1142
Pass nan_option to structured covariance estimator.
Mar 8, 2021
4d5a30b
Resolve https://github.com/microsoft/qlib/pull/280\#discussion_r58916…
Mar 8, 2021
81b86f8
Update test to cover changes in structured_cov_estimator
Mar 8, 2021
351d598
Resolve https://github.com/microsoft/qlib/pull/280\#discussion_r58916…
Mar 8, 2021
c6675be
Resolve https://github.com/microsoft/qlib/pull/280\#discussion_r58916…
Mar 8, 2021
fc89fec
Resolve https://github.com/microsoft/qlib/pull/280\#discussion_r58916…
Mar 8, 2021
2f9af1a
Resolve https://github.com/microsoft/qlib/pull/280\#discussion_r58916…
Mar 8, 2021
7022675
Resolve https://github.com/microsoft/qlib/pull/280\#discussion_r58916…
Mar 8, 2021
6a305c7
Resolve https://github.com/microsoft/qlib/pull/280\#discussion_r58916…
Mar 8, 2021
8b9065c
Reformat with black.
Mar 8, 2021
53cf89d
Reformat with black.
Mar 8, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions qlib/contrib/strategy/strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pandas as pd

from ..backtest.order import Order
from ...utils import get_pre_trading_date
from .order_generator import OrderGenWInteract


Expand Down Expand Up @@ -390,11 +389,11 @@ def filter_stock(l):
current_stock_list = current_temp.get_stock_list()
value = cash * self.risk_degree / len(buy) if len(buy) > 0 else 0

# open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not consider it
# as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line
# open_cost should be considered in the real trading environment, while the backtest in evaluate.py does not
# consider it as the aim of demo is to accomplish same strategy as evaluate.py, so comment out this line
# value = value / (1+trade_exchange.open_cost) # set open_cost limit
for code in buy:
# check is stock supended
# check is stock suspended
if not trade_exchange.is_stock_tradable(stock_id=code, trade_date=trade_date):
continue
# buy order
Expand Down
3 changes: 2 additions & 1 deletion qlib/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def fit(self, dataset: Dataset):

# get weights
try:
wdf_train, wdf_valid = dataset.prepare(["train", "valid"], col_set=["weight"], data_key=DataHandlerLP.DK_L)
wdf_train, wdf_valid = dataset.prepare(["train", "valid"], col_set=["weight"],
data_key=DataHandlerLP.DK_L)
w_train, w_valid = wdf_train["weight"], wdf_valid["weight"]
except KeyError as e:
w_train = pd.DataFrame(np.ones_like(y_train.values), index=y_train.index)
Expand Down
7 changes: 7 additions & 0 deletions qlib/model/riskmodel/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from .base import RiskModel
from .poet import POETCovEstimator
from .shrink import ShrinkCovEstimator
from .structured import StructuredCovEstimator
147 changes: 147 additions & 0 deletions qlib/model/riskmodel/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import inspect
import numpy as np
import pandas as pd
from typing import Union

from qlib.model.base import BaseModel


class RiskModel(BaseModel):
    """Risk Model

    A risk model is used to estimate the covariance matrix of stock returns.

    Subclasses customise the estimator by overriding `_predict`; `predict`
    handles input conversion, return computation, scaling, NaN handling and
    centering before delegating to it.
    """

    # supported values of `nan_option`
    MASK_NAN = "mask"
    FILL_NAN = "fill"
    IGNORE_NAN = "ignore"

    def __init__(self, nan_option: str = "ignore", assume_centered: bool = False, scale_return: bool = True):
        """
        Args:
            nan_option (str): nan handling option (`ignore`/`mask`/`fill`).
            assume_centered (bool): whether the data is assumed to be centered.
            scale_return (bool): whether scale returns as percentage.
        """
        # nan
        assert nan_option in [
            self.MASK_NAN,
            self.FILL_NAN,
            self.IGNORE_NAN,
        ], f"`nan_option={nan_option}` is not supported"
        self.nan_option = nan_option

        self.assume_centered = assume_centered
        self.scale_return = scale_return

    def predict(
        self,
        X: Union[pd.Series, pd.DataFrame, np.ndarray],
        return_corr: bool = False,
        is_price: bool = True,
        return_decomposed_components=False,
    ) -> Union[pd.DataFrame, np.ndarray, tuple]:
        """
        Args:
            X (pd.Series, pd.DataFrame or np.ndarray): data from which to estimate the covariance,
                with variables as columns and observations as rows. Pandas input with a
                `MultiIndex` is unstacked on the `instrument` level (for a DataFrame only
                the first column is used).
            return_corr (bool): whether return the correlation matrix.
            is_price (bool): whether `X` contains price (if not assume stock returns).
            return_decomposed_components (bool): whether return decomposed components of the covariance matrix.
                Cannot be combined with `return_corr`.

        Returns:
            pd.DataFrame or np.ndarray: estimated covariance (or correlation). A DataFrame
                labelled by the input columns is returned for pandas input, a plain array
                otherwise.
            tuple: when `return_decomposed_components` is set, the triple `(F, cov_b, var_u)`
                produced by `self._predict(X, return_decomposed_components=True)`.

        Raises:
            AssertionError: if both `return_corr` and `return_decomposed_components` are set,
                or if decomposed components are requested from a model whose `_predict`
                does not accept `return_decomposed_components`.
        """
        assert (
            not return_corr or not return_decomposed_components
        ), "Can only return either correlation matrix or decomposed components."

        # transform input into 2D array
        if not isinstance(X, (pd.Series, pd.DataFrame)):
            columns = None
        else:
            if isinstance(X.index, pd.MultiIndex):
                if isinstance(X, pd.DataFrame):
                    X = X.iloc[:, 0].unstack(level="instrument")  # always use the first column
                else:
                    X = X.unstack(level="instrument")
            else:
                # X is 2D DataFrame
                pass
            columns = X.columns  # will be used to restore dataframe
            X = X.values

        # calculate pct_change
        if is_price:
            X = X[1:] / X[:-1] - 1  # NOTE: resulting `n - 1` rows

        # scale return
        if self.scale_return:
            # Multiply out-of-place: when `is_price` is False, `X` is still the caller's
            # array (or the buffer behind the caller's DataFrame), and an in-place `*=`
            # would silently modify the caller's data.
            X = X * 100

        # handle nan and centered
        X = self._preprocess(X)

        # return decomposed components if needed
        if return_decomposed_components:
            # only estimators whose `_predict` declares this keyword can decompose
            assert (
                "return_decomposed_components" in inspect.getfullargspec(self._predict).args
            ), "This risk model does not support return decomposed components of the covariance matrix "

            F, cov_b, var_u = self._predict(X, return_decomposed_components=True)
            return F, cov_b, var_u

        # estimate covariance
        S = self._predict(X)

        # return correlation if needed
        if return_corr:
            # normalise the covariance by the outer product of the standard deviations
            vola = np.sqrt(np.diag(S))
            corr = S / np.outer(vola, vola)
            if columns is None:
                return corr
            return pd.DataFrame(corr, index=columns, columns=columns)

        # return covariance
        if columns is None:
            return S
        return pd.DataFrame(S, index=columns, columns=columns)

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """covariance estimation implementation

        This method should be overridden by child classes.

        By default, this method implements the empirical covariance estimation.

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).

        Returns:
            np.ndarray: covariance matrix.
        """
        xTx = np.asarray(X.T.dot(X))
        N = len(X)
        if isinstance(X, np.ma.MaskedArray):
            # count, for every pair of columns, the rows where both are unmasked
            M = 1 - X.mask
            N = M.T.dot(M)  # each pair has distinct number of samples
        return xTx / N

    def _preprocess(self, X: np.ndarray) -> Union[np.ndarray, np.ma.MaskedArray]:
        """handle nan and centerize data

        With `nan_option='fill'` the data goes through `np.nan_to_num`; with
        `nan_option='mask'` invalid entries are masked; with `nan_option='ignore'`
        the data is left untouched. Unless `assume_centered` is set, the column-wise
        `np.nanmean` is then subtracted.

        Note:
            if `nan_option='mask'` then the returned array will be `np.ma.MaskedArray`.
        """
        # handle nan
        if self.nan_option == self.FILL_NAN:
            X = np.nan_to_num(X)
        elif self.nan_option == self.MASK_NAN:
            X = np.ma.masked_invalid(X)
        # centralize
        if not self.assume_centered:
            X = X - np.nanmean(X, axis=0)
        return X
84 changes: 84 additions & 0 deletions qlib/model/riskmodel/poet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import numpy as np

from qlib.model.riskmodel import RiskModel


class POETCovEstimator(RiskModel):
    """Principal Orthogonal Complement Thresholding Estimator (POET)

    Reference:
        [1] Fan, J., Liao, Y., & Mincheva, M. (2013). Large covariance estimation by thresholding principal orthogonal complements.
            Journal of the Royal Statistical Society. Series B: Statistical Methodology, 75(4), 603–680. https://doi.org/10.1111/rssb.12016
        [2] http://econweb.rutgers.edu/yl1114/papers/poet/POET.m
    """

    # supported values of `thresh_method`
    THRESH_SOFT = "soft"
    THRESH_HARD = "hard"
    THRESH_SCAD = "scad"

    def __init__(self, num_factors: int = 0, thresh: float = 1.0, thresh_method: str = "soft", **kwargs):
        """
        Args:
            num_factors (int): number of factors (if set to zero, no factor model will be used).
            thresh (float): the non-negative constant for thresholding.
            thresh_method (str): thresholding method, which can be
                - 'soft': soft thresholding.
                - 'hard': hard thresholding.
                - 'scad': scad thresholding.
            kwargs: see `RiskModel` for more information.
        """
        super().__init__(**kwargs)

        # zero is accepted for both parameters, so the messages say "non-negative"
        assert num_factors >= 0, "`num_factors` requires a non-negative integer"
        self.num_factors = num_factors

        assert thresh >= 0, "`thresh` requires a non-negative float number"
        self.thresh = thresh

        assert thresh_method in [
            self.THRESH_HARD,
            self.THRESH_SOFT,
            self.THRESH_SCAD,
        ], "`thresh_method` should be `soft`/`hard`/`scad`"
        self.thresh_method = thresh_method

    def _predict(self, X: np.ndarray) -> np.ndarray:
        """Estimate the covariance matrix with POET.

        Args:
            X (np.ndarray): data matrix containing multiple variables (columns) and observations (rows).

        Returns:
            np.ndarray: estimated covariance matrix, i.e. the thresholded residual
                covariance plus the low-rank factor part (the latter is zero when
                `num_factors` is zero).
        """
        Y = X.T  # NOTE: to match POET's implementation
        p, n = Y.shape

        if self.num_factors > 0:
            # `Y'Y` is symmetric, so use the symmetric eigen-solver: it returns real
            # eigenvalues in ascending order, whereas the general `eig` may yield
            # complex results from round-off and needs an explicit sort.
            _, V = np.linalg.eigh(Y.T.dot(Y))
            # eigenvectors of the `num_factors` largest eigenvalues, largest first
            F = V[:, -self.num_factors :][:, ::-1] * np.sqrt(n)
            LamPCA = Y.dot(F) / n
            uhat = np.asarray(Y - LamPCA.dot(F.T))
            Lowrank = np.asarray(LamPCA.dot(LamPCA.T))
            rate = 1 / np.sqrt(p) + np.sqrt(np.log(p) / n)
        else:
            uhat = np.asarray(Y)
            rate = np.sqrt(np.log(p) / n)
            Lowrank = 0

        lamb = rate * self.thresh
        SuPCA = uhat.dot(uhat.T) / n
        SuDiag = np.diag(np.diag(SuPCA))
        # residual standard deviations on the diagonal; invert once and reuse
        SuStd = SuDiag ** 0.5
        SuStdInv = np.linalg.inv(SuStd)
        # residual correlation matrix
        R = SuStdInv.dot(SuPCA).dot(SuStdInv)

        if self.thresh_method == self.THRESH_HARD:
            # keep entries whose magnitude exceeds the threshold, zero the rest
            M = R * (np.abs(R) > lamb)
        elif self.thresh_method == self.THRESH_SOFT:
            # shrink magnitudes by `lamb`, clipping at zero: (x + |x|) / 2 == max(x, 0)
            res = np.abs(R) - lamb
            res = (res + np.abs(res)) / 2
            M = np.sign(R) * res
        else:
            # piecewise rule: soft-threshold below 2 * lamb, linear interpolation up to
            # 3.7 * lamb, identity beyond (the constants correspond to SCAD with a = 3.7)
            M1 = (np.abs(R) < 2 * lamb) * np.sign(R) * (np.abs(R) - lamb) * (np.abs(R) > lamb)
            M2 = (np.abs(R) < 3.7 * lamb) * (np.abs(R) >= 2 * lamb) * (2.7 * R - 3.7 * np.sign(R) * lamb) / 1.7
            M3 = (np.abs(R) >= 3.7 * lamb) * R
            M = M1 + M2 + M3

        # restore the unit diagonal, then scale the correlations back to covariances
        Rthresh = M - np.diag(np.diag(M)) + np.eye(p)
        SigmaU = SuStd.dot(Rthresh).dot(SuStd)
        SigmaY = SigmaU + Lowrank

        return SigmaY
Loading